Qwen2.5-VL
Collection
4 items • Updated
How to use ig1/Qwen2.5-VL-7B-Instruct-NVFP4 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("image-text-to-text", model="ig1/Qwen2.5-VL-7B-Instruct-NVFP4")
messages = [
{
"role": "user",
"content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
{"type": "text", "text": "What animal is on the candy?"}
]
},
]
pipe(text=messages)
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText
processor = AutoProcessor.from_pretrained("ig1/Qwen2.5-VL-7B-Instruct-NVFP4")
model = AutoModelForImageTextToText.from_pretrained("ig1/Qwen2.5-VL-7B-Instruct-NVFP4")
messages = [
{
"role": "user",
"content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
{"type": "text", "text": "What animal is on the candy?"}
]
},
]
inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
How to use ig1/Qwen2.5-VL-7B-Instruct-NVFP4 with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "ig1/Qwen2.5-VL-7B-Instruct-NVFP4"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "ig1/Qwen2.5-VL-7B-Instruct-NVFP4",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'
docker model run hf.co/ig1/Qwen2.5-VL-7B-Instruct-NVFP4
How to use ig1/Qwen2.5-VL-7B-Instruct-NVFP4 with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "ig1/Qwen2.5-VL-7B-Instruct-NVFP4" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "ig1/Qwen2.5-VL-7B-Instruct-NVFP4",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "ig1/Qwen2.5-VL-7B-Instruct-NVFP4" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "ig1/Qwen2.5-VL-7B-Instruct-NVFP4",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'
How to use ig1/Qwen2.5-VL-7B-Instruct-NVFP4 with Docker Model Runner:
docker model run hf.co/ig1/Qwen2.5-VL-7B-Instruct-NVFP4
Quantization script:
# Create a dedicated python env
python3 -m venv llmcompressor
source llmcompressor/bin/activate
# Install llm-compressor and additional needed libs
pip install llmcompressor qwen_vl_utils torchvision
# Download model in HF cache
hf download Qwen/Qwen2.5-VL-7B-Instruct
# Prepare quantization script
## Download the GPTQ (INT4) example, as it is the closest to what we need to achieve (it includes the calibration phase)
wget https://github.com/vllm-project/llm-compressor/raw/refs/tags/0.8.1/examples/multimodal_vision/qwen_2_5_vl_example.py -O qwen_2_5_vl_gptq.py
## Create the patch file for NVFP4
cat << EOF > nvfp4.patch
--- qwen_2_5_vl_gptq.py 2025-10-20 13:34:15.446886854 +0200
+++ qwen_2_5_vl_fp4.py 2025-10-19 17:44:04.932080648 +0200
@@ -7,7 +7,7 @@
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation
# Load model.
@@ -69,13 +69,11 @@
# Recipe
-recipe = [
- GPTQModifier(
- targets="Linear",
- scheme="W4A16",
- ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
- ),
-]
+recipe = QuantizationModifier(
+ targets="Linear",
+ scheme="NVFP4",
+ ignore=["lm_head", "re:visual.*", "re:model.visual.*"],
+)
# Perform oneshot
oneshot(
@@ -122,6 +120,6 @@
# Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
EOF
## Apply the patch
patch qwen_2_5_vl_gptq.py -i nvfp4.patch -o qwen_2_5_vl_nvfp4.py
# Start the quantization
python3 qwen_2_5_vl_nvfp4.py
Base model
Qwen/Qwen2.5-VL-7B-Instruct