VLM deployment on an RTX 4060

# Create python env
mamba create -n vlm python=3.11
mamba activate vlm
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
pip install transformers accelerate safetensors bitsandbytes
pip install pillow opencv-python
pip install huggingface_hub loguru
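
A quick, optional sanity check that the cu126 wheel actually sees the GPU before downloading any weights (check_env.py is just a suggested name):
check_env.py
import torch

print("torch", torch.__version__, "| CUDA build", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0), torch.cuda.get_device_capability(0))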

Qwen3-VL

Env

# Download model
sudo apt update
sudo apt install git-lfs
git lfs install
git clone https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct-FP8 ~/models/Qwen3-VL-8B-Instruct-FP8
git clone https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct ~/models/Qwen3-VL-2B-Instruct
git clone https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-FP8 ~/models/Qwen3-VL-2B-Instruct-FP8
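
If the git-lfs clones are slow or get interrupted, huggingface_hub (installed above) can fetch the same repos with resumable downloads; a minimal sketch targeting the same directories:
download_models.py
import os
from huggingface_hub import snapshot_download

for repo in [
    "Qwen/Qwen3-VL-8B-Instruct-FP8",
    "Qwen/Qwen3-VL-2B-Instruct",
    "Qwen/Qwen3-VL-2B-Instruct-FP8",
]:
    # Mirror the ~/models/<repo-name> layout used by the test scripts below
    snapshot_download(repo_id=repo, local_dir=os.path.expanduser(f"~/models/{repo.split('/')[-1]}"))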

Test

qwen_test.py
import os
import cv2
import time
from loguru import logger

from transformers import AutoProcessor
from transformers.models.qwen3_vl import Qwen3VLForConditionalGeneration

model_name = os.path.expanduser("~/models/Qwen3-VL-8B-Instruct-FP8")

device_map = {"vision_tower": "cuda", "language_model": "cpu", "lm_head": "cuda"}

model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_name, device_map="auto", torch_dtype="auto", offload_folder="offload"
)

t_start = time.time()
processor = AutoProcessor.from_pretrained(model_name)
logger.info("模型加载完成")

image_path = "test.jpg"
img_bgr = cv2.imread(image_path)
if img_bgr is None:
    raise FileNotFoundError(f"未找到图片: {image_path}")
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_rgb},
            {"type": "text", "text": "请描述这张图片中的场景。"},
        ],
    }
]


text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(text=[text], images=[img_rgb], return_tensors="pt").to(model.device)

# Run inference
output = model.generate(**inputs, max_new_tokens=256)

# Decode (note: batch_decode here returns the full transcript, prompt included)
result = processor.batch_decode(output, skip_special_tokens=True)[0]
logger.info(f"Inference result: {result}")
logger.info(f"Total time: {time.time() - t_start:.2f} s")

Server

pip install fastapi uvicorn
qwen_server.py
import os
import cv2
import time
import base64
import numpy as np
from loguru import logger
import traceback
from typing import List, Optional, Union
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from contextlib import asynccontextmanager

from transformers import AutoProcessor
from transformers.models.qwen3_vl import Qwen3VLForConditionalGeneration
import torch

# Global state (populated at startup)
model = None
processor = None
MODEL_NAME = "Qwen3-VL-2B-Instruct"

class Message(BaseModel):
    role: str
    content: Union[str, List[dict]]

class ChatCompletionRequest(BaseModel):
    model: str = MODEL_NAME
    messages: List[Message]
    max_tokens: Optional[int] = 256
    temperature: Optional[float] = 0.0
    stream: Optional[bool] = False

class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[dict]
    usage: dict

def load_model():
    """加载模型"""
    global model, processor
    logger.info(f"正在加载模型: {MODEL_NAME}")
    t_start = time.time()
    model_name = os.path.expanduser(f"~/models/{MODEL_NAME}")
    processor = AutoProcessor.from_pretrained(model_name)

    is_hopper = False
    if torch.cuda.is_available():
        name = torch.cuda.get_device_name(0)
        major, minor = torch.cuda.get_device_capability(0)
        sm = major * 10 + minor
        logger.info(f"检测到 GPU: {name}, Compute Capability: {major}.{minor}")
        is_hopper = sm >= 90
    else:
        logger.warning("未检测到可用 CUDA 设备,将在 CPU 上运行(不建议)")

    try:
        if (not is_hopper) and ("FP8" in MODEL_NAME):
            logger.warning("当前 GPU 非 Hopper, FP8 不受支持,回退为 bfloat16 运行。")
            model = Qwen3VLForConditionalGeneration.from_pretrained(
                model_name,
                device_map="auto",
                torch_dtype=torch.bfloat16,
            )
        else:
            model = Qwen3VLForConditionalGeneration.from_pretrained(
                model_name,
                device_map="auto",
                dtype="auto",
            )
    except NotImplementedError as e:
        logger.error(f"不支持的模型精度: {e}")
        logger.info(traceback.format_exc())
        raise e

    logger.info("模型加载完成")
    logger.info(f"模型加载时间: {time.time() - t_start:.2f} 秒")

def decode_image(image_data: str) -> np.ndarray:
    """解码 base64、URL 或本地图像"""
    if image_data.startswith("data:image"):
        base64_data = image_data.split(",")[1]
        img_bytes = base64.b64decode(base64_data)
        img_array = np.frombuffer(img_bytes, dtype=np.uint8)
        img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    elif image_data.startswith("http"):
        import urllib.request
        resp = urllib.request.urlopen(image_data)
        img_array = np.asarray(bytearray(resp.read()), dtype=np.uint8)
        img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    else:
        img_bgr = cv2.imread(image_data)
    if img_bgr is None:
        raise ValueError("无法解码图像")
    return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

@asynccontextmanager
async def lifespan(app: FastAPI):
    load_model()
    yield
    logger.info("服务关闭")

app = FastAPI(title=MODEL_NAME, lifespan=lifespan)

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI 兼容接口"""
    try:
        t_start = time.time()
        messages = []
        images = []
        for msg in request.messages:
            if isinstance(msg.content, str):
                messages.append({
                    "role": msg.role,
                    "content": [{"type": "text", "text": msg.content}]
                })
                continue
            content = []
            for item in msg.content:
                if item.get("type") == "text":
                    content.append({"type": "text", "text": item["text"]})
                elif item.get("type") == "image_url":
                    img_rgb = decode_image(item["image_url"]["url"])
                    images.append(img_rgb)
                    content.append({"type": "image", "image": img_rgb})
                elif item.get("type") == "image":
                    img_rgb = item["image"]
                    images.append(img_rgb)
                    content.append({"type": "image", "image": img_rgb})
            messages.append({"role": msg.role, "content": content})
        chat_text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        if images:
            # one image -> flat list; several images in one request -> nested list per batch item
            image_payload = [images] if len(images) > 1 else [images[0]]
            inputs = processor(
                text=[chat_text],
                images=image_payload,
                return_tensors="pt"
            )
        else:
            inputs = processor(text=[chat_text], return_tensors="pt")
        try:
            inputs = inputs.to(model.device)
        except AttributeError:
            inputs = {k: v.to("cuda") if torch.is_tensor(v) else v for k, v in inputs.items()}
        do_sample = request.temperature > 0
        output = model.generate(
            **inputs,
            do_sample=do_sample,
            temperature=request.temperature if do_sample else None,
            max_new_tokens=request.max_tokens
        )
        result = processor.batch_decode(output, skip_special_tokens=True)[0]
        inference_time = time.time() - t_start
        logger.info(f"推理耗时: {inference_time:.2f} 秒")
        completion_tokens = output.shape[-1] - inputs["input_ids"].shape[-1]
        return {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": request.model,
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": result},
                    "finish_reason": "stop"
                }
            ],
            "usage": {
                "prompt_tokens": inputs["input_ids"].shape[-1],
                "completion_tokens": completion_tokens,
                "total_tokens": inputs["input_ids"].shape[-1] + completion_tokens
            }
        }
    except Exception as exc:
        logger.error(f"推理错误: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

@app.get("/v1/models")
async def list_models():
    return {
        "object": "list",
        "data": [
            {
                "id": MODEL_NAME,
                "object": "model",
                "created": int(time.time()),
                "owned_by": "qwen"
            }
        ]
    }

@app.get("/health")
async def health_check():
    return {"status": "ok", "model_loaded": model is not None}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
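
A minimal client sketch for this endpoint (qwen_client.py is a suggested name, not an existing file); it follows the same pattern as smolVLM_client.py further down but reads the OpenAI-style choices field this server returns:
qwen_client.py
import base64
import requests

# Read the image and encode it as a base64 data URI
with open("test.jpg", "rb") as f:
    img_base64 = base64.b64encode(f.read()).decode()

data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}},
                {"type": "text", "text": "Describe the scene in this image."},
            ],
        }
    ],
    "max_tokens": 256,
}

response = requests.post("http://localhost:8000/v1/chat/completions", json=data)
print(response.json()["choices"][0]["message"]["content"])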

Qwen3

Env

git clone https://huggingface.co/Qwen/Qwen3-0.6B ~/models/Qwen3-0.6B

Test

qwen3_0.6b_test.py
import os
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = os.path.expanduser("~/models/Qwen3-0.6B")

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

t_start = time.time()

# prepare the model input
prompt = "What's SLAM in robotics? Explain in detail."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

# parsing thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)
print(f"总耗时: {time.time() - t_start:.2f} 秒")

SmolVLM2 Instruct

https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct

Env

pip install ninja cmake
pip install num2words
pip install flash-attn --no-build-isolation
git clone https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct ~/models/SmolVLM2-2.2B-Instruct
git clone https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct ~/models/SmolVLM2-500M-Video-Instruct
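
flash-attn compiles against the torch already in the env, which can take a while on first install; a small optional check (verify_flash_attn.py is just a suggested name) that the build imports and the GPU is new enough for it:
verify_flash_attn.py
import torch
import flash_attn

print("flash_attn", flash_attn.__version__)
print("GPU compute capability:", torch.cuda.get_device_capability(0))  # needs sm80+ (Ampere or newer)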

Test

smolVLM_test.py
import os
import cv2
import time
from loguru import logger

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

t_start = time.time()
model_path = os.path.expanduser("~/models/SmolVLM2-2.2B-Instruct")
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2"
).to("cuda")
logger.info("模型加载完成")
logger.info(f"模型加载时间: {time.time() - t_start:.2f} 秒")

t_start = time.time()

image_path = "test.jpg"
img_bgr = cv2.imread(image_path)
if img_bgr is None:
    raise FileNotFoundError(f"未找到图片: {image_path}")
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_rgb},
            {"type": "text", "text": "请描述这张图片中的场景。"},
        ]
    },
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

logger.info(f"Result: {generated_texts[0]}")
logger.info(f"总耗时: {time.time() - t_start:.2f} 秒")

Server

pip install fastapi uvicorn
smolVLM_server.py
import os
import cv2
import time
import base64
import numpy as np
from loguru import logger
from typing import List, Optional, Union
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from contextlib import asynccontextmanager

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

# Global state holding the model and processor
model = None
processor = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """应用生命周期管理"""
    # 启动时加载模型
    load_model()
    yield
    # 关闭时清理资源(如果需要)
    logger.info("服务关闭")

# Initialize FastAPI
app = FastAPI(title="SmolVLM2-2.2B-Instruct", lifespan=lifespan)

class Message(BaseModel):
    role: str
    content: Union[str, List[dict]]

class ChatCompletionRequest(BaseModel):
    model: str = "SmolVLM2-2.2B-Instruct"
    messages: List[Message]
    max_tokens: Optional[int] = 64
    temperature: Optional[float] = 0.0
    stream: Optional[bool] = False

class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[dict]
    usage: dict

def load_model():
    """加载模型"""
    global model, processor
    t_start = time.time()
    model_path = os.path.expanduser("~/models/SmolVLM2-2.2B-Instruct")
    processor = AutoProcessor.from_pretrained(model_path)
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2"
    ).to("cuda")
    logger.info("模型加载完成")
    logger.info(f"模型加载时间: {time.time() - t_start:.2f} 秒")

def decode_image(image_data: str) -> np.ndarray:
    """解码 base64 或 URL 图像"""
    if image_data.startswith("data:image"):
        # Base64-encoded image
        base64_data = image_data.split(",")[1]
        img_bytes = base64.b64decode(base64_data)
        img_array = np.frombuffer(img_bytes, dtype=np.uint8)
        img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    elif image_data.startswith("http"):
        # Image URL (download it)
        import urllib.request
        resp = urllib.request.urlopen(image_data)
        img_array = np.asarray(bytearray(resp.read()), dtype=np.uint8)
        img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    else:
        # Assume a local file path
        img_bgr = cv2.imread(image_data)
    
    if img_bgr is None:
        raise ValueError("无法解码图像")
    
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    return img_rgb

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI 兼容的聊天补全接口"""
    try:
        t_start = time.time()
        
        # Convert incoming messages into the processor's chat format
        messages = []
        for msg in request.messages:
            if isinstance(msg.content, str):
                # Plain text message
                messages.append({
                    "role": msg.role,
                    "content": [{"type": "text", "text": msg.content}]
                })
            else:
                # Multimodal message
                content = []
                for item in msg.content:
                    if item.get("type") == "text":
                        content.append({"type": "text", "text": item["text"]})
                    elif item.get("type") == "image_url":
                        img_url = item["image_url"]["url"]
                        img_rgb = decode_image(img_url)
                        content.append({"type": "image", "image": img_rgb})
                    elif item.get("type") == "image":
                        # Pass the image content through as-is
                        content.append(item)
                
                messages.append({
                    "role": msg.role,
                    "content": content
                })
        
        # Run inference (add_generation_prompt=True so the model replies as assistant)
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device, dtype=torch.bfloat16)

        do_sample = request.temperature > 0
        generated_ids = model.generate(
            **inputs, 
            do_sample=do_sample, 
            max_new_tokens=request.max_tokens
        )
        generated_texts = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )
        
        inference_time = time.time() - t_start
        logger.info(f"推理耗时: {inference_time:.2f} 秒")
        
        # response = {
        #     "id": f"chatcmpl-{int(time.time())}",
        #     "object": "chat.completion",
        #     "created": int(time.time()),
        #     "model": request.model,
        #     "choices": [
        #         {
        #             "index": 0,
        #             "message": {
        #                 "role": "assistant",
        #                 "content": generated_texts[0]
        #             },
        #             "finish_reason": "stop"
        #         }
        #     ],
        #     "usage": {
        #         "prompt_tokens": inputs["input_ids"].shape[1],
        #         "completion_tokens": len(generated_ids[0]) - inputs["input_ids"].shape[1],
        #         "total_tokens": len(generated_ids[0])
        #     }
        # }

        response = {
            "content": generated_texts[0]
        }
        
        return response
        
    except Exception as e:
        logger.error(f"推理错误: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/v1/models")
async def list_models():
    """列出可用模型"""
    return {
        "object": "list",
        "data": [
            {
                "id": "SmolVLM2-2.2B-Instruct",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "huggingface"
            }
        ]
    }

@app.get("/health")
async def health_check():
    """健康检查"""
    return {"status": "ok", "model_loaded": model is not None}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
smolVLM_client.py
import requests
import base64

# Read the image and encode it as base64
with open("test.jpg", "rb") as f:
    img_base64 = base64.b64encode(f.read()).decode()

# Build the request
url = "http://localhost:8000/v1/chat/completions"
data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{img_base64}"
                    }
                },
                {
                    "type": "text",
                    "text": "Describe the image in a short sentence and start with 'FINAL_ANSWER:'."
                }
            ]
        }
    ],
    "max_tokens": 128
}

response = requests.post(url, json=data)
result = response.json()["content"]
final_answer = result.split("FINAL_ANSWER:")[-1].strip()
print(final_answer)