Qwen2.5-VL-7B-Instruct

Runtime error

File size: 5,387 Bytes

09dd649
 
 
 
 
 
 
80fa1bb
 
 
09dd649
80fa1bb
09dd649
 
 
 
 
 
 
80fa1bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09dd649
 
 
 
 
80fa1bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09dd649
80fa1bb
09dd649
80fa1bb
09dd649
 
80fa1bb
09dd649
 
80fa1bb
09dd649
 
 
 
 
 
 
 
 
 
80fa1bb
09dd649
 
 
 
 
 
 
 
80fa1bb
09dd649
 
 
80fa1bb
09dd649
 
 
80fa1bb
09dd649
80fa1bb
09dd649
 
 
 
 
80fa1bb
09dd649
80fa1bb
 
 
 
 
 
09dd649
 
 
 
80fa1bb
09dd649
80fa1bb
 
09dd649
 
 
 
80fa1bb

import gradio as gr
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
import spaces
import cv2
from pathlib import Path
from PIL import Image

# Checkpoint served by this app; "Qwen/Qwen2.5-VL-3B-Instruct" can be used instead.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Both objects are created once, at import time, and shared by every request.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the weights in bfloat16, then move them to the GPU and switch to eval mode.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.to("cuda").eval()

# Helper that samples several frames from a video file.
def extract_frames(video_path, interval=2.0):
    """Sample frames from a video roughly every ``interval`` seconds.

    The first frame is always included; after that one frame is kept every
    ``int(fps * interval)`` decoded frames (at least every frame).

    Args:
        video_path: Location of the video, passed straight to
            ``cv2.VideoCapture``.
        interval: Spacing between kept frames, in seconds. Must be positive.

    Returns:
        A list of RGB ``PIL.Image`` objects in playback order. The list is
        empty when no frame could be read from ``video_path``.

    Raises:
        ValueError: If ``interval`` is zero, negative or NaN.
    """
    # ``not x > 0`` (rather than ``x <= 0``) also rejects NaN.
    if not interval > 0:
        raise ValueError(f"interval must be a positive number, got {interval!r}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 25  # fallback when the container reports no usable frame rate
        # Clamp to 1 so a short interval cannot produce a zero modulus below.
        frame_interval = max(1, int(fps * interval))
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB channel order.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
            frame_count += 1
    finally:
        # Release the decoder even if a frame conversion fails midway.
        cap.release()
    return frames

# File suffixes that are sampled frame-by-frame instead of being opened as a single image.
_VIDEO_EXTENSIONS = frozenset({".mp4", ".avi", ".mov", ".mkv"})


def _collect_images(files):
    """Turn uploaded file paths into one flat list of images (videos contribute sampled frames)."""
    images = []
    for file in files or []:
        if Path(file).suffix.lower() not in _VIDEO_EXTENSIONS:
            images.append(load_image(file))
            continue
        # Keep the try body minimal so only extraction failures are re-labelled.
        try:
            frames = extract_frames(file, interval=2.0)
        except Exception as e:
            raise gr.Error(f"Ошибка при обработке видеофайла: {e}") from e
        if not frames:
            raise gr.Error("Не удалось извлечь кадры из видео.")
        images.extend(frames)
    return images


@spaces.GPU
def model_inference(input_dict, history):
    """Stream the model's answer to a multimodal chat message.

    Args:
        input_dict: Message from the multimodal textbox; its ``"text"`` and
            ``"files"`` entries are read.
        history: Previous chat turns. Accepted for the chat-callback
            signature but not used: each request is answered on its own.

    Yields:
        First the placeholder ``"Думаю..."``, then the response text
        accumulated so far, once per chunk received from the streamer.

    Raises:
        gr.Error: If the text prompt is empty or blank, if a video cannot be
            processed, or if no frames can be extracted from it.
    """
    text = input_dict.get("text") or ""
    files = input_dict.get("files")

    # Validate the prompt before the (potentially slow) frame extraction.
    # gr.Error is an exception: it only reaches the UI when it is raised.
    if not text.strip():
        if files:
            raise gr.Error("Пожалуйста, введите текстовый запрос вместе с изображением/видео.")
        raise gr.Error("Пожалуйста, введите запрос и, опционально, прикрепите изображение/видео.")

    images = _collect_images(files)

    # Build a single user turn: every image first, then the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    # Render the chat template and prepare the model inputs.
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt],
        images=images or None,
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    # Stream decoded text as it is produced instead of waiting for the full answer.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    # generate() blocks, so it runs in a worker thread while this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    yield "Думаю..."
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer

    # The streamer is exhausted, so generation is finished; reap the worker thread.
    thread.join()

# Example inputs shown in the UI, declared as (prompt, attachment path) pairs.
_EXAMPLE_PROMPTS = (
    ("Опиши документ?", "example_images/document.jpg"),
    ("Что написано на изображении?", "example_images/math.jpg"),
    ("О чем этот UI?", "example_images/s2w_example.png"),
    ("Где происходят сильные засухи по диаграмме?", "example_images/examples_weather_events.png"),
    # Video example (make sure the file exists before enabling it):
    # ("Найди нужный объект в видео.", "example_videos/sample.mp4"),
)

# Each example is a one-element row holding the multimodal message dict.
examples = [
    [{"text": prompt, "files": [attachment]}]
    for prompt, attachment in _EXAMPLE_PROMPTS
]

# Input widget: free text plus any number of image/video attachments.
chat_input = gr.MultimodalTextbox(
    label="Запрос (текст + изображение/видео)",
    file_types=["image", "video"],
    file_count="multiple",
)

# Chat UI wired to the streaming inference callback; examples are not pre-computed.
demo = gr.ChatInterface(
    fn=model_inference,
    description="# **Qwen2.5-VL-7B-Instruct**\nТеперь можно анализировать и видео, извлекая несколько кадров.",
    examples=examples,
    textbox=chat_input,
    stop_btn="Остановить генерацию",
    multimodal=True,
    cache_examples=False,
)

# Start the server as soon as the script is executed.
demo.launch(debug=True)