import gradio as gr
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
import spaces
import cv2
from pathlib import Path
from PIL import Image
import concurrent.futures

# Checkpoint to serve; alternatively "Qwen/Qwen2.5-VL-3B-Instruct".
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the weights in bfloat16, then move the model to the GPU and switch it to eval mode.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.bfloat16
)
model = model.to("cuda")
model = model.eval()

def extract_frame_at(video_path, frame_index):
    """
    Read the frame at the given index and return it as a PIL image.

    A fresh ``cv2.VideoCapture`` is opened for ``video_path``, positioned at
    ``frame_index``, read once and released. Returns ``None`` when the read
    does not succeed.
    """
    capture = cv2.VideoCapture(video_path)
    capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    ok, bgr_frame = capture.read()
    capture.release()

    if not ok:
        return None

    # Convert BGR to RGB before wrapping the array in a PIL Image.
    rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_frame)

def extract_frames_parallel(video_path, interval=2.0, max_workers=8):
    """
    Extract frames from a video at a fixed time step, reading them in parallel.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        interval: Time between sampled frames, in seconds. Must be positive.
        max_workers: Number of threads used to read frames concurrently.

    Returns:
        A list of PIL images in ascending frame-index order. Frames that
        could not be read are skipped, so the list may be empty.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release the handle even if a property query raises.
        cap.release()

    # `not fps > 0` also catches NaN and negative values, not only 0.
    if not fps > 0:
        fps = 25  # fallback value

    # Clamp to 1: a zero step (tiny interval or very low fps) would make range() raise.
    frame_interval = max(1, int(fps * interval))
    frame_indices = range(0, total_frames, frame_interval)

    # Each worker opens its own capture inside extract_frame_at; executor.map
    # yields results in the order of frame_indices.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(lambda idx: extract_frame_at(video_path, idx), frame_indices)
        return [frame for frame in results if frame is not None]

@spaces.GPU
def model_inference(input_dict, history):
    """
    Stream the model's answer to a multimodal chat message.

    Args:
        input_dict: Message from the multimodal textbox; read via its
            ``"text"`` and ``"files"`` keys.
        history: Chat history supplied by the chat interface (unused).

    Yields:
        First the placeholder ``"Думаю..."``, then the accumulated answer
        text after every chunk produced by the streamer.

    Raises:
        gr.Error: If the prompt is empty, a video yields no frames, or
            frame extraction fails. The error must be *raised*; merely
            constructing ``gr.Error(...)`` has no effect.
    """
    text = input_dict["text"]
    files = input_dict["files"]

    # Validate the prompt before touching any files so that a request which
    # will be rejected anyway does not pay for video decoding.
    if text == "":
        if files:
            raise gr.Error("Пожалуйста, введите текстовый запрос вместе с изображением/видео.")
        raise gr.Error("Пожалуйста, введите запрос и, опционально, прикрепите изображение/видео.")

    images = []
    video_extensions = {".mp4", ".avi", ".mov", ".mkv"}

    for file in files or []:
        if Path(file).suffix.lower() in video_extensions:
            # Keep the try body minimal so the "no frames" error below is not
            # swallowed and re-wrapped by the generic handler.
            try:
                # Sample one frame every 2 seconds, extracted in parallel.
                frames = extract_frames_parallel(file, interval=2.0)
            except Exception as e:
                raise gr.Error(f"Ошибка при обработке видеофайла: {e}") from e
            if not frames:
                raise gr.Error("Не удалось извлечь кадры из видео.")
            images.extend(frames)
        else:
            images.append(load_image(file))

    # Build a single user turn: all images first, then the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    # Apply the chat template and prepare the model inputs.
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt],
        images=images if images else None,
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    # Set up streaming of the generated text.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    # Run generation in a separate thread so this generator can consume the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the output: placeholder first, then the growing answer.
    buffer = ""
    yield "Думаю..."
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer

# Example inputs: each entry is one multimodal message (a prompt plus one attached file).
examples = [
    [{"text": prompt, "files": [path]}]
    for prompt, path in (
        ("Опиши документ?", "example_images/document.jpg"),
        ("Что написано на изображении?", "example_images/math.jpg"),
        ("О чем этот UI?", "example_images/s2w_example.png"),
        ("Где происходят сильные засухи по диаграмме?", "example_images/examples_weather_events.png"),
        # Video example (make sure the file exists before enabling it):
        # ("Найди нужный объект в видео.", "example_videos/sample.mp4"),
    )
]

# Chat UI wired to model_inference, with a textbox that accepts text plus image/video uploads.
demo = gr.ChatInterface(
    fn=model_inference,
    multimodal=True,
    textbox=gr.MultimodalTextbox(
        label="Запрос (текст + изображение/видео)",
        file_types=["image", "video"],
        file_count="multiple",
    ),
    examples=examples,
    cache_examples=False,
    stop_btn="Остановить генерацию",
    description="# **Qwen2.5-VL-7B-Instruct**\nТеперь видео обрабатываются параллельно для ускорения извлечения кадров.",
)

# Launch the app with debug mode enabled.
demo.launch(debug=True)