import gradio as gr
import torch
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
import copy
from decord import VideoReader, cpu
import numpy as np

# Markdown heading rendered at the top of the page.
title = "# 📸 Instagram Reels Analiz Aracı"

# Two-line usage blurb rendered below the heading; the line break and the
# space before it are spelled out explicitly so they are visible in the source.
description = (
    "Bu araç, yüklenen Instagram Reels videolarını analiz eder ve içeriği özetler. \n"
    "Video hakkında genel bir açıklama yapar ve klipte neler olup bittiğini adım adım anlatır."
)

def load_video(video_path, max_frames_num=64, fps=1):
    """Decode a sampled subset of frames from a video file.

    Frames are taken at roughly ``fps`` frames per second of footage. When
    that yields more than ``max_frames_num`` frames, the selection is replaced
    by ``max_frames_num`` indices spread evenly over the whole video.

    Args:
        video_path: Location of the video, passed to ``decord.VideoReader``.
        max_frames_num: Upper bound on the number of returned frames (>= 1).
        fps: Target sampling rate in frames per second (> 0).

    Returns:
        A ``(frames, count)`` tuple: the array produced by
        ``get_batch(...).asnumpy()`` and the number of sampled frames.

    Raises:
        ValueError: If ``fps`` or ``max_frames_num`` is not positive, or the
            video contains no frames.
    """
    # Validate before opening the file so bad arguments fail fast and clearly
    # instead of surfacing later as ZeroDivisionError or an empty batch.
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")
    if max_frames_num < 1:
        raise ValueError(f"max_frames_num must be at least 1, got {max_frames_num!r}")

    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    if total_frame_num == 0:
        raise ValueError(f"video contains no frames: {video_path!r}")

    # int() truncates to 0 when the source frame rate is below the requested
    # one, and range() rejects a zero step; fall back to taking every frame.
    stride = max(1, int(vr.get_avg_fps() / fps))
    frame_idx = list(range(0, total_frame_num, stride))

    # Too many frames at the requested rate: resample uniformly over the clip.
    if len(frame_idx) > max_frames_num:
        frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()

    video_frames = vr.get_batch(frame_idx).asnumpy()
    return video_frames, len(frame_idx)

# Model loading: checkpoint identifier and model name handed to the loader.
pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
model_name = "llava_qwen"

# Target device for the input tensors built in analyze_reel.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Model yükleniyor...")
# The loader returns four objects; max_length is not used in the visible code.
tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained,
    None,
    model_name,
    torch_dtype="bfloat16",
    device_map="auto",
)
model.eval()
print("Model başarıyla yüklendi!")

def analyze_reel(video_path):
    """Run the loaded model on a video and return its textual answer.

    Frames are sampled with ``load_video``, preprocessed by the module-level
    ``image_processor`` and passed to ``model.generate`` together with a fixed
    Turkish prompt wrapped in the ``qwen_1_5`` conversation template.

    Args:
        video_path: Path of the video file, forwarded to ``load_video``.

    Returns:
        The first decoded output sequence, with special tokens skipped and
        surrounding whitespace stripped.
    """
    frames, frame_count = load_video(video_path)

    # Preprocess the sampled frames, then move them to the target device in bfloat16.
    pixel_values = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"]
    video_tensor = pixel_values.to(device).bfloat16()

    question = f"{DEFAULT_IMAGE_TOKEN}Bu Instagram Reels videosunu analiz et. Önce videonun genel içeriğini özetle, ardından klipte neler olup bittiğini adım adım açıkla. Video {frame_count} kareye bölünmüştür."

    # Work on a copy so the shared template is never mutated between requests.
    conversation = copy.deepcopy(conv_templates["qwen_1_5"])
    conversation.append_message(conversation.roles[0], question)
    # A None message leaves the second role's turn open for the model to fill.
    conversation.append_message(conversation.roles[1], None)
    full_prompt = conversation.get_prompt()

    token_ids = tokenizer_image_token(full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
    input_ids = token_ids.unsqueeze(0).to(device)

    # Sampling is disabled, so generation does not draw random tokens.
    generation_options = {
        "images": [video_tensor],
        "modalities": ["video"],
        "do_sample": False,
        "temperature": 0,
        "max_new_tokens": 1024,
    }
    with torch.no_grad():
        generated = model.generate(input_ids, **generation_options)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0].strip()

def gradio_interface(video_file):
    """Button callback: analyze the given video or ask the user for one.

    Args:
        video_file: Value of the video input component; ``None`` when no
            video has been provided.

    Returns:
        The result of ``analyze_reel`` for the video, or a Turkish prompt
        asking the user to upload a file when ``video_file`` is ``None``.
    """
    if video_file is not None:
        return analyze_reel(video_file)
    return "Lütfen bir video dosyası yükleyin."

# Assemble the Gradio UI: heading and description, an input/output row, and
# a button that triggers the analysis. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    
    # The video input and the result textbox are created inside the same row.
    with gr.Row():
        video_input = gr.Video(label="Instagram Reels Videosu")
        output = gr.Textbox(label="Analiz Sonucu", lines=10)
    
    analyze_button = gr.Button("Reels'i Analiz Et")
    # On click, gradio_interface receives the video input's value and its
    # return value is written to the result textbox.
    analyze_button.click(fn=gradio_interface, inputs=video_input, outputs=output)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    # NOTE(review): share=True presumably requests a public Gradio share link;
    # confirm that exposing the demo outside the local machine is intended.
    demo.launch(share=True)