import gradio as gr
import torch
import yaml
import os
from pathlib import Path
from modules.fslip import FastLip
from modules.base_model import BaseModel
import numpy as np
import cv2
from moviepy.editor import VideoFileClip
import tempfile

# Load configuration
def load_config():
    """Read and parse the model configuration YAML.

    The file is looked up relative to the current working directory at
    ``configs/lipgen/grid/lipgen_grid.yaml``.

    Returns:
        The parsed document exactly as produced by ``yaml.safe_load``.

    Raises:
        OSError: If the configuration file cannot be opened.
    """
    # Pin the encoding: without it, open() falls back to the locale's
    # default, which can mis-decode non-ASCII config content on some hosts.
    with open('configs/lipgen/grid/lipgen_grid.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

# Initialize model
def init_model():
    """Construct the FastLip model and populate it from the saved checkpoint.

    The architecture comes from the ``arch`` entry of the configuration
    returned by ``load_config``; weights come from the ``state_dict`` entry
    of ``checkpoints/lipgen_grid.pt``, mapped onto the CPU.

    Returns:
        The FastLip instance after ``eval()`` has been called on it.
    """
    arch = load_config()['arch']

    # No dictionary / output dims are supplied yet; a simple dictionary
    # still needs to be implemented.
    lip_model = FastLip(arch=arch, dictionary=None, out_dims=None)

    # Restore the trained weights.
    saved = torch.load('checkpoints/lipgen_grid.pt', map_location='cpu')
    weights = saved['state_dict']
    lip_model.load_state_dict(weights)

    lip_model.eval()
    return lip_model

# Process video frames
def process_video(video_path, target_language):
    """Run the lip model over a video's frames and write the result as MP4.

    Args:
        video_path: Location of the input video (``str`` or ``os.PathLike``).
        target_language: Requested dubbing language. Currently unused: text
            processing for the target language is not implemented yet.

    Returns:
        Path to ``output.mp4`` inside a newly created temporary directory.
        The directory is not removed here; the caller owns it.

    Raises:
        ValueError: If no frames could be read from the video.
        RuntimeError: If the output video writer could not be opened.
    """
    model = init_model()

    # Read every frame, then always release the reader (and its ffmpeg
    # subprocess) even if decoding fails part-way through.
    video = VideoFileClip(os.fspath(video_path))
    try:
        # cv2.resize takes (width, height): frames become 80 high, 160 wide.
        frames = [cv2.resize(frame, (160, 80)) for frame in video.iter_frames()]
    finally:
        video.close()

    if not frames:
        # Without this guard the permute below fails with an opaque
        # dimension error on the empty array.
        raise ValueError("No frames could be read from the input video")

    # (T, H, W, C) uint8 -> (T, C, H, W) float scaled to [0, 1].
    frames = torch.FloatTensor(np.array(frames)).permute(0, 3, 1, 2) / 255.0

    with torch.no_grad():
        # TODO: Implement text processing for target language
        # For now, we'll just return the processed frames
        output = model(frames.unsqueeze(0))

    # Assumes output['lip_out'] is (1, T, C, H, W) with values in [0, 1]
    # -- TODO confirm against the model.
    output_frames = output['lip_out'].squeeze(0).cpu().numpy()
    # Clip before the cast: out-of-range values would otherwise wrap around
    # when converted to uint8 and show up as speckle artefacts.
    output_frames = np.clip(output_frames * 255, 0, 255).astype(np.uint8)

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, 'output.mp4')

    height, width = output_frames.shape[2:4]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, 25.0, (width, height))
    if not out.isOpened():
        out.release()
        raise RuntimeError(f"Could not open video writer for {output_path}")

    # NOTE(review): moviepy yields RGB frames while cv2.VideoWriter expects
    # BGR. If the model preserves channel order, red and blue will be swapped
    # in the output -- confirm and convert if needed.
    try:
        for frame in output_frames:
            # (C, H, W) -> (H, W, C); transpose returns a strided view, so
            # hand OpenCV a contiguous copy.
            out.write(np.ascontiguousarray(frame.transpose(1, 2, 0)))
    finally:
        out.release()

    return output_path

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the video dubbing demo.

    The layout has an input column (video upload, target-language dropdown,
    "Dub Video" button) and an output column (status text, result video).
    Clicking the button runs ``process_video`` on the uploaded file.

    Returns:
        The constructed ``gr.Blocks`` app; it is not launched here.
    """
    with gr.Blocks(title="ParaLip Video Dubbing") as demo:
        gr.Markdown("""
        # ParaLip Video Dubbing
        Upload a video and select a target language to create a dubbed version.
        """)

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video")
                language = gr.Dropdown(
                    choices=["spanish", "french", "german", "italian", "portuguese"],
                    value="spanish",
                    label="Target Language"
                )
                dub_button = gr.Button("Dub Video")

            with gr.Column():
                status = gr.Textbox(label="Status")
                video_output = gr.Video(label="Dubbed Video")

        def process_video_wrapper(video_file, target_lang):
            """Click handler: dub the upload and report the outcome.

            Args:
                video_file: Value from the video input. Gradio's Video
                    component passes a filepath string; a file-like object
                    with ``read()`` is also accepted.
                target_lang: Language selected in the dropdown.

            Returns:
                ``(status message, output video path or None)``.
            """
            if video_file is None:
                return "Please upload a video file", None

            spooled_path = None  # set only if we create our own temp copy
            try:
                if hasattr(video_file, "read"):
                    # File-like input: spool it to a uniquely named temp file
                    # so concurrent requests cannot overwrite each other.
                    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
                        spooled_path = tmp.name
                        tmp.write(video_file.read())
                    source_path = spooled_path
                else:
                    # Already a path on disk; use it directly instead of
                    # calling .read() on a string.
                    source_path = os.fspath(video_file)

                output_path = process_video(source_path, target_lang)
                return "Dubbing completed successfully!", output_path

            except Exception as e:
                # UI boundary: surface the failure in the status box rather
                # than letting the request crash.
                return f"Error during dubbing: {str(e)}", None

            finally:
                # Remove our temp copy on both success and failure; never
                # delete the file Gradio handed us.
                if spooled_path is not None and os.path.exists(spooled_path):
                    os.remove(spooled_path)

        dub_button.click(
            fn=process_video_wrapper,
            inputs=[video_input, language],
            outputs=[status, video_output]
        )

    return demo

# Script entry point: only runs when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # Build the Blocks UI, then start serving it with launch()'s defaults.
    demo = create_interface()
    demo.launch()