import time
from datetime import datetime

import cv2
import gradio as gr
import librosa
import numpy as np
import pandas as pd
from fer import FER
from transformers import pipeline

# Initialize models
emotion_detector = FER(mtcnn=True)  # Facial expression recognition
voice_classifier = pipeline("audio-classification", model="superb/hubert-base-superb-er")

# Global variables to store results
emotion_history = []
current_emotions = {"face": "Neutral", "voice": "Neutral"}
last_update_time = time.time()


def analyze_face(frame):
    """Analyze facial expressions in the frame."""
    try:
        # Convert frame to RGB (FER expects RGB)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect emotions
        results = emotion_detector.detect_emotions(rgb_frame)

        if results:
            emotions = results[0]["emotions"]
            dominant_emotion = max(emotions, key=emotions.get)
            return dominant_emotion, emotions

        return "Neutral", {"angry": 0, "disgust": 0, "fear": 0, "happy": 0,
                           "sad": 0, "surprise": 0, "neutral": 1}
    except Exception as e:
        print(f"Face analysis error: {e}")
        return "Neutral", {"angry": 0, "disgust": 0, "fear": 0, "happy": 0,
                           "sad": 0, "surprise": 0, "neutral": 1}


def analyze_voice(audio):
    """Analyze voice tone from audio."""
    try:
        sr, y = audio

        # Normalize integer PCM samples to float32 in [-1, 1]
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
        else:
            y = y.astype(np.float32)

        # Convert to mono if stereo (Gradio returns audio as (samples, channels))
        if y.ndim > 1:
            y = np.mean(y, axis=1)

        # Resample to 16 kHz if needed
        if sr != 16000:
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
            sr = 16000

        # Classify emotion
        result = voice_classifier({"sampling_rate": sr, "raw": y})

        # Map the model's abbreviated labels (e.g. "neu", "hap", "ang") to the full
        # names used elsewhere in this app; unknown labels pass through unchanged
        label_map = {"neu": "neutral", "hap": "happy", "ang": "angry", "sad": "sad"}
        dominant_emotion = label_map.get(result[0]["label"], result[0]["label"])

        return dominant_emotion, result
    except Exception as e:
        print(f"Voice analysis error: {e}")
        return "neutral", [{"label": "neutral", "score": 1.0}]


def update_emotion_history(face_emotion, voice_emotion):
    """Update the emotion history and current emotions."""
    global current_emotions, emotion_history, last_update_time

    current_time = datetime.now().strftime("%H:%M:%S")

    # Update current emotions
    current_emotions = {
        "face": face_emotion,
        "voice": voice_emotion,
        "timestamp": current_time,
    }

    # Add to history every 5 seconds, or immediately if the history is empty
    if (time.time() - last_update_time) > 5 or not emotion_history:
        emotion_history.append({
            "timestamp": current_time,
            "face": face_emotion,
            "voice": voice_emotion,
        })
        last_update_time = time.time()

    # Keep only the last 20 entries
    if len(emotion_history) > 20:
        emotion_history = emotion_history[-20:]


def get_emotion_timeline():
    """Create a timeline DataFrame for display."""
    if not emotion_history:
        return pd.DataFrame(columns=["Time", "Facial Emotion", "Voice Emotion"])

    df = pd.DataFrame(emotion_history)
    df = df.rename(columns={
        "timestamp": "Time",
        "face": "Facial Emotion",
        "voice": "Voice Emotion",
    })
    return df


def get_practitioner_advice(face_emotion, voice_emotion):
    """Generate suggestions based on detected emotions."""
    advice = []

    # Facial emotion advice.
    # Note: FER's default label set is angry/disgust/fear/happy/sad/surprise/neutral,
    # so the "confused" and "pain" branches only fire if a detector reporting those
    # labels is substituted in.
    if face_emotion in ["sad", "fear"]:
        advice.append("Patient appears distressed. Consider speaking more slowly and with reassurance.")
    elif face_emotion == "angry":
        advice.append("Patient seems frustrated. Acknowledge their concerns and maintain a calm demeanor.")
    elif face_emotion == "confused":
        advice.append("Patient may not understand. Consider rephrasing or providing more explanation.")
    elif face_emotion == "pain":
        advice.append("Patient appears to be in pain. Consider asking about discomfort.")

    # Voice emotion advice
    if voice_emotion in ["sad", "fear"]:
        advice.append("Patient's tone suggests anxiety. Provide clear explanations and emotional support.")
    elif voice_emotion == "angry":
        advice.append("Patient sounds upset. Practice active listening and validate their feelings.")
    elif voice_emotion == "happy":
        advice.append("Patient seems positive. This may be a good time to discuss treatment options.")

    return "\n".join(advice) if advice else "Patient appears neutral. Continue with consultation."


def process_input(video, audio):
    """Process video and audio inputs to detect emotions."""
    try:
        # Process video frame
        if video is not None:
            frame = cv2.cvtColor(video, cv2.COLOR_RGB2BGR)
            face_emotion, face_details = analyze_face(frame)
        else:
            face_emotion, face_details = "Neutral", {}

        # Process audio
        if audio is not None:
            voice_emotion, voice_details = analyze_voice(audio)
        else:
            voice_emotion, voice_details = "neutral", {}

        # Update history and build outputs
        update_emotion_history(face_emotion, voice_emotion)
        timeline_df = get_emotion_timeline()
        advice = get_practitioner_advice(face_emotion, voice_emotion)

        # Return values in the same order as the Gradio output components
        return (
            face_emotion,
            voice_emotion,
            timeline_df,
            advice,
            str(face_details),
            str(voice_details),
        )
    except Exception as e:
        print(f"Processing error: {e}")
        return "Error", "Error", pd.DataFrame(), "System error occurred", "", ""


# Gradio interface
with gr.Blocks(title="Patient Emotion Recognition", theme="soft") as demo:
    gr.Markdown("# Real-Time Patient Emotion Recognition")
    gr.Markdown("Analyze facial expressions and voice tone during medical consultations")

    with gr.Row():
        with gr.Column():
            video_input = gr.Image(label="Live Camera Feed", source="webcam", streaming=True)
            audio_input = gr.Audio(label="Voice Input", source="microphone", type="numpy")
            submit_btn = gr.Button("Analyze Emotions")

        with gr.Column():
            current_face = gr.Textbox(label="Current Facial Emotion")
            current_voice = gr.Textbox(label="Current Voice Emotion")
            advice_output = gr.Textbox(label="Practitioner Suggestions", lines=3)
            timeline_output = gr.Dataframe(label="Emotion Timeline", interactive=False)
            face_details = gr.Textbox(label="Face Analysis Details", visible=False)
            voice_details = gr.Textbox(label="Voice Analysis Details", visible=False)

    # Live processing
    video_input.change(
        process_input,
        inputs=[video_input, audio_input],
        outputs=[current_face, current_voice, timeline_output, advice_output, face_details, voice_details],
        show_progress="hidden",
    )

    audio_input.change(
        process_input,
        inputs=[video_input, audio_input],
        outputs=[current_face, current_voice, timeline_output, advice_output, face_details, voice_details],
        show_progress="hidden",
    )

    submit_btn.click(
        process_input,
        inputs=[video_input, audio_input],
        outputs=[current_face, current_voice, timeline_output, advice_output, face_details, voice_details],
    )

if __name__ == "__main__":
    demo.launch(debug=True)