Spaces:

yunusajib
/

GP_Consultation_Summerizer

Build error

App Files Files Community

yunusajib committed on Jun 12

Commit

382658c

verified ·

1 Parent(s): f0c9235

use cnn model

Browse files

Files changed (1) hide show

app.py +687 -166

app.py CHANGED Viewed

@@ -1,190 +1,711 @@
 import gradio as gr
-import os
-import tempfile
 import torch
-from pydub import AudioSegment
-import whisper
-from pyannote.audio import Pipeline
-from pyannote.core import Segment
-from lmdeploy import pipeline as lm_pipeline
-from lmdeploy import GenerationConfig, TurbomindEngineConfig
-from transformers import pipeline as hf_pipeline
-from presidio_analyzer import AnalyzerEngine
-from presidio_anonymizer import AnonymizerEngine
-# --- Configuration ---
-MEDICAL_NER_MODEL = "d4data/biomedical-ner-all"
-WHISPER_MODEL_SIZE = "base"  # "small" or "medium" for better accuracy
-DEFAULT_HF_TOKEN = "your_huggingface_token_here"  # Replace with your token
-# --- Global Models ---
-whisper_model = None
-diarization_pipeline = None
-med_ner = None
-phi_analyzer = AnalyzerEngine()
-phi_anonymizer = AnonymizerEngine()
-qwen_models = {
-    "Qwen Medical 7B": "Qwen/Qwen2.5-7B-Instruct-1M",
-    "Qwen Fast 3B": "Qwen/Qwen2.5-3B-Instruct",
-}
-# --- Helper Functions ---
-def load_models(hf_token):
-    """Load all required models"""
-    global whisper_model, diarization_pipeline, med_ner
-    try:
-        # Load Whisper
-        if whisper_model is None:
-            whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device="cuda" if torch.cuda.is_available() else "cpu")
-        # Load Diarization
-        if diarization_pipeline is None:
-            diarization_pipeline = Pipeline.from_pretrained(
-                "pyannote/speaker-diarization-3.1",
-                use_auth_token=hf_token
-            )
-        # Load Medical NER
-        if med_ner is None:
-            med_ner = hf_pipeline("ner", model=MEDICAL_NER_MODEL, aggregation_strategy="simple")
-        return "Models loaded successfully", None  # Return tuple matching expected outputs
-    except Exception as e:
-        return f"Error loading models: {str(e)}", None
-def convert_audio_to_wav(input_file):
-    """Convert any audio file to 16kHz WAV format"""
-    audio = AudioSegment.from_file(input_file)
-    wav_path = os.path.join(tempfile.gettempdir(), "consultation.wav")
-    audio.set_frame_rate(16000).export(wav_path, format="wav")
-    return wav_path
-def anonymize_phi(text):
-    """Remove personally identifiable health information"""
-    results = phi_analyzer.analyze(text=text, language="en")
-    anonymized = phi_anonymizer.anonymize(text, results)
-    return anonymized.text
-# --- Core Processing Functions ---
-def transcribe_and_diarize(audio_file, hf_token):
-    """Convert audio to text with speaker labels"""
-    try:
-        # Convert audio
-        wav_path = convert_audio_to_wav(audio_file)
-        # Transcribe
-        transcript = whisper_model.transcribe(wav_path)["segments"]
-        # Diarize
-        diarization = diarization_pipeline(wav_path)
-        # Combine results
-        output = []
-        for seg in transcript:
-            start, end, text = seg["start"], seg["end"], seg["text"]
-            speaker = next(diarization.itertracks(yield_label=True)).label
-            output.append(f"[{start:.1f}s] {speaker}: {text}")
-        return "\n".join(output), transcript
-    except Exception as e:
-        return f"Error: {str(e)}", None
-def extract_medical_entities(text):
-    """Identify drugs, conditions, and procedures"""
-    entities = med_ner(text)
-    return {
-        "Drugs": [e["word"] for e in entities if e["entity_group"] == "DRUG"],
-        "Conditions": [e["word"] for e in entities if e["entity_group"] == "DISEASE"],
-        "Procedures": [e["word"] for e in entities if e["entity_group"] == "TREATMENT"]
-    }
-def generate_soap_notes(transcript, model_choice, anonymize_phi_flag):
-    """Generate structured medical notes using Qwen"""
-    # Anonymize if requested
-    if anonymize_phi_flag:
-        transcript = anonymize_phi(transcript)
-    # Initialize Qwen
-    engine_config = TurbomindEngineConfig(
-        cache_max_entry_count=0.5,
-        session_len=131072
-    )
-    pipe = lm_pipeline(qwen_models[model_choice], backend_config=engine_config)
-    # Medical prompt template
-    system_prompt = """You are a clinical assistant. Convert this doctor-patient conversation into SOAP notes:
-    - Subjective: Patient-reported symptoms
-    - Objective: Clinician observations
-    - Assessment: Diagnosis/differential
-    - Plan: Treatment and follow-up"""
-    response = pipe([{
-        "role": "system",
-        "content": system_prompt
-    }, {
-        "role": "user",
-        "content": f"Consultation Transcript:\n{transcript}\n\nGenerate concise SOAP notes:"
-    }], GenerationConfig(max_new_tokens=1024))
-    return response.text
-# --- Gradio Interface ---
-with gr.Blocks(title="Clinical Consultation Summarizer", theme=gr.themes.Soft()) as app:
-    gr.Markdown("""# 🩺 Patient-Doctor Consultation Summarizer""")
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
-                label="Upload Consultation Recording"
-            )
-            hf_token = gr.Textbox(
-                label="Hugging Face Token",
-                value=DEFAULT_HF_TOKEN,
-                type="password"
-            )
-            model_choice = gr.Dropdown(
-                choices=list(qwen_models.keys()),
-                value="Qwen Medical 7B",
-                label="Model"
             )
-            anonymize_check = gr.Checkbox(
-                label="Anonymize Protected Health Info (PHI)",
-                value=True
             )
-            process_btn = gr.Button("Process Consultation")
-        with gr.Column():
-            with gr.Tabs():
-                with gr.Tab("Transcript"):
-                    transcript_output = gr.Textbox(label="Transcribed Conversation", lines=15)
-                with gr.Tab("SOAP Notes"):
-                    soap_output = gr.Textbox(label="Clinical Summary", lines=15)
-                with gr.Tab("Medical Entities"):
-                    entity_output = gr.JSON(label="Extracted Medical Terms")
-    # Processing
-    process_btn.click(
-    fn=lambda audio, token: load_models(token),  # Just load models first
-    inputs=[audio_input, hf_token],
-    outputs=[transcript_output, gr.State()]
-    ).then(
-        fn=transcribe_and_diarize,
-        inputs=[audio_input, hf_token],
-        outputs=[transcript_output, gr.State()]
-    ).then(
-        fn=generate_soap_notes,
-        inputs=[transcript_output, model_choice, anonymize_check],
-        outputs=soap_output
-    ).then(
-        fn=extract_medical_entities,
-        inputs=transcript_output,
-        outputs=entity_output
-    )
-if __name__ == "__main__":
-    app.launch(server_port=7860, share=True)

 import gradio as gr
+import cv2
+import numpy as np
+import librosa
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+from datetime import datetime, timedelta
+import warnings
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+from PIL import Image
+import dlib
+import pickle
+from sklearn.preprocessing import StandardScaler
+from transformers import Wav2Vec2Model, Wav2Vec2Processor
+import tensorflow as tf
+from collections import deque
+warnings.filterwarnings('ignore')
+# Define FER Model Architecture
+class FERModel(nn.Module):
+    """Four-stage CNN classifier for single-channel 48x48 face crops.
+
+    Each stage is a 3x3 convolution, ReLU and 2x2 max-pool, so a 48x48 input
+    reaches the classifier head as a 512 x 3 x 3 feature map. ``forward``
+    returns softmax probabilities, not logits.
+
+    Args:
+        num_classes: Number of emotion classes produced by the final layer.
+    """
+    def __init__(self, num_classes=7):
+        super().__init__()
+        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
+        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
+        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.dropout = nn.Dropout(0.5)
+        self.fc1 = nn.Linear(512 * 3 * 3, 512)
+        self.fc2 = nn.Linear(512, 256)
+        self.fc3 = nn.Linear(256, num_classes)
+    def forward(self, x):
+        """Return class probabilities of shape (batch, num_classes).
+
+        Raises:
+            RuntimeError: If the pooled feature map does not flatten to
+                512 * 3 * 3 values per sample (input of the wrong size).
+        """
+        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
+            x = self.pool(F.relu(conv(x)))
+        # Flatten per sample instead of view(-1, 4608): a wrongly sized input
+        # must fail in fc1 rather than be silently folded into extra batch rows.
+        x = x.flatten(1)
+        x = self.dropout(F.relu(self.fc1(x)))
+        x = self.dropout(F.relu(self.fc2(x)))
+        x = self.fc3(x)
+        return F.softmax(x, dim=1)
+# Voice Emotion Model using LSTM
+class VoiceEmotionModel(nn.Module):
+    """LSTM classifier over sequences of voice feature vectors.
+
+    The LSTM is batch-first, so inputs are (batch, sequence, input_size).
+    Only the last time step feeds the two-layer head, and ``forward``
+    returns softmax probabilities of shape (batch, num_classes).
+    """
+    def __init__(self, input_size=13, hidden_size=128, num_layers=2, num_classes=6):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.lstm = nn.LSTM(
+            input_size,
+            hidden_size,
+            num_layers,
+            batch_first=True,
+            dropout=0.3,
+        )
+        self.fc1 = nn.Linear(hidden_size, 64)
+        self.fc2 = nn.Linear(64, num_classes)
+        self.dropout = nn.Dropout(0.5)
+    def forward(self, x):
+        """Run the sequence through the LSTM and classify its final step."""
+        state_shape = (self.num_layers, x.size(0), self.hidden_size)
+        # Explicit all-zero hidden and cell states on the input's device.
+        initial_state = (
+            torch.zeros(*state_shape).to(x.device),
+            torch.zeros(*state_shape).to(x.device),
+        )
+        sequence_out, _ = self.lstm(x, initial_state)
+        last_step = sequence_out[:, -1, :]
+        hidden = self.dropout(F.relu(self.fc1(last_step)))
+        logits = self.fc2(hidden)
+        return F.softmax(logits, dim=1)
+class RealEmotionAnalyzer:
+    def __init__(self):
+        """Pick a device, define the label sets, load models and build face preprocessing."""
+        use_cuda = torch.cuda.is_available()
+        self.device = torch.device('cuda' if use_cuda else 'cpu')
+        print(f"Using device: {self.device}")
+        # Label order matters: the analyze_* methods report model output
+        # index i under label i of the matching list.
+        self.face_emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
+        self.voice_emotions = ['calm', 'angry', 'fearful', 'happy', 'sad', 'surprised']
+        # Placeholders that _load_models() fills in.
+        self.face_model = None
+        self.voice_model = None
+        self.face_detector = None
+        self.voice_scaler = None
+        self._load_models()
+        # Session data
+        self.session_data = []
+        # Face crops become 48x48 grayscale tensors normalised with mean 0.5 / std 0.5.
+        preprocessing_steps = [
+            transforms.Grayscale(),
+            transforms.Resize((48, 48)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5,), (0.5,)),
+        ]
+        self.face_transform = transforms.Compose(preprocessing_steps)
+    def _load_models(self):
+        """Create the face detector, both emotion networks and the feature scaler.
+
+        NOTE: no trained weights are loaded here. Both networks keep their
+        random initialisation, so their predictions are placeholders until
+        real checkpoints are loaded.
+
+        The detector and the networks are set up in separate try blocks so a
+        failure in one does not leave the other as None, and a network failure
+        does not replace a detector that loaded successfully.
+        """
+        # Face detection: prefer dlib, fall back to OpenCV's Haar cascade.
+        try:
+            self.face_detector = dlib.get_frontal_face_detector()
+            print("✓ Face detector loaded")
+        except Exception as e:
+            print(f"Error loading models: {e}")
+            self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+        try:
+            # Facial emotion network (demo: randomly initialised).
+            # In production, load trained weights, e.g. a state dict saved as
+            # 'fer_model.pth' with map_location=self.device.
+            self.face_model = FERModel(num_classes=7)
+            self.face_model.eval()
+            self.face_model.to(self.device)
+            print("✓ Facial emotion model initialized")
+            # Voice emotion network (demo: randomly initialised).
+            self.voice_model = VoiceEmotionModel(input_size=13, num_classes=6)
+            self.voice_model.eval()
+            self.voice_model.to(self.device)
+            print("✓ Voice emotion model initialized")
+            # Unfitted scaler placeholder. In production, load a fitted scaler
+            # from a trusted file only (unpickling untrusted data is unsafe).
+            self.voice_scaler = StandardScaler()
+        except Exception as e:
+            print(f"Error loading models: {e}")
+    def detect_faces(self, frame):
+        """Return face boxes found in a frame as (x, y, w, h) tuples.
+
+        A callable detector is treated as the dlib detector; anything else is
+        used as an OpenCV cascade (created on demand when no detector is set).
+        Boxes are clamped to the frame so callers can slice with them directly,
+        and boxes that fall entirely outside the frame are dropped.
+
+        Args:
+            frame: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
+
+        Returns:
+            List of (x, y, w, h) integer tuples; empty when no face is found
+            or detection fails (the error is printed, not raised).
+        """
+        faces = []
+        try:
+            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            frame_h, frame_w = gray.shape[:2]
+            if callable(self.face_detector):
+                # Using dlib
+                boxes = [
+                    (rect.left(), rect.top(), rect.width(), rect.height())
+                    for rect in self.face_detector(gray)
+                ]
+            else:
+                # Fallback to OpenCV
+                if self.face_detector is None:
+                    self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+                # detectMultiScale yields an empty tuple (not an array) when
+                # nothing is found, so iterate instead of calling .tolist().
+                boxes = self.face_detector.detectMultiScale(gray, 1.1, 4)
+            for x, y, w, h in boxes:
+                # Detectors may report boxes partly outside the image; negative
+                # coordinates would otherwise wrap around when slicing.
+                left, top = max(int(x), 0), max(int(y), 0)
+                right = min(int(x) + int(w), frame_w)
+                bottom = min(int(y) + int(h), frame_h)
+                if right > left and bottom > top:
+                    faces.append((left, top, right - left, bottom - top))
+        except Exception as e:
+            print(f"Face detection error: {e}")
+        return faces
+    def analyze_facial_expression(self, frame):
+        """Classify the emotion of the first face detected in a frame.
+
+        Returns a dict mapping every label in ``self.face_emotions`` to the
+        model's probability, or ``{'neutral': 1.0}`` when no face is found,
+        the crop is empty, or any step raises (the error is printed).
+        """
+        try:
+            detected = self.detect_faces(frame)
+            if not detected:
+                return {'neutral': 1.0}
+            # Only the first detected face is analysed.
+            left, top, width, height = detected[0]
+            crop = frame[top:top + height, left:left + width]
+            if crop.size == 0:
+                return {'neutral': 1.0}
+            # OpenCV frames are BGR; PIL and the transform pipeline expect RGB.
+            rgb_crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
+            batch = self.face_transform(Image.fromarray(rgb_crop)).unsqueeze(0).to(self.device)
+            with torch.no_grad():
+                probabilities = self.face_model(batch).cpu().numpy()[0]
+            return {
+                label: float(probabilities[index])
+                for index, label in enumerate(self.face_emotions)
+            }
+        except Exception as e:
+            print(f"Facial expression analysis error: {e}")
+            # Return neutral emotion as fallback
+            return {'neutral': 1.0}
+    def extract_voice_features(self, audio_data, sample_rate):
+        """Return the 13 time-averaged MFCCs of an audio clip.
+
+        The previous implementation also computed spectral centroid, rolloff,
+        zero-crossing rate, pitch and energy, then truncated the vector to its
+        first 13 entries. That discarded all of them, so only the MFCC means
+        were ever returned. The unused (and costly) computations are dropped.
+
+        Args:
+            audio_data: Sample array passed to ``librosa.feature.mfcc``.
+            sample_rate: Sampling rate of ``audio_data`` in Hz.
+
+        Returns:
+            Array of 13 values; all zeros if extraction fails (the error is
+            printed, not raised).
+        """
+        try:
+            mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
+            # Average over the time axis; float64 matches the dtype of the
+            # zero-vector fallback below.
+            return np.mean(mfcc, axis=1).astype(np.float64)
+        except Exception as e:
+            print(f"Voice feature extraction error: {e}")
+            return np.zeros(13)
+    def analyze_voice_emotion(self, audio_data, sample_rate):
+        """Classify the emotion of one audio clip.
+
+        Returns a dict mapping every label in ``self.voice_emotions`` to the
+        model's probability, or ``{'calm': 1.0}`` for missing/empty audio or
+        when any step raises (the error is printed).
+        """
+        try:
+            if audio_data is None or len(audio_data) == 0:
+                return {'calm': 1.0}
+            raw_features = self.extract_voice_features(audio_data, sample_rate)
+            # Demo normalisation: standardise across this clip's own feature
+            # vector (a fitted scaler would be used in production).
+            centered = raw_features - np.mean(raw_features)
+            features = centered / (np.std(raw_features) + 1e-8)
+            # Shape (1, 1, n_features): a batch of one single-step sequence.
+            model_input = torch.FloatTensor(features).unsqueeze(0).unsqueeze(0).to(self.device)
+            with torch.no_grad():
+                probabilities = self.voice_model(model_input).cpu().numpy()[0]
+            return {
+                label: float(probabilities[index])
+                for index, label in enumerate(self.voice_emotions)
+            }
+        except Exception as e:
+            print(f"Voice emotion analysis error: {e}")
+            return {'calm': 1.0}
+    def process_consultation_data(self, video_file, audio_file):
+        """Analyse a consultation's video and/or audio and collect emotion samples.
+
+        Args:
+            video_file: Path to a video file, or None to skip facial analysis.
+            audio_file: Path to an audio file, or None to skip voice analysis.
+
+        Returns:
+            dict with four lists: 'timestamp' (seconds), 'facial_emotions' and
+            'voice_emotions' (emotion -> probability dicts) and 'alerts'
+            (human-readable strings). With video, the timeline has one entry
+            per sampled frame and each voice entry comes from the audio chunk
+            covering that time. With audio only, the timeline is the start
+            time of every analysed chunk.
+        """
+        results = {
+            'timestamp': [],
+            'facial_emotions': [],
+            'voice_emotions': [],
+            'alerts': []
+        }
+        if video_file is not None:
+            print("Processing video...")
+            self._collect_facial_emotions(video_file, results)
+            print(f"Processed {len(results['timestamp'])} video frames")
+        if audio_file is not None:
+            print("Processing audio...")
+            try:
+                chunk_count = self._collect_voice_emotions(audio_file, results)
+                print(f"Processed {chunk_count} audio chunks")
+            except Exception as e:
+                print(f"Audio processing error: {e}")
+        return results
+    def _collect_facial_emotions(self, video_file, results, max_frames=300):
+        """Sample about one frame per second of video into ``results`` (demo cap: max_frames)."""
+        cap = cv2.VideoCapture(video_file)
+        try:
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            # Missing, NaN or sub-1 fps would make the sampling stride zero.
+            if not (fps and fps >= 1):
+                fps = 30
+            frames_per_sample = int(fps)
+            for frame_count in range(max_frames):
+                # Read exactly once per iteration so no frame is skipped and
+                # frame_count / fps stays the true position in the video.
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                if frame_count % frames_per_sample != 0:
+                    continue
+                facial_emotions = self.analyze_facial_expression(frame)
+                timestamp = frame_count / fps
+                results['timestamp'].append(timestamp)
+                results['facial_emotions'].append(facial_emotions)
+                # Check for alerts
+                if (facial_emotions.get('sad', 0) > 0.4 or
+                        facial_emotions.get('fear', 0) > 0.3 or
+                        facial_emotions.get('angry', 0) > 0.3):
+                    emotion_type = max(facial_emotions, key=facial_emotions.get)
+                    results['alerts'].append(f"High {emotion_type} detected at {timestamp:.1f}s")
+        finally:
+            # Release the capture even if analysis raises.
+            cap.release()
+    def _collect_voice_emotions(self, audio_file, results, chunk_duration=3):
+        """Analyse audio in fixed chunks, merge into ``results`` and return the chunk count."""
+        audio_data, sample_rate = librosa.load(audio_file, duration=120)  # Limit for demo
+        chunk_samples = chunk_duration * sample_rate
+        analysed = {}  # chunk index -> (start time in seconds, emotions)
+        for start in range(0, len(audio_data), chunk_samples):
+            chunk = audio_data[start:start + chunk_samples]
+            if len(chunk) <= sample_rate:  # Minimum 1 second
+                continue
+            voice_emotions = self.analyze_voice_emotion(chunk, sample_rate)
+            timestamp = start / sample_rate
+            analysed[start // chunk_samples] = (timestamp, voice_emotions)
+            # Check for voice-based alerts
+            if (voice_emotions.get('angry', 0) > 0.4 or
+                    voice_emotions.get('fearful', 0) > 0.4 or
+                    voice_emotions.get('sad', 0) > 0.4):
+                emotion_type = max(voice_emotions, key=voice_emotions.get)
+                results['alerts'].append(f"Voice {emotion_type} detected at {timestamp:.1f}s")
+        if results['timestamp']:
+            # Video defines the timeline: give each video timestamp the
+            # emotions of the audio chunk that covers it.
+            for video_time in results['timestamp']:
+                entry = analysed.get(int(video_time // chunk_duration))
+                if entry is None:
+                    break  # audio ends before the video does
+                results['voice_emotions'].append(entry[1])
+        else:
+            # Audio only: every analysed chunk contributes a timeline entry.
+            for index in sorted(analysed):
+                timestamp, voice_emotions = analysed[index]
+                results['timestamp'].append(timestamp)
+                results['voice_emotions'].append(voice_emotions)
+        return len(analysed)
+# Initialize analyzer
+# Module-level instance used by process_consultation() and real_time_analysis().
+# Constructing it runs model setup (_load_models) as soon as this module is executed.
+print("Initializing Real Emotion Analyzer...")
+analyzer = RealEmotionAnalyzer()
+def create_emotion_timeline(data):
+    """Build a line chart of emotion confidence over time.
+
+    Facial emotions are drawn as solid lines and voice emotions as dashed
+    lines. An emotion is plotted only if it exceeds 0.1 in at least one
+    sample; voice series are plotted only when they cover every timestamp.
+    Returns an empty figure when there are no timestamps.
+    """
+    timestamps = data['timestamp']
+    fig = go.Figure()
+    if not timestamps:
+        return fig
+    face_samples = data['facial_emotions']
+    if face_samples:
+        face_palette = {
+            'happy': '#2E8B57', 'sad': '#4169E1', 'angry': '#DC143C',
+            'fear': '#9932CC', 'surprise': '#FF8C00', 'disgust': '#8B4513', 'neutral': '#708090'
+        }
+        for name in ('happy', 'sad', 'angry', 'fear', 'neutral'):
+            series = [sample.get(name, 0) for sample in face_samples]
+            if not any(score > 0.1 for score in series):
+                continue
+            fig.add_trace(go.Scatter(
+                x=timestamps,
+                y=series,
+                mode='lines+markers',
+                name=f'Face: {name.title()}',
+                line=dict(width=2, color=face_palette.get(name, '#000000')),
+                marker=dict(size=4)
+            ))
+    voice_samples = data['voice_emotions']
+    if voice_samples:
+        voice_palette = {
+            'calm': '#228B22', 'angry': '#B22222', 'fearful': '#800080',
+            'happy': '#FFD700', 'sad': '#4682B4', 'surprised': '#FF6347'
+        }
+        # Never use more voice samples than there are timestamps.
+        aligned = voice_samples[:len(timestamps)]
+        for name in ('calm', 'angry', 'fearful', 'happy', 'sad'):
+            series = [sample.get(name, 0) for sample in aligned]
+            if any(score > 0.1 for score in series) and len(series) == len(timestamps):
+                fig.add_trace(go.Scatter(
+                    x=timestamps,
+                    y=series,
+                    mode='lines+markers',
+                    name=f'Voice: {name.title()}',
+                    line=dict(dash='dash', width=2, color=voice_palette.get(name, '#000000')),
+                    marker=dict(size=4, symbol='diamond')
+                ))
+    fig.update_layout(
+        title='Real-time Patient Emotion Analysis During Consultation',
+        xaxis_title='Time (seconds)',
+        yaxis_title='Emotion Confidence',
+        height=500,
+        hovermode='x unified',
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
+    )
+    return fig
+def create_emotion_summary(data):
+    """Return (face_figure, voice_figure) pie charts of summed emotion scores.
+
+    Each chart includes only emotions whose summed score exceeds 0.1. A
+    channel with no samples, or with nothing above that threshold, gets an
+    empty figure.
+    """
+    def build_pie(samples, title):
+        figure = go.Figure()
+        if not samples:
+            return figure
+        totals = {}
+        for sample in samples:
+            for name, score in sample.items():
+                totals[name] = totals.get(name, 0) + score
+        # Only show emotions with significant presence
+        significant = {name: total for name, total in totals.items() if total > 0.1}
+        if significant:
+            figure = px.pie(
+                values=list(significant.values()),
+                names=list(significant.keys()),
+                title=title
+            )
+            figure.update_traces(textposition='inside', textinfo='percent+label')
+        return figure
+    if not data['facial_emotions'] and not data['voice_emotions']:
+        return go.Figure(), go.Figure()
+    face_fig = build_pie(data['facial_emotions'], 'Facial Expression Distribution')
+    voice_fig = build_pie(data['voice_emotions'], 'Voice Emotion Distribution')
+    return face_fig, voice_fig
+def generate_clinical_recommendations(data):
+    """Turn analysed emotion data into a Markdown recommendations report.
+
+    Args:
+        data: dict that may contain 'alerts' (list of strings) and
+            'facial_emotions' / 'voice_emotions' (lists of emotion -> score
+            dicts). Missing keys are treated as empty.
+
+    Returns:
+        Newline-joined Markdown text: up to five alerts, one section per
+        emotion whose average score crosses its threshold, a "no concerns"
+        note when no alert or concern section fired, the dominant emotions
+        and general guidance.
+    """
+    recommendations = []
+    alerts = data.get('alerts', [])
+    # Tracks whether any alert or concern section was emitted. The original
+    # `if not recommendations` check could never fire because the
+    # overall-assessment header was always appended first.
+    concerns_found = bool(alerts)
+    if alerts:
+        recommendations.append("🚨 **CRITICAL ALERTS DETECTED:**")
+        recommendations.append("")
+        for alert in alerts[:5]:
+            recommendations.append(f"• {alert}")
+        recommendations.append("")
+    facial_analysis = _average_emotion_scores(data.get('facial_emotions'))
+    voice_analysis = _average_emotion_scores(data.get('voice_emotions'))
+    # Generate specific recommendations
+    if facial_analysis.get('sad', 0) > 0.3 or voice_analysis.get('sad', 0) > 0.3:
+        concerns_found = True
+        recommendations.append("😢 **DEPRESSION/SADNESS INDICATORS:**")
+        recommendations.append("• Patient shows signs of sadness or low mood")
+        recommendations.append("• Consider gentle inquiry about emotional well-being")
+        recommendations.append("• Provide emotional support and validation")
+        recommendations.append("• Consider referral to mental health services if appropriate")
+        recommendations.append("")
+    if facial_analysis.get('fear', 0) > 0.25 or voice_analysis.get('fearful', 0) > 0.25:
+        concerns_found = True
+        recommendations.append("😰 **ANXIETY/FEAR DETECTION:**")
+        recommendations.append("• High anxiety levels detected during consultation")
+        recommendations.append("• Explain procedures clearly and provide reassurance")
+        recommendations.append("• Allow extra time for questions and concerns")
+        recommendations.append("• Consider anxiety management techniques")
+        recommendations.append("")
+    if facial_analysis.get('angry', 0) > 0.2 or voice_analysis.get('angry', 0) > 0.2:
+        concerns_found = True
+        recommendations.append("😠 **FRUSTRATION/ANGER INDICATORS:**")
+        recommendations.append("• Patient may be experiencing frustration")
+        recommendations.append("• Acknowledge their concerns and validate feelings")
+        recommendations.append("• Remain calm and professional")
+        recommendations.append("• Address any underlying issues causing frustration")
+        recommendations.append("")
+    if not concerns_found:
+        recommendations.append("✅ **No significant emotional concerns detected.**")
+        recommendations.append("Continue with standard consultation approach.")
+        recommendations.append("")
+    if voice_analysis.get('calm', 0) > 0.6 and facial_analysis.get('neutral', 0) > 0.4:
+        recommendations.append("✅ **POSITIVE CONSULTATION INDICATORS:**")
+        recommendations.append("• Patient appears comfortable and engaged")
+        recommendations.append("• Good emotional rapport established")
+        recommendations.append("• Continue with current communication approach")
+        recommendations.append("")
+    # Overall assessment
+    recommendations.append("📊 **OVERALL EMOTIONAL ASSESSMENT:**")
+    if facial_analysis:
+        dominant_facial = max(facial_analysis, key=facial_analysis.get)
+        recommendations.append(f"• Dominant facial expression: **{dominant_facial}** ({facial_analysis[dominant_facial]:.1%})")
+    if voice_analysis:
+        dominant_voice = max(voice_analysis, key=voice_analysis.get)
+        recommendations.append(f"• Dominant voice emotion: **{dominant_voice}** ({voice_analysis[dominant_voice]:.1%})")
+    recommendations.append("")
+    recommendations.append("💡 **GENERAL RECOMMENDATIONS:**")
+    recommendations.append("• Monitor patient comfort throughout consultation")
+    recommendations.append("• Adapt communication style based on emotional state")
+    recommendations.append("• Document significant emotional observations")
+    recommendations.append("• Follow up on any concerning emotional indicators")
+    return "\n".join(recommendations)
+def _average_emotion_scores(samples):
+    """Average each emotion's score over a list of emotion -> score dicts ({} if empty)."""
+    if not samples:
+        return {}
+    totals = {}
+    for sample in samples:
+        for emotion, value in sample.items():
+            totals[emotion] = totals.get(emotion, 0) + value
+    # Divide by the number of samples, so an emotion absent from a sample counts as 0 there.
+    return {emotion: total / len(samples) for emotion, total in totals.items()}
+def process_consultation(video_file, audio_file, progress=gr.Progress()):
+    """Run the full analysis for the upload tab and report progress.
+
+    Returns a 4-tuple (timeline figure, face summary figure, voice summary
+    figure, recommendations text). When nothing was uploaded or nothing could
+    be extracted, the three figures are None and the text is a status message.
+    """
+    nothing_uploaded = video_file is None and audio_file is None
+    if nothing_uploaded:
+        return None, None, None, "⚠️ Please upload video and/or audio files to analyze."
+    progress(0.1, desc="Initializing analysis...")
+    progress(0.3, desc="Processing multimedia data...")
+    data = analyzer.process_consultation_data(video_file, audio_file)
+    if not data['timestamp']:
+        return None, None, None, "❌ No valid data could be extracted from the uploaded files."
+    progress(0.6, desc="Creating visualizations...")
+    timeline_fig = create_emotion_timeline(data)
+    summary_figs = create_emotion_summary(data)
+    face_summary, voice_summary = summary_figs
+    progress(0.9, desc="Generating recommendations...")
+    recommendations = generate_clinical_recommendations(data)
+    progress(1.0, desc="Analysis complete!")
+    return (timeline_fig, face_summary, voice_summary, recommendations)
+def real_time_analysis(audio):
+    """Analyse one microphone capture and return a Markdown emotion report.
+
+    Args:
+        audio: ``(sample_rate, samples)`` pair, or None when nothing was
+            recorded. ``samples`` may be 1-D, or 2-D as (samples, channels);
+            multi-channel input is averaged to mono before analysis.
+
+    Returns:
+        Markdown text with one confidence bar per emotion plus at most one
+        alert/status line, or an error message if processing fails.
+    """
+    if audio is None:
+        return "🎤 No audio detected - please speak into the microphone"
+    try:
+        sample_rate, audio_data = audio
+        audio_data = _to_mono_float(audio_data)
+        # Analyze emotions using real model
+        emotions = analyzer.analyze_voice_emotion(audio_data, sample_rate)
+        parts = ["🎵 **Real-time Voice Emotion Analysis:**\n\n"]
+        # Highest-confidence emotion first.
+        for emotion, confidence in sorted(emotions.items(), key=lambda item: item[1], reverse=True):
+            percentage = confidence * 100
+            # 20-cell bar, one cell per 5%; clamp so odd scores cannot break the layout.
+            bar_length = min(20, max(0, int(percentage / 5)))
+            bar = "█" * bar_length + "░" * (20 - bar_length)
+            parts.append(f"**{emotion.title()}**: {percentage:.1f}% `{bar}`\n")
+        # Add clinical alerts (only the first matching condition is reported)
+        parts.append("\n")
+        if emotions.get('angry', 0) > 0.4:
+            parts.append("🚨 **ALERT**: High anger/frustration detected\n")
+        elif emotions.get('fearful', 0) > 0.4:
+            parts.append("⚠️ **ALERT**: High anxiety/fear detected\n")
+        elif emotions.get('sad', 0) > 0.4:
+            parts.append("😢 **ALERT**: Sadness indicators detected\n")
+        elif emotions.get('calm', 0) > 0.6:
+            parts.append("✅ **STATUS**: Patient appears calm and comfortable\n")
+        return "".join(parts)
+    except Exception as e:
+        return f"❌ Error processing audio: {str(e)}\n\nPlease ensure your microphone is working and try again."
+def _to_mono_float(audio_data):
+    """Scale signed-integer PCM to float32 in [-1, 1) and average channels to mono."""
+    samples = np.asarray(audio_data)
+    if np.issubdtype(samples.dtype, np.signedinteger):
+        # Full-scale magnitude of the dtype, e.g. 32768 for int16.
+        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
+        samples = samples.astype(np.float32) / full_scale
+    if samples.ndim > 1:
+        # (samples, channels) -> (samples,): feature extraction expects one channel.
+        samples = samples.mean(axis=1)
+    return samples
+# Create enhanced Gradio interface
+with gr.Blocks(title="Advanced Patient Emotion Analysis System", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🏥 Advanced Patient Emotion Analysis System
+    ### Real AI-Powered Facial & Voice Emotion Recognition
+    This system uses **real deep learning models** to analyze patient emotions during medical consultations:
+    - **Facial Expression Analysis**: 7-emotion CNN model (angry, disgust, fear, happy, neutral, sad, surprise)
+    - **Voice Emotion Recognition**: LSTM-based model analyzing audio features
+    - **Real-time Monitoring**: Live emotion detection during consultations
+    - **Clinical Recommendations**: AI-generated insights for healthcare practitioners
+    🔬 **Technology Stack**: PyTorch, dlib, librosa, computer vision, deep learning
+    """)
+    with gr.Tabs():
+        # Main Analysis Tab
+        with gr.Tab("🎬 Consultation Analysis", elem_id="main-tab"):
+            gr.Markdown("### Upload consultation recordings for comprehensive AI-powered emotion analysis")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    video_input = gr.File(
+                        label="📹 Upload Video Recording",
+                        file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                        type="filepath"
+                    )
+                    audio_input = gr.File(
+                        label="🎵 Upload Audio Recording",
+                        file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
+                        type="filepath"
+                    )
+                    analyze_btn = gr.Button(
+                        "🔍 Analyze with AI Models",
+                        variant="primary",
+                        size="lg",
+                        scale=1
+                    )
+                with gr.Column(scale=2):
+                    recommendations_output = gr.Markdown(
+                        label="🩺 Clinical Recommendations",
+                        value="Upload files and click 'Analyze' to get AI-powered clinical insights..."
+                    )
+            with gr.Row():
+                timeline_plot = gr.Plot(label="📈 Emotion Timeline Analysis", height=500)
+            with gr.Row():
+                with gr.Column():
+                    face_summary_plot = gr.Plot(label="😊 Facial Expression Summary")
+                with gr.Column():
+                    voice_summary_plot = gr.Plot(label="🎤 Voice Emotion Summary")
+            analyze_btn.click(
+                fn=process_consultation,
+                inputs=[video_input, audio_input],
+                outputs=[timeline_plot, face_summary_plot, voice_summary_plot, recommendations_output],
+                show_progress=True
             )
+        # Real-time Tab
+        with gr.Tab("🎙️ Real-time Monitoring"):
+            gr.Markdown("""
+            ### Live voice emotion analysis during consultation
+            *Click the microphone button and speak to see real-time emotion detection*
+            """)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    audio_realtime = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        label="🎤 Live Audio Input",
+                        streaming=False
+                    )
+                with gr.Column(scale=2):
+                    realtime_output = gr.Markdown(
+                        label="📊 Real-time Analysis Results",
+                        value="🎤 **Ready for real-time analysis**\n\nClick the microphone and speak to see live emotion detection using our AI models."
+                    )
+            audio_realtime.change(
+                fn=real_time_analysis,
+                inputs=[audio_realtime],
+                outputs=[realtime_output]
             )
+        # Technical Details Tab
+        with gr.Tab("🔬 Model & Technical Information"):
+            gr.Markdown(f"""
+            ### AI Models & Architecture
+            **Current System Status:**
+            - 🖥️ **Processing Device**: {analyzer.device}
+            - 🧠 **Facial Model**: Custom CNN (7 emotions)
+            - 🎵 **Voice Model**: LSTM-based architecture (6 emotions)
+            - 👁️ **Face Detection**: dlib frontal face detector
+            - 📊 **Audio Features**: MFCC, spectral features, pitch analysis
+            ---