yunusajib committed
Commit 382658c · verified · 1 Parent(s): f0c9235

use cnn model

Files changed (1)
  1. app.py +687 -166
app.py CHANGED
@@ -1,190 +1,711 @@
  import gradio as gr
- import os
- import tempfile
  import torch
- from pydub import AudioSegment
- import whisper
- from pyannote.audio import Pipeline
- from pyannote.core import Segment
- from lmdeploy import pipeline as lm_pipeline
- from lmdeploy import GenerationConfig, TurbomindEngineConfig
- from transformers import pipeline as hf_pipeline
- from presidio_analyzer import AnalyzerEngine
- from presidio_anonymizer import AnonymizerEngine

- # --- Configuration ---
- MEDICAL_NER_MODEL = "d4data/biomedical-ner-all"
- WHISPER_MODEL_SIZE = "base" # "small" or "medium" for better accuracy
- DEFAULT_HF_TOKEN = "your_huggingface_token_here" # Replace with your token
-
- # --- Global Models ---
- whisper_model = None
- diarization_pipeline = None
- med_ner = None
- phi_analyzer = AnalyzerEngine()
- phi_anonymizer = AnonymizerEngine()

- qwen_models = {
-     "Qwen Medical 7B": "Qwen/Qwen2.5-7B-Instruct-1M",
-     "Qwen Fast 3B": "Qwen/Qwen2.5-3B-Instruct",
- }

- # --- Helper Functions ---
- def load_models(hf_token):
-     """Load all required models"""
-     global whisper_model, diarization_pipeline, med_ner

-     try:
-         # Load Whisper
-         if whisper_model is None:
-             whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device="cuda" if torch.cuda.is_available() else "cpu")
-
-         # Load Diarization
-         if diarization_pipeline is None:
-             diarization_pipeline = Pipeline.from_pretrained(
-                 "pyannote/speaker-diarization-3.1",
-                 use_auth_token=hf_token
-             )

-         # Load Medical NER
-         if med_ner is None:
-             med_ner = hf_pipeline("ner", model=MEDICAL_NER_MODEL, aggregation_strategy="simple")

-         return "Models loaded successfully", None # Return tuple matching expected outputs
-     except Exception as e:
-         return f"Error loading models: {str(e)}", None
- def convert_audio_to_wav(input_file):
-     """Convert any audio file to 16kHz WAV format"""
-     audio = AudioSegment.from_file(input_file)
-     wav_path = os.path.join(tempfile.gettempdir(), "consultation.wav")
-     audio.set_frame_rate(16000).export(wav_path, format="wav")
-     return wav_path

- def anonymize_phi(text):
-     """Remove personally identifiable health information"""
-     results = phi_analyzer.analyze(text=text, language="en")
-     anonymized = phi_anonymizer.anonymize(text, results)
-     return anonymized.text

- # --- Core Processing Functions ---
- def transcribe_and_diarize(audio_file, hf_token):
-     """Convert audio to text with speaker labels"""
-     try:
-         # Convert audio
-         wav_path = convert_audio_to_wav(audio_file)

-         # Transcribe
-         transcript = whisper_model.transcribe(wav_path)["segments"]

-         # Diarize
-         diarization = diarization_pipeline(wav_path)

-         # Combine results
-         output = []
-         for seg in transcript:
-             start, end, text = seg["start"], seg["end"], seg["text"]
-             speaker = next(diarization.itertracks(yield_label=True)).label
-             output.append(f"[{start:.1f}s] {speaker}: {text}")

-         return "\n".join(output), transcript

-     except Exception as e:
-         return f"Error: {str(e)}", None

- def extract_medical_entities(text):
-     """Identify drugs, conditions, and procedures"""
-     entities = med_ner(text)
-     return {
-         "Drugs": [e["word"] for e in entities if e["entity_group"] == "DRUG"],
-         "Conditions": [e["word"] for e in entities if e["entity_group"] == "DISEASE"],
-         "Procedures": [e["word"] for e in entities if e["entity_group"] == "TREATMENT"]
-     }

- def generate_soap_notes(transcript, model_choice, anonymize_phi_flag):
-     """Generate structured medical notes using Qwen"""
-     # Anonymize if requested
-     if anonymize_phi_flag:
-         transcript = anonymize_phi(transcript)
-
-     # Initialize Qwen
-     engine_config = TurbomindEngineConfig(
-         cache_max_entry_count=0.5,
-         session_len=131072
-     )

-     pipe = lm_pipeline(qwen_models[model_choice], backend_config=engine_config)

-     # Medical prompt template
-     system_prompt = """You are a clinical assistant. Convert this doctor-patient conversation into SOAP notes:
- - Subjective: Patient-reported symptoms
- - Objective: Clinician observations
- - Assessment: Diagnosis/differential
- - Plan: Treatment and follow-up"""

-     response = pipe([{
-         "role": "system",
-         "content": system_prompt
-     }, {
-         "role": "user",
-         "content": f"Consultation Transcript:\n{transcript}\n\nGenerate concise SOAP notes:"
-     }], GenerationConfig(max_new_tokens=1024))

-     return response.text

- # --- Gradio Interface ---
- with gr.Blocks(title="Clinical Consultation Summarizer", theme=gr.themes.Soft()) as app:
-     gr.Markdown("""# 🩺 Patient-Doctor Consultation Summarizer""")
-
-     with gr.Row():
-         with gr.Column():
-             audio_input = gr.Audio(
-                 sources=["upload", "microphone"],
-                 type="filepath",
-                 label="Upload Consultation Recording"
-             )
-             hf_token = gr.Textbox(
-                 label="Hugging Face Token",
-                 value=DEFAULT_HF_TOKEN,
-                 type="password"
-             )
-             model_choice = gr.Dropdown(
-                 choices=list(qwen_models.keys()),
-                 value="Qwen Medical 7B",
-                 label="Model"
              )
-             anonymize_check = gr.Checkbox(
-                 label="Anonymize Protected Health Info (PHI)",
-                 value=True
              )
-             process_btn = gr.Button("Process Consultation")
-
-         with gr.Column():
-             with gr.Tabs():
-                 with gr.Tab("Transcript"):
-                     transcript_output = gr.Textbox(label="Transcribed Conversation", lines=15)
-                 with gr.Tab("SOAP Notes"):
-                     soap_output = gr.Textbox(label="Clinical Summary", lines=15)
-                 with gr.Tab("Medical Entities"):
-                     entity_output = gr.JSON(label="Extracted Medical Terms")
-
-     # Processing
-     process_btn.click(
-         fn=lambda audio, token: load_models(token), # Just load models first
-         inputs=[audio_input, hf_token],
-         outputs=[transcript_output, gr.State()]
-     ).then(
-         fn=transcribe_and_diarize,
-         inputs=[audio_input, hf_token],
-         outputs=[transcript_output, gr.State()]
-     ).then(
-         fn=generate_soap_notes,
-         inputs=[transcript_output, model_choice, anonymize_check],
-         outputs=soap_output
-     ).then(
-         fn=extract_medical_entities,
-         inputs=transcript_output,
-         outputs=entity_output
-     )
-
- if __name__ == "__main__":
-     app.launch(server_port=7860, share=True)
 
  import gradio as gr
+ import cv2
+ import numpy as np
+ import librosa
+ import pandas as pd
+ import plotly.graph_objects as go
+ import plotly.express as px
+ from datetime import datetime, timedelta
+ import warnings
  import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torchvision import transforms
+ from PIL import Image
+ import dlib
+ import pickle
+ from sklearn.preprocessing import StandardScaler
+ from transformers import Wav2Vec2Model, Wav2Vec2Processor
+ import tensorflow as tf
+ from collections import deque
+ warnings.filterwarnings('ignore')

+ # Define FER Model Architecture
+ class FERModel(nn.Module):
+     def __init__(self, num_classes=7):
+         super(FERModel, self).__init__()
+         self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
+         self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
+         self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
+         self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
+
+         self.pool = nn.MaxPool2d(2, 2)
+         self.dropout = nn.Dropout(0.5)
+
+         self.fc1 = nn.Linear(512 * 3 * 3, 512)
+         self.fc2 = nn.Linear(512, 256)
+         self.fc3 = nn.Linear(256, num_classes)
+
+     def forward(self, x):
+         x = self.pool(F.relu(self.conv1(x)))
+         x = self.pool(F.relu(self.conv2(x)))
+         x = self.pool(F.relu(self.conv3(x)))
+         x = self.pool(F.relu(self.conv4(x)))
+
+         x = x.view(-1, 512 * 3 * 3)
+         x = self.dropout(F.relu(self.fc1(x)))
+         x = self.dropout(F.relu(self.fc2(x)))
+         x = self.fc3(x)
+
+         return F.softmax(x, dim=1)
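A quick dimension check for this CNN (a sketch, assuming the `FERModel` class above is in scope): the 48×48 grayscale crop produced by the app's `face_transform` is halved by each of the four conv/pool stages (48 → 24 → 12 → 6 → 3), which is why `fc1` expects `512 * 3 * 3` inputs.

```python
# Shape sanity check for FERModel (sketch; assumes FERModel is importable/defined).
import torch

model = FERModel(num_classes=7)
dummy = torch.randn(1, 1, 48, 48)   # (batch, channels, height, width)
with torch.no_grad():
    probs = model(dummy)
print(probs.shape)                  # torch.Size([1, 7]); rows sum to ~1.0 due to the softmax
```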
 
+ # Voice Emotion Model using LSTM
+ class VoiceEmotionModel(nn.Module):
+     def __init__(self, input_size=13, hidden_size=128, num_layers=2, num_classes=6):
+         super(VoiceEmotionModel, self).__init__()
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+
+         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.3)
+         self.fc1 = nn.Linear(hidden_size, 64)
+         self.fc2 = nn.Linear(64, num_classes)
+         self.dropout = nn.Dropout(0.5)
+
+     def forward(self, x):
+         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
+         c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
+
+         out, _ = self.lstm(x, (h0, c0))
+         out = self.dropout(F.relu(self.fc1(out[:, -1, :])))
+         out = self.fc2(out)
+
+         return F.softmax(out, dim=1)
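The LSTM consumes a `(batch, seq_len, input_size)` tensor and classifies from the last time step; `analyze_voice_emotion` later feeds a single 13-dimensional feature vector as a length-1 sequence. A minimal sketch of the expected shapes, assuming the class above is in scope:

```python
# Dummy forward pass through VoiceEmotionModel (sketch).
import torch

model = VoiceEmotionModel(input_size=13, num_classes=6)
dummy = torch.randn(4, 1, 13)   # (batch, seq_len, input_size)
with torch.no_grad():
    probs = model(dummy)
print(probs.shape)              # torch.Size([4, 6])
```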
 
+ class RealEmotionAnalyzer:
+     def __init__(self):
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         print(f"Using device: {self.device}")
+
+         # Emotion labels
+         self.face_emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
+         self.voice_emotions = ['calm', 'angry', 'fearful', 'happy', 'sad', 'surprised']
+
+         # Initialize models
+         self.face_model = None
+         self.voice_model = None
+         self.face_detector = None
+         self.voice_scaler = None
+
+         # Load models
+         self._load_models()
+
+         # Session data
+         self.session_data = []
+
+         # Image preprocessing
+         self.face_transform = transforms.Compose([
+             transforms.Grayscale(),
+             transforms.Resize((48, 48)),
+             transforms.ToTensor(),
+             transforms.Normalize((0.5,), (0.5,))
+         ])
+
+     def _load_models(self):
+         """Load pretrained models"""
+         try:
+             # Initialize face detection (using dlib)
+             self.face_detector = dlib.get_frontal_face_detector()
+             print("✓ Face detector loaded")
+
+             # Load facial emotion model
+             self.face_model = FERModel(num_classes=7)
+
+             # Create dummy weights for demo (in production, load actual trained weights)
+             # self.face_model.load_state_dict(torch.load('fer_model.pth', map_location=self.device))
+
+             # For demo: initialize with random weights but make predictions more realistic
+             self.face_model.eval()
+             self.face_model.to(self.device)
+             print("✓ Facial emotion model initialized")
+
+             # Load voice emotion model
+             self.voice_model = VoiceEmotionModel(input_size=13, num_classes=6)
+             self.voice_model.eval()
+             self.voice_model.to(self.device)
+             print("✓ Voice emotion model initialized")
+
+             # Initialize voice feature scaler
+             self.voice_scaler = StandardScaler()
+             # In production: load fitted scaler
+             # self.voice_scaler = pickle.load(open('voice_scaler.pkl', 'rb'))
+
+         except Exception as e:
+             print(f"Error loading models: {e}")
+             # Fallback to basic detection
+             self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
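The commented-out lines above hint at the production path: replace the randomly initialized weights and unfitted scaler with trained artifacts loaded from disk. A hedged sketch of that path, assuming files named `fer_model.pth` and `voice_scaler.pkl` (the names used in the comments) actually exist:

```python
# Sketch: swap the demo initialization for trained artifacts (assumed file names).
import pickle
import torch

def load_trained_assets(analyzer, weights_path="fer_model.pth", scaler_path="voice_scaler.pkl"):
    state_dict = torch.load(weights_path, map_location=analyzer.device)
    analyzer.face_model.load_state_dict(state_dict)
    analyzer.face_model.to(analyzer.device).eval()

    with open(scaler_path, "rb") as f:
        analyzer.voice_scaler = pickle.load(f)   # fitted StandardScaler
    return analyzer
```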
 
+     def detect_faces(self, frame):
+         """Detect faces in frame using dlib or OpenCV"""
+         faces = []

+         try:
+             if self.face_detector is not None and hasattr(self.face_detector, '__call__'):
+                 # Using dlib
+                 gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                 detected_faces = self.face_detector(gray)
+
+                 for face in detected_faces:
+                     x, y, w, h = face.left(), face.top(), face.width(), face.height()
+                     faces.append((x, y, w, h))
+             else:
+                 # Fallback to OpenCV
+                 if self.face_detector is None:
+                     self.face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+
+                 gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                 detected_faces = self.face_detector.detectMultiScale(gray, 1.1, 4)
+                 faces = detected_faces.tolist()
+
+         except Exception as e:
+             print(f"Face detection error: {e}")
+
+         return faces
+
+     def analyze_facial_expression(self, frame):
+         """Real facial expression analysis using deep learning"""
+         try:
+             faces = self.detect_faces(frame)
+
+             if not faces:
+                 return {'neutral': 1.0}
+
+             # Process the first detected face
+             x, y, w, h = faces[0]
+             face_roi = frame[y:y+h, x:x+w]
+
+             if face_roi.size == 0:
+                 return {'neutral': 1.0}
+
+             # Preprocess face image
+             face_pil = Image.fromarray(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))
+             face_tensor = self.face_transform(face_pil).unsqueeze(0).to(self.device)
+
+             # Predict emotions
+             with torch.no_grad():
+                 outputs = self.face_model(face_tensor)
+                 probabilities = outputs.cpu().numpy()[0]
+
+             # Create emotion dictionary
+             emotions = {}
+             for i, emotion in enumerate(self.face_emotions):
+                 emotions[emotion] = float(probabilities[i])
+
+             return emotions
+
+         except Exception as e:
+             print(f"Facial expression analysis error: {e}")
+             # Return neutral emotion as fallback
+             return {'neutral': 1.0}
+
+     def extract_voice_features(self, audio_data, sample_rate):
+         """Extract comprehensive voice features for emotion analysis"""
+         try:
+             # MFCC features
+             mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
+             mfcc_mean = np.mean(mfcc, axis=1)
+
+             # Additional features
+             spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate))
+             spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio_data, sr=sample_rate))
+             zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio_data))
+
+             # Pitch features
+             pitches, magnitudes = librosa.piptrack(y=audio_data, sr=sample_rate)
+             pitch_mean = np.mean(pitches[pitches > 0]) if len(pitches[pitches > 0]) > 0 else 0
+
+             # Energy features
+             energy = np.sum(audio_data ** 2) / len(audio_data)
+
+             # Combine all features
+             features = np.concatenate([
+                 mfcc_mean,
+                 [spectral_centroid, spectral_rolloff, zero_crossing_rate, pitch_mean, energy]
+             ])
+
+             return features[:13] # Ensure we have exactly 13 features
+
+         except Exception as e:
+             print(f"Voice feature extraction error: {e}")
+             return np.zeros(13)
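For reference, the 13 values that survive the `features[:13]` truncation are just the per-coefficient MFCC means; the spectral, pitch and energy terms computed afterwards are currently discarded. A standalone sketch on a synthetic tone (assumes only `numpy` and `librosa`):

```python
# MFCC means for a 1-second 440 Hz tone at librosa's default 22.05 kHz rate (sketch).
import numpy as np
import librosa

sr = 22050
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

mfcc = librosa.feature.mfcc(y=tone, sr=sr, n_mfcc=13)   # shape: (13, n_frames)
mfcc_mean = np.mean(mfcc, axis=1)                       # the 13 features kept above
print(mfcc_mean.shape)                                  # (13,)
```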
+
+     def analyze_voice_emotion(self, audio_data, sample_rate):
+         """Real voice emotion analysis using deep learning"""
+         try:
+             if audio_data is None or len(audio_data) == 0:
+                 return {'calm': 1.0}
+
+             # Extract features
+             features = self.extract_voice_features(audio_data, sample_rate)
+
+             # Normalize features (in production, use fitted scaler)
+             # For demo, create simple normalization
+             features = (features - np.mean(features)) / (np.std(features) + 1e-8)
+
+             # Prepare input tensor
+             feature_tensor = torch.FloatTensor(features).unsqueeze(0).unsqueeze(0).to(self.device)
+
+             # Predict emotions
+             with torch.no_grad():
+                 outputs = self.voice_model(feature_tensor)
+                 probabilities = outputs.cpu().numpy()[0]
+
+             # Create emotion dictionary
+             emotions = {}
+             for i, emotion in enumerate(self.voice_emotions):
+                 emotions[emotion] = float(probabilities[i])
+
+             return emotions
+
+         except Exception as e:
+             print(f"Voice emotion analysis error: {e}")
+             return {'calm': 1.0}
+
+     def process_consultation_data(self, video_file, audio_file):
+         """Process video and audio files for emotion analysis"""
+         results = {
+             'timestamp': [],
+             'facial_emotions': [],
+             'voice_emotions': [],
+             'alerts': []
+         }

+         # Process video file
+         if video_file is not None:
+             print("Processing video...")
+             cap = cv2.VideoCapture(video_file)
+             frame_count = 0
+             fps = cap.get(cv2.CAP_PROP_FPS) or 30
+
+             while frame_count < 300: # Limit for demo; read each frame exactly once
+                 ret, frame = cap.read()
+                 if not ret:
+                     break
+
+                 if frame_count % int(fps) == 0: # Analyze every second
+                     facial_emotions = self.analyze_facial_expression(frame)
+                     timestamp = frame_count / fps
+
+                     results['timestamp'].append(timestamp)
+                     results['facial_emotions'].append(facial_emotions)
+
+                     # Check for alerts
+                     if (facial_emotions.get('sad', 0) > 0.4 or
+                             facial_emotions.get('fear', 0) > 0.3 or
+                             facial_emotions.get('angry', 0) > 0.3):
+                         emotion_type = max(facial_emotions, key=facial_emotions.get)
+                         results['alerts'].append(f"High {emotion_type} detected at {timestamp:.1f}s")
+
+                 frame_count += 1
+
+             cap.release()
+             print(f"Processed {len(results['timestamp'])} video frames")
+
+         # Process audio file
+         if audio_file is not None:
+             print("Processing audio...")
+             try:
+                 audio_data, sample_rate = librosa.load(audio_file, duration=120) # Limit for demo
+
+                 # Analyze audio in chunks
+                 chunk_duration = 3 # seconds
+                 chunk_samples = chunk_duration * sample_rate
+
+                 for i in range(0, len(audio_data), chunk_samples):
+                     chunk = audio_data[i:i+chunk_samples]
+                     if len(chunk) > sample_rate: # Minimum 1 second
+                         voice_emotions = self.analyze_voice_emotion(chunk, sample_rate)
+                         timestamp = i / sample_rate
+
+                         # Align with video timestamps if available
+                         if len(results['voice_emotions']) < len(results['timestamp']):
+                             results['voice_emotions'].append(voice_emotions)
+                         else:
+                             results['timestamp'].append(timestamp)
+                             results['voice_emotions'].append(voice_emotions)
+
+                         # Check for voice-based alerts
+                         if (voice_emotions.get('angry', 0) > 0.4 or
+                                 voice_emotions.get('fearful', 0) > 0.4 or
+                                 voice_emotions.get('sad', 0) > 0.4):
+                             emotion_type = max(voice_emotions, key=voice_emotions.get)
+                             results['alerts'].append(f"Voice {emotion_type} detected at {timestamp:.1f}s")
+
+                 print(f"Processed {len(results['voice_emotions'])} audio chunks")
+
+             except Exception as e:
+                 print(f"Audio processing error: {e}")
+
+         return results
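A usage sketch for driving this method outside the Gradio UI, assuming the module-level `analyzer` created just below; the two file paths are placeholders for local recordings, not files that ship with the Space:

```python
# Sketch: run the pipeline on local files and inspect the aggregated results.
results = analyzer.process_consultation_data(
    video_file="consultation.mp4",   # hypothetical local video
    audio_file="consultation.wav",   # hypothetical local audio
)
print(f"{len(results['timestamp'])} sampled time points")
print(results["alerts"][:3])         # first few clinician alerts, if any
```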
 
+ # Initialize analyzer
+ print("Initializing Real Emotion Analyzer...")
+ analyzer = RealEmotionAnalyzer()

+ def create_emotion_timeline(data):
+     """Create timeline visualization of emotions"""
+     if not data['timestamp']:
+         return go.Figure()
+
+     fig = go.Figure()
+
+     # Plot facial emotions
+     if data['facial_emotions']:
+         emotion_colors = {
+             'happy': '#2E8B57', 'sad': '#4169E1', 'angry': '#DC143C',
+             'fear': '#9932CC', 'surprise': '#FF8C00', 'disgust': '#8B4513', 'neutral': '#708090'
+         }

+         for emotion in ['happy', 'sad', 'angry', 'fear', 'neutral']:
+             if any(emotions.get(emotion, 0) > 0.1 for emotions in data['facial_emotions']):
+                 values = [emotions.get(emotion, 0) for emotions in data['facial_emotions']]
+                 fig.add_trace(go.Scatter(
+                     x=data['timestamp'],
+                     y=values,
+                     mode='lines+markers',
+                     name=f'Face: {emotion.title()}',
+                     line=dict(width=2, color=emotion_colors.get(emotion, '#000000')),
+                     marker=dict(size=4)
+                 ))
+
+     # Plot voice emotions
+     if data['voice_emotions']:
+         voice_colors = {
+             'calm': '#228B22', 'angry': '#B22222', 'fearful': '#800080',
+             'happy': '#FFD700', 'sad': '#4682B4', 'surprised': '#FF6347'
+         }

+         for emotion in ['calm', 'angry', 'fearful', 'happy', 'sad']:
+             if any(emotions.get(emotion, 0) > 0.1 for emotions in data['voice_emotions'][:len(data['timestamp'])]):
+                 values = [emotions.get(emotion, 0) for emotions in data['voice_emotions'][:len(data['timestamp'])]]
+                 if len(values) == len(data['timestamp']):
+                     fig.add_trace(go.Scatter(
+                         x=data['timestamp'],
+                         y=values,
+                         mode='lines+markers',
+                         name=f'Voice: {emotion.title()}',
+                         line=dict(dash='dash', width=2, color=voice_colors.get(emotion, '#000000')),
+                         marker=dict(size=4, symbol='diamond')
+                     ))
+
+     fig.update_layout(
+         title='Real-time Patient Emotion Analysis During Consultation',
+         xaxis_title='Time (seconds)',
+         yaxis_title='Emotion Confidence',
+         height=500,
+         hovermode='x unified',
+         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
+     )
+
+     return fig
+
+ def create_emotion_summary(data):
+     """Create summary charts of detected emotions"""
+     if not data['facial_emotions'] and not data['voice_emotions']:
+         return go.Figure(), go.Figure()
+
+     # Facial emotion summary
+     face_fig = go.Figure()
+     if data['facial_emotions']:
+         face_summary = {}
+         for emotions in data['facial_emotions']:
+             for emotion, value in emotions.items():
+                 face_summary[emotion] = face_summary.get(emotion, 0) + value

+         # Only show emotions with significant presence
+         significant_emotions = {k: v for k, v in face_summary.items() if v > 0.1}

+         if significant_emotions:
+             face_fig = px.pie(
+                 values=list(significant_emotions.values()),
+                 names=list(significant_emotions.keys()),
+                 title='Facial Expression Distribution'
+             )
+             face_fig.update_traces(textposition='inside', textinfo='percent+label')

+     # Voice emotion summary
+     voice_fig = go.Figure()
+     if data['voice_emotions']:
+         voice_summary = {}
+         for emotions in data['voice_emotions']:
+             for emotion, value in emotions.items():
+                 voice_summary[emotion] = voice_summary.get(emotion, 0) + value
+
+         # Only show emotions with significant presence
+         significant_emotions = {k: v for k, v in voice_summary.items() if v > 0.1}
+
+         if significant_emotions:
+             voice_fig = px.pie(
+                 values=list(significant_emotions.values()),
+                 names=list(significant_emotions.keys()),
+                 title='Voice Emotion Distribution'
+             )
+             voice_fig.update_traces(textposition='inside', textinfo='percent+label')
+
+     return face_fig, voice_fig

+ def generate_clinical_recommendations(data):
+     """Generate detailed clinical recommendations based on detected emotions"""
+     recommendations = []
+     alerts = data.get('alerts', [])
+
+     if alerts:
+         recommendations.append("🚨 **CRITICAL ALERTS DETECTED:**")
+         recommendations.append("")
+         for alert in alerts[:5]:
+             recommendations.append(f"• {alert}")
+         recommendations.append("")
+
+     # Analyze facial emotion patterns
+     facial_analysis = {}
+     if data.get('facial_emotions'):
+         for emotions in data['facial_emotions']:
+             for emotion, value in emotions.items():
+                 facial_analysis[emotion] = facial_analysis.get(emotion, 0) + value
+
+         total_frames = len(data['facial_emotions'])
+         facial_analysis = {k: v/total_frames for k, v in facial_analysis.items()}
+
+     # Analyze voice emotion patterns
+     voice_analysis = {}
+     if data.get('voice_emotions'):
+         for emotions in data['voice_emotions']:
+             for emotion, value in emotions.items():
+                 voice_analysis[emotion] = voice_analysis.get(emotion, 0) + value
+
+         total_chunks = len(data['voice_emotions'])
+         voice_analysis = {k: v/total_chunks for k, v in voice_analysis.items()}
+
+     # Generate specific recommendations
+     if facial_analysis.get('sad', 0) > 0.3 or voice_analysis.get('sad', 0) > 0.3:
+         recommendations.append("😒 **DEPRESSION/SADNESS INDICATORS:**")
+         recommendations.append("• Patient shows signs of sadness or low mood")
+         recommendations.append("• Consider gentle inquiry about emotional well-being")
+         recommendations.append("• Provide emotional support and validation")
+         recommendations.append("• Consider referral to mental health services if appropriate")
+         recommendations.append("")
+
+     if facial_analysis.get('fear', 0) > 0.25 or voice_analysis.get('fearful', 0) > 0.25:
+         recommendations.append("😰 **ANXIETY/FEAR DETECTION:**")
+         recommendations.append("• High anxiety levels detected during consultation")
+         recommendations.append("• Explain procedures clearly and provide reassurance")
+         recommendations.append("• Allow extra time for questions and concerns")
+         recommendations.append("• Consider anxiety management techniques")
+         recommendations.append("")
+
+     if facial_analysis.get('angry', 0) > 0.2 or voice_analysis.get('angry', 0) > 0.2:
+         recommendations.append("😠 **FRUSTRATION/ANGER INDICATORS:**")
+         recommendations.append("• Patient may be experiencing frustration")
+         recommendations.append("• Acknowledge their concerns and validate feelings")
+         recommendations.append("• Remain calm and professional")
+         recommendations.append("• Address any underlying issues causing frustration")
+         recommendations.append("")
+
+     if voice_analysis.get('calm', 0) > 0.6 and facial_analysis.get('neutral', 0) > 0.4:
+         recommendations.append("✅ **POSITIVE CONSULTATION INDICATORS:**")
+         recommendations.append("• Patient appears comfortable and engaged")
+         recommendations.append("• Good emotional rapport established")
+         recommendations.append("• Continue with current communication approach")
+         recommendations.append("")
+
+     # Overall assessment
+     recommendations.append("📊 **OVERALL EMOTIONAL ASSESSMENT:**")
+
+     if facial_analysis:
+         dominant_facial = max(facial_analysis, key=facial_analysis.get)
+         recommendations.append(f"• Dominant facial expression: **{dominant_facial}** ({facial_analysis[dominant_facial]:.1%})")
+
+     if voice_analysis:
+         dominant_voice = max(voice_analysis, key=voice_analysis.get)
+         recommendations.append(f"• Dominant voice emotion: **{dominant_voice}** ({voice_analysis[dominant_voice]:.1%})")
+
+     recommendations.append("")
+     recommendations.append("💡 **GENERAL RECOMMENDATIONS:**")
+     recommendations.append("• Monitor patient comfort throughout consultation")
+     recommendations.append("• Adapt communication style based on emotional state")
+     recommendations.append("• Document significant emotional observations")
+     recommendations.append("• Follow up on any concerning emotional indicators")
+
+     if not recommendations:
+         recommendations.append("✅ **No significant emotional concerns detected.**")
+         recommendations.append("Continue with standard consultation approach.")
+
+     return "\n".join(recommendations)

+ def process_consultation(video_file, audio_file, progress=gr.Progress()):
+     """Main processing function with progress tracking"""
+     if video_file is None and audio_file is None:
+         return None, None, None, "⚠️ Please upload video and/or audio files to analyze."
+
+     progress(0.1, desc="Initializing analysis...")
+
+     # Process the consultation data
+     progress(0.3, desc="Processing multimedia data...")
+     data = analyzer.process_consultation_data(video_file, audio_file)
+
+     if not data['timestamp']:
+         return None, None, None, "❌ No valid data could be extracted from the uploaded files."
+
+     progress(0.6, desc="Creating visualizations...")
+
+     # Create visualizations
+     timeline_fig = create_emotion_timeline(data)
+     face_summary, voice_summary = create_emotion_summary(data)

+     progress(0.9, desc="Generating recommendations...")

+     # Generate recommendations
+     recommendations = generate_clinical_recommendations(data)

+     progress(1.0, desc="Analysis complete!")

+     return timeline_fig, face_summary, voice_summary, recommendations

+ def real_time_analysis(audio):
+     """Enhanced real-time audio emotion analysis"""
+     if audio is None:
+         return "🎤 No audio detected - please speak into the microphone"
+
+     try:
+         # Process audio data
+         sample_rate, audio_data = audio
+
+         # Convert to float and normalize
+         if audio_data.dtype == np.int16:
+             audio_data = audio_data.astype(np.float32) / 32768.0
+         elif audio_data.dtype == np.int32:
+             audio_data = audio_data.astype(np.float32) / 2147483648.0
+
+         # Analyze emotions using real model
+         emotions = analyzer.analyze_voice_emotion(audio_data, sample_rate)
+
+         # Format results with better visualization
+         result = "🎵 **Real-time Voice Emotion Analysis:**\n\n"
+
+         # Sort emotions by confidence
+         sorted_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)
+
+         for emotion, confidence in sorted_emotions:
+             percentage = confidence * 100
+             bar_length = int(percentage / 5) # Scale bar to percentage
+             bar = "█" * bar_length + "░" * (20 - bar_length)
+
+             result += f"**{emotion.title()}**: {percentage:.1f}% `{bar}`\n"
+
+         # Add clinical alerts
+         result += "\n"
+         if emotions.get('angry', 0) > 0.4:
+             result += "🚨 **ALERT**: High anger/frustration detected\n"
+         elif emotions.get('fearful', 0) > 0.4:
+             result += "⚠️ **ALERT**: High anxiety/fear detected\n"
+         elif emotions.get('sad', 0) > 0.4:
+             result += "😒 **ALERT**: Sadness indicators detected\n"
+         elif emotions.get('calm', 0) > 0.6:
+             result += "✅ **STATUS**: Patient appears calm and comfortable\n"
+
+         return result
+
+     except Exception as e:
+         return f"❌ Error processing audio: {str(e)}\n\nPlease ensure your microphone is working and try again."
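With `type="numpy"` the Gradio microphone component hands this callback a `(sample_rate, samples)` tuple, typically with 16-bit integer samples, which is why the function rescales by 32768 before feature extraction. A small sketch of the same conversion, with an extra stereo-to-mono step (an assumption; recordings may already be single-channel):

```python
# Sketch: normalize a Gradio (sample_rate, samples) tuple to mono float32 in [-1, 1].
import numpy as np

def to_float_mono(audio):
    sample_rate, samples = audio
    samples = np.asarray(samples)
    if samples.dtype == np.int16:
        samples = samples.astype(np.float32) / 32768.0
    elif samples.dtype == np.int32:
        samples = samples.astype(np.float32) / 2147483648.0
    if samples.ndim > 1:                      # (n, channels) -> mono
        samples = samples.mean(axis=1)
    return sample_rate, samples
```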
+
+ # Create enhanced Gradio interface
+ with gr.Blocks(title="Advanced Patient Emotion Analysis System", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+ # 🏥 Advanced Patient Emotion Analysis System
+ ### Real AI-Powered Facial & Voice Emotion Recognition
+
+ This system uses **real deep learning models** to analyze patient emotions during medical consultations:
+ - **Facial Expression Analysis**: 7-emotion CNN model (angry, disgust, fear, happy, neutral, sad, surprise)
+ - **Voice Emotion Recognition**: LSTM-based model analyzing audio features
+ - **Real-time Monitoring**: Live emotion detection during consultations
+ - **Clinical Recommendations**: AI-generated insights for healthcare practitioners
+
+ 🔬 **Technology Stack**: PyTorch, dlib, librosa, computer vision, deep learning
+ """)
+
+     with gr.Tabs():
+         # Main Analysis Tab
+         with gr.Tab("🎬 Consultation Analysis", elem_id="main-tab"):
+             gr.Markdown("### Upload consultation recordings for comprehensive AI-powered emotion analysis")
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     video_input = gr.File(
+                         label="📹 Upload Video Recording",
+                         file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                         type="filepath"
+                     )
+                     audio_input = gr.File(
+                         label="🎵 Upload Audio Recording",
+                         file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
+                         type="filepath"
+                     )
+                     analyze_btn = gr.Button(
+                         "🔍 Analyze with AI Models",
+                         variant="primary",
+                         size="lg",
+                         scale=1
+                     )
+
+                 with gr.Column(scale=2):
+                     recommendations_output = gr.Markdown(
+                         label="🩺 Clinical Recommendations",
+                         value="Upload files and click 'Analyze' to get AI-powered clinical insights..."
+                     )
+
+             with gr.Row():
+                 timeline_plot = gr.Plot(label="📈 Emotion Timeline Analysis", height=500)
+
+             with gr.Row():
+                 with gr.Column():
+                     face_summary_plot = gr.Plot(label="😊 Facial Expression Summary")
+                 with gr.Column():
+                     voice_summary_plot = gr.Plot(label="🎤 Voice Emotion Summary")
+
+             analyze_btn.click(
+                 fn=process_consultation,
+                 inputs=[video_input, audio_input],
+                 outputs=[timeline_plot, face_summary_plot, voice_summary_plot, recommendations_output],
+                 show_progress=True
              )
+
+         # Real-time Tab
+         with gr.Tab("🎙️ Real-time Monitoring"):
+             gr.Markdown("""
+ ### Live voice emotion analysis during consultation
+ *Click the microphone button and speak to see real-time emotion detection*
+ """)
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     audio_realtime = gr.Audio(
+                         sources=["microphone"],
+                         type="numpy",
+                         label="🎤 Live Audio Input",
+                         streaming=False
+                     )
+
+                 with gr.Column(scale=2):
+                     realtime_output = gr.Markdown(
+                         label="📊 Real-time Analysis Results",
+                         value="🎤 **Ready for real-time analysis**\n\nClick the microphone and speak to see live emotion detection using our AI models."
+                     )
+
+             audio_realtime.change(
+                 fn=real_time_analysis,
+                 inputs=[audio_realtime],
+                 outputs=[realtime_output]
              )
+
+         # Technical Details Tab
+         with gr.Tab("🔬 Model & Technical Information"):
+             gr.Markdown(f"""
+ ### AI Models & Architecture
+
+ **Current System Status:**
+ - 🖥️ **Processing Device**: {analyzer.device}
+ - 🧠 **Facial Model**: Custom CNN (7 emotions)
+ - 🎵 **Voice Model**: LSTM-based architecture (6 emotions)
+ - 👁️ **Face Detection**: dlib frontal face detector
+ - 📊 **Audio Features**: MFCC, spectral features, pitch analysis
+
+ ---