yunusajib committed
Commit d287980 · verified · 1 Parent(s): fa1b021
Files changed (1):
  1. app.py +147 -459
app.py CHANGED
@@ -9,50 +9,50 @@ import queue
 import time
 from collections import deque
 import warnings
 warnings.filterwarnings("ignore")

-# Try to import OpenCV with fallback
-try:
-    import cv2
-    CV2_AVAILABLE = True
-except ImportError:
-    CV2_AVAILABLE = False
-    print("OpenCV not available - using PIL for image processing")
-
-# Try to import librosa with fallback
 try:
     import librosa
     LIBROSA_AVAILABLE = True
 except ImportError:
     LIBROSA_AVAILABLE = False
     print("Librosa not available - using basic audio processing")

-# Try to import transformers and torch, with fallbacks
 try:
-    from transformers import pipeline
-    import torch
-    HF_AVAILABLE = True
 except ImportError:
-    HF_AVAILABLE = False
-    print("Transformers not available - using mock emotion detection")

-# Additional imports for image processing if OpenCV fails
 try:
     from PIL import Image, ImageDraw, ImageFont
     PIL_AVAILABLE = True
 except ImportError:
     PIL_AVAILABLE = False

 class EmotionRecognitionSystem:
     def __init__(self):
-        self.emotion_history = deque(maxlen=100)  # Store last 100 emotion readings
         self.audio_queue = queue.Queue()
         self.video_queue = queue.Queue()
-
-        # Initialize emotion detection models
         self.setup_models()

-        # Emotion thresholds for alerts
         self.alert_thresholds = {
             'stress': 0.7,
             'anxiety': 0.6,
@@ -61,31 +61,77 @@ class EmotionRecognitionSystem:
         }

     def setup_models(self):
-        """Initialize emotion recognition models"""
-        if HF_AVAILABLE:
-            try:
-                # Facial emotion recognition
-                self.face_emotion_pipeline = pipeline(
-                    "image-classification",
-                    model="j-hartmann/emotion-english-distilroberta-base",
-                    device=0 if torch.cuda.is_available() else -1
-                )
-
-                # Audio emotion recognition
-                self.audio_emotion_pipeline = pipeline(
-                    "audio-classification",
-                    model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
-                    device=0 if torch.cuda.is_available() else -1
-                )
-                self.models_loaded = True
-            except Exception as e:
-                print(f"Error loading models: {e}")
-                self.models_loaded = False
-        else:
             self.models_loaded = False

     def detect_face_emotion(self, frame):
-        """Detect emotions from facial expressions"""
         if not self.models_loaded:
             # Mock emotion detection for demo
             emotions = ['neutral', 'happy', 'sad', 'angry', 'fear', 'surprise', 'disgust']
@@ -93,20 +139,24 @@ class EmotionRecognitionSystem:
             return dict(zip(emotions, scores))

         try:
-            # Handle different image formats
             if isinstance(frame, np.ndarray):
-                if CV2_AVAILABLE:
-                    # Convert frame to RGB if it's BGR
-                    if len(frame.shape) == 3 and frame.shape[2] == 3:
-                        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     else:
                         rgb_frame = frame
                 else:
-                    # Use numpy operations for color conversion
-                    if len(frame.shape) == 3 and frame.shape[2] == 3:
-                        rgb_frame = frame[:, :, ::-1]  # BGR to RGB
                     else:
-                        rgb_frame = frame
             else:
                 rgb_frame = frame

@@ -124,8 +174,8 @@
             print(f"Face emotion detection error: {e}")
             return {'neutral': 1.0}

-    def detect_voice_emotion(self, audio_data, sample_rate=16000):
-        """Detect emotions from voice tone"""
         if not self.models_loaded or audio_data is None:
             # Mock emotion detection
             emotions = ['neutral', 'happy', 'sad', 'angry', 'fear']
@@ -133,8 +183,18 @@
             return dict(zip(emotions, scores))

         try:
             # Process audio with the model
-            results = self.audio_emotion_pipeline(audio_data)

             emotion_scores = {}
             for result in results:
@@ -146,431 +206,59 @@
             print(f"Voice emotion detection error: {e}")
             return {'neutral': 1.0}

-    def extract_audio_features(self, audio_data, sample_rate):
-        """Extract audio features for emotion analysis"""
-        if not LIBROSA_AVAILABLE:
-            # Return mock features if librosa is not available
-            return {
-                'mfcc_mean': np.random.random(),
-                'mfcc_std': np.random.random(),
-                'spectral_centroid_mean': np.random.random(),
-                'zcr_mean': np.random.random(),
-                'spectral_rolloff_mean': np.random.random()
-            }
-
-        try:
-            # Extract basic audio features
-            mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
-            spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)
-            zero_crossing_rate = librosa.feature.zero_crossing_rate(audio_data)
-            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_data, sr=sample_rate)
-
-            features = {
-                'mfcc_mean': np.mean(mfccs),
-                'mfcc_std': np.std(mfccs),
-                'spectral_centroid_mean': np.mean(spectral_centroids),
-                'zcr_mean': np.mean(zero_crossing_rate),
-                'spectral_rolloff_mean': np.mean(spectral_rolloff)
-            }
-
-            return features
-        except Exception as e:
-            print(f"Audio feature extraction error: {e}")
-            return {}
-
-    def combine_emotions(self, face_emotions, voice_emotions, weights=(0.6, 0.4)):
-        """Combine facial and voice emotion predictions"""
-        combined = {}
-        all_emotions = set(face_emotions.keys()) | set(voice_emotions.keys())
-
-        for emotion in all_emotions:
-            face_score = face_emotions.get(emotion, 0)
-            voice_score = voice_emotions.get(emotion, 0)
-            combined[emotion] = weights[0] * face_score + weights[1] * voice_score
-
-        return combined
-
-    def map_to_clinical_emotions(self, emotions):
-        """Map detected emotions to clinical categories"""
-        clinical_mapping = {
-            'stress': emotions.get('angry', 0) * 0.3 + emotions.get('fear', 0) * 0.4 + emotions.get('disgust', 0) * 0.3,
-            'anxiety': emotions.get('fear', 0) * 0.6 + emotions.get('surprise', 0) * 0.2 + emotions.get('sad', 0) * 0.2,
-            'pain': emotions.get('angry', 0) * 0.4 + emotions.get('disgust', 0) * 0.3 + emotions.get('sad', 0) * 0.3,
-            'confusion': emotions.get('surprise', 0) * 0.5 + emotions.get('neutral', 0) * 0.3 + emotions.get('fear', 0) * 0.2,
-            'comfort': emotions.get('happy', 0) * 0.7 + emotions.get('neutral', 0) * 0.3
-        }
-
-        return clinical_mapping
-
-    def generate_alerts(self, clinical_emotions):
-        """Generate alerts based on emotion thresholds"""
-        alerts = []
-        suggestions = []
-
-        for emotion, score in clinical_emotions.items():
-            if emotion in self.alert_thresholds and score > self.alert_thresholds[emotion]:
-                alerts.append(f"⚠️ High {emotion} detected ({score:.2f})")
-
-                # Add specific suggestions
-                if emotion == 'stress':
-                    suggestions.append("Consider: Take a moment to slow down, use calming voice tone")
-                elif emotion == 'anxiety':
-                    suggestions.append("Consider: Provide reassurance, explain procedures clearly")
-                elif emotion == 'pain':
-                    suggestions.append("Consider: Assess pain level, offer comfort measures")
-                elif emotion == 'confusion':
-                    suggestions.append("Consider: Simplify explanations, check understanding")
-
-        return alerts, suggestions
-
-    def process_frame(self, frame, audio_data=None, sample_rate=16000):
-        """Process a single frame and audio data"""
-        timestamp = datetime.now()
-
-        # Detect emotions
-        face_emotions = self.detect_face_emotion(frame)
-        voice_emotions = self.detect_voice_emotion(audio_data, sample_rate) if audio_data is not None else {}
-
-        # Combine emotions
-        if voice_emotions:
-            combined_emotions = self.combine_emotions(face_emotions, voice_emotions)
-        else:
-            combined_emotions = face_emotions
-
-        # Map to clinical categories
-        clinical_emotions = self.map_to_clinical_emotions(combined_emotions)
-
-        # Generate alerts
-        alerts, suggestions = self.generate_alerts(clinical_emotions)
-
-        # Store in history
-        emotion_record = {
-            'timestamp': timestamp,
-            'face_emotions': face_emotions,
-            'voice_emotions': voice_emotions,
-            'clinical_emotions': clinical_emotions,
-            'alerts': alerts,
-            'suggestions': suggestions
-        }
-
-        self.emotion_history.append(emotion_record)
-
-        return emotion_record
-
-# Initialize the emotion recognition system
-emotion_system = EmotionRecognitionSystem()

 def process_video_audio(video_frame, audio_data):
-    """Process video frame and audio data"""
     if video_frame is None:
         return None, "No video input", "", ""

-    # Process the frame
-    sample_rate = 16000
-    if audio_data is not None:
-        audio_array, sr = audio_data
-        if LIBROSA_AVAILABLE and sr != sample_rate:
-            audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=sample_rate)
-        elif not LIBROSA_AVAILABLE:
-            # Simple resampling if librosa not available
-            if sr != sample_rate:
-                # Basic downsampling
-                step = sr // sample_rate
-                audio_array = audio_array[::step] if step > 1 else audio_array
-    else:
-        audio_array = None
-
-    # Get emotion analysis
-    emotion_record = emotion_system.process_frame(video_frame, audio_array, sample_rate)
-
-    # Create visualization
-    annotated_frame = create_emotion_overlay(video_frame, emotion_record)
-
-    # Format results
-    clinical_text = format_clinical_emotions(emotion_record['clinical_emotions'])
-    alerts_text = "\n".join(emotion_record['alerts']) if emotion_record['alerts'] else "No alerts"
-    suggestions_text = "\n".join(emotion_record['suggestions']) if emotion_record['suggestions'] else "No suggestions"
-
-    return annotated_frame, clinical_text, alerts_text, suggestions_text
-
-def create_emotion_overlay(frame, emotion_record):
-    """Add emotion information overlay to video frame"""
     try:
-        if CV2_AVAILABLE:
-            annotated_frame = frame.copy()
-
-            # Get top emotion
-            clinical_emotions = emotion_record['clinical_emotions']
-            top_emotion = max(clinical_emotions.items(), key=lambda x: x[1])
-
-            # Add text overlay
-            cv2.putText(annotated_frame, f"Primary: {top_emotion[0]} ({top_emotion[1]:.2f})",
-                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
-
-            # Add alert indicator
-            if emotion_record['alerts']:
-                cv2.putText(annotated_frame, "ALERT!", (10, 60),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
-
-            return annotated_frame

-        elif PIL_AVAILABLE:
-            # Use PIL for image annotation
-            pil_image = Image.fromarray(frame)
-            draw = ImageDraw.Draw(pil_image)
-
-            # Get top emotion
-            clinical_emotions = emotion_record['clinical_emotions']
-            top_emotion = max(clinical_emotions.items(), key=lambda x: x[1])
-
-            # Add text overlay
-            try:
-                font = ImageFont.load_default()
-            except:
-                font = None
-
-            text = f"Primary: {top_emotion[0]} ({top_emotion[1]:.2f})"
-            draw.text((10, 10), text, fill=(0, 255, 0), font=font)
-
-            # Add alert indicator
-            if emotion_record['alerts']:
-                draw.text((10, 40), "ALERT!", fill=(255, 0, 0), font=font)
-
-            return np.array(pil_image)

-        else:
-            # Return original frame if no image processing available
-            return frame
-
     except Exception as e:
-        print(f"Error creating emotion overlay: {e}")
-        return frame

-def format_clinical_emotions(clinical_emotions):
-    """Format clinical emotions for display"""
-    formatted = []
-    for emotion, score in clinical_emotions.items():
-        bar = "█" * int(score * 10)
-        formatted.append(f"{emotion.capitalize()}: {bar} {score:.3f}")
-    return "\n".join(formatted)

-def create_emotion_timeline():
-    """Create emotion timeline chart"""
-    if not emotion_system.emotion_history:
-        return create_empty_chart()
-
-    # Extract data for plotting
-    timestamps = [record['timestamp'] for record in emotion_system.emotion_history]
-
-    fig = go.Figure()
-
-    # Add traces for each clinical emotion
-    clinical_emotions = ['stress', 'anxiety', 'pain', 'confusion', 'comfort']
-    colors = ['red', 'orange', 'purple', 'brown', 'green']
-
-    for emotion, color in zip(clinical_emotions, colors):
-        values = [record['clinical_emotions'].get(emotion, 0) for record in emotion_system.emotion_history]
-        fig.add_trace(go.Scatter(
-            x=timestamps,
-            y=values,
-            mode='lines+markers',
-            name=emotion.capitalize(),
-            line=dict(color=color, width=2),
-            marker=dict(size=4)
-        ))
-
-    fig.update_layout(
-        title="Patient Emotion Timeline",
-        xaxis_title="Time",
-        yaxis_title="Emotion Intensity",
-        height=400,
-        showlegend=True,
-        template="plotly_white"
-    )
-
-    return fig
-
-def create_empty_chart():
-    """Create empty chart when no data available"""
-    fig = go.Figure()
-    fig.add_annotation(
-        text="No emotion data available yet",
-        xref="paper", yref="paper",
-        x=0.5, y=0.5, xanchor='center', yanchor='middle',
-        showarrow=False, font=dict(size=16)
-    )
-    fig.update_layout(
-        title="Patient Emotion Timeline",
-        height=400,
-        template="plotly_white"
-    )
-    return fig
-
-def get_session_summary():
-    """Generate session summary"""
-    if not emotion_system.emotion_history:
-        return "No session data available"
-
-    # Calculate averages
-    avg_emotions = {}
-    total_alerts = 0
-
-    for emotion in ['stress', 'anxiety', 'pain', 'confusion', 'comfort']:
-        values = [record['clinical_emotions'].get(emotion, 0) for record in emotion_system.emotion_history]
-        avg_emotions[emotion] = np.mean(values) if values else 0
-
-    total_alerts = sum(len(record['alerts']) for record in emotion_system.emotion_history)
-
-    # Format summary
-    summary = f"""
-    Session Summary:
-    - Duration: {len(emotion_system.emotion_history)} readings
-    - Average Stress Level: {avg_emotions['stress']:.3f}
-    - Average Anxiety Level: {avg_emotions['anxiety']:.3f}
-    - Average Pain Level: {avg_emotions['pain']:.3f}
-    - Average Confusion Level: {avg_emotions['confusion']:.3f}
-    - Average Comfort Level: {avg_emotions['comfort']:.3f}
-    - Total Alerts: {total_alerts}
-
-    Recommendations:
-    - Monitor stress levels during consultation
-    - Ensure patient understanding and comfort
-    - Address any recurring high emotion levels
-    """
-
-    return summary
-
-def clear_session():
-    """Clear session data"""
-    emotion_system.emotion_history.clear()
-    return "Session data cleared", create_empty_chart(), ""
-
-# Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Patient Emotion Recognition System", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # 🏥 Real-Time Patient Emotion Recognition System
-
-        This system analyzes patient facial expressions and voice tone during consultations to detect emotions such as stress, anxiety, confusion, or pain.
-        """)
-
-        with gr.Row():
-            with gr.Column(scale=2):
-                gr.Markdown("### 📹 Live Analysis")
-
-                # Video input
-                video_input = gr.Video(
-                    label="Video Feed",
-                    sources=["webcam"],
-                    streaming=True
-                )
-
-                # Audio input
-                audio_input = gr.Audio(
-                    label="Audio Input",
-                    sources=["microphone"],
-                    type="numpy",
-                    streaming=True
-                )
-
-                # Process button
-                process_btn = gr.Button("🔄 Process Current Frame", variant="primary")
-
-            with gr.Column(scale=2):
-                gr.Markdown("### 📊 Real-Time Results")
-
-                # Annotated video output
-                video_output = gr.Image(
-                    label="Emotion Analysis",
-                    type="numpy"
-                )
-
-                # Clinical emotions display
-                clinical_output = gr.Textbox(
-                    label="Clinical Emotion Levels",
-                    lines=6,
-                    interactive=False
-                )
-
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### ⚠️ Alerts")
-                alerts_output = gr.Textbox(
-                    label="Current Alerts",
-                    lines=3,
-                    interactive=False
-                )
-
-            with gr.Column():
-                gr.Markdown("### 💡 Suggestions")
-                suggestions_output = gr.Textbox(
-                    label="Practitioner Suggestions",
-                    lines=3,
-                    interactive=False
-                )
-
-        with gr.Row():
-            gr.Markdown("### 📈 Emotion Timeline")
-            timeline_plot = gr.Plot(label="Emotion Timeline")
-
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### 📋 Session Summary")
-                summary_output = gr.Textbox(
-                    label="Session Summary",
-                    lines=12,
-                    interactive=False
-                )
-
-        with gr.Row():
-            update_summary_btn = gr.Button("📊 Update Summary")
-            clear_btn = gr.Button("🗑️ Clear Session", variant="secondary")
-            update_timeline_btn = gr.Button("🔄 Update Timeline")
-
-        # Event handlers
-        process_btn.click(
-            fn=process_video_audio,
-            inputs=[video_input, audio_input],
-            outputs=[video_output, clinical_output, alerts_output, suggestions_output]
-        )
-
-        update_timeline_btn.click(
-            fn=create_emotion_timeline,
-            outputs=timeline_plot
-        )
-
-        update_summary_btn.click(
-            fn=get_session_summary,
-            outputs=summary_output
-        )
-
-        clear_btn.click(
-            fn=clear_session,
-            outputs=[summary_output, timeline_plot, clinical_output]
-        )
-
-        # Auto-update timeline every few seconds
-        demo.load(fn=create_emotion_timeline, outputs=timeline_plot)

         gr.Markdown("""
-        ### 📝 Usage Instructions:
-        1. **Enable camera and microphone** access when prompted
-        2. **Click "Process Current Frame"** to analyze emotions in real-time
-        3. **Monitor the timeline** to track emotion changes over time
-        4. **Review alerts and suggestions** for patient care recommendations
-        5. **Use session summary** for consultation documentation
-
-        ### 🔧 Technical Notes:
-        - System uses pre-trained emotion recognition models
-        - Combines facial expression and voice tone analysis
-        - Provides clinical emotion mapping (stress, anxiety, pain, confusion)
-        - Generates real-time alerts and suggestions for practitioners
         """)

     return demo

-# Launch the application
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(
         share=True,
app.py (updated side of the diff):

 import time
 from collections import deque
 import warnings
+import traceback
 warnings.filterwarnings("ignore")

+# Audio processing imports with fallbacks
+AUDIO_AVAILABLE = True
 try:
+    import soundfile as sf
     import librosa
     LIBROSA_AVAILABLE = True
 except ImportError:
     LIBROSA_AVAILABLE = False
     print("Librosa not available - using basic audio processing")

+# Image processing imports with fallbacks
+CV2_AVAILABLE = True
 try:
+    import cv2
 except ImportError:
+    CV2_AVAILABLE = False
+    print("OpenCV not available - using PIL for image processing")

 try:
     from PIL import Image, ImageDraw, ImageFont
     PIL_AVAILABLE = True
 except ImportError:
     PIL_AVAILABLE = False
+    print("PIL not available - limited image processing")
+
+# AI model imports with fallbacks
+HF_AVAILABLE = True
+try:
+    from transformers import pipeline
+    import torch
+except ImportError:
+    HF_AVAILABLE = False
+    print("Transformers not available - using mock emotion detection")

 class EmotionRecognitionSystem:
     def __init__(self):
+        self.emotion_history = deque(maxlen=100)
         self.audio_queue = queue.Queue()
         self.video_queue = queue.Queue()
         self.setup_models()

         self.alert_thresholds = {
             'stress': 0.7,
             'anxiety': 0.6,
         }

     def setup_models(self):
+        """Initialize emotion recognition models with better error handling"""
+        self.models_loaded = False
+
+        if not HF_AVAILABLE:
+            print("Skipping model loading - transformers not available")
+            return
+
+        try:
+            # Facial emotion recognition
+            self.face_emotion_pipeline = pipeline(
+                "image-classification",
+                model="j-hartmann/emotion-english-distilroberta-base",
+                device=0 if torch.cuda.is_available() else -1
+            )
+
+            # Audio emotion recognition
+            self.audio_emotion_pipeline = pipeline(
+                "audio-classification",
+                model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
+                device=0 if torch.cuda.is_available() else -1
+            )
+            self.models_loaded = True
+        except Exception as e:
+            print(f"Error loading models: {e}")
+            print(traceback.format_exc())
             self.models_loaded = False

+    def validate_audio_input(self, audio_data):
+        """Validate and standardize audio input format"""
+        if audio_data is None:
+            return None
+
+        try:
+            # Handle different audio input formats
+            if isinstance(audio_data, tuple):
+                audio_array, sample_rate = audio_data
+            else:
+                # Try to read audio file if not in tuple format
+                if isinstance(audio_data, str):
+                    if LIBROSA_AVAILABLE:
+                        audio_array, sample_rate = librosa.load(audio_data, sr=None)
+                    else:
+                        # Fallback for when librosa is not available
+                        import wave
+                        with wave.open(audio_data, 'rb') as wf:
+                            sample_rate = wf.getframerate()
+                            audio_array = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
+                            audio_array = audio_array.astype(np.float32) / 32768.0
+                else:
+                    return None
+
+            # Resample if needed
+            target_rate = 16000
+            if sample_rate != target_rate:
+                if LIBROSA_AVAILABLE:
+                    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_rate)
+                else:
+                    # Simple downsampling fallback
+                    step = int(sample_rate / target_rate)
+                    if step > 1:
+                        audio_array = audio_array[::step]
+                sample_rate = target_rate
+
+            return (audio_array, sample_rate)
+
+        except Exception as e:
+            print(f"Audio validation error: {e}")
+            return None
+
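One caveat worth noting next to validate_audio_input: when the input is a tuple it is unpacked as (array, sample_rate), while Gradio's gr.Audio(type="numpy") component normally delivers (sample_rate, int16 array). A hedged sketch of a normalisation shim that could sit in front of it; normalize_gradio_audio is an illustrative name, not something this commit defines:

```python
import numpy as np

def normalize_gradio_audio(audio):
    """Illustrative helper (not in app.py): accept (sample_rate, data) or
    (data, sample_rate) and return a (float32 mono array, sample_rate) pair."""
    if audio is None:
        return None
    a, b = audio
    # Gradio's numpy audio is usually (sample_rate, data); tolerate both orders.
    sample_rate, data = (a, b) if np.isscalar(a) else (b, a)
    data = np.asarray(data)
    if data.dtype == np.int16:      # Gradio delivers int16 PCM samples
        data = data.astype(np.float32) / 32768.0
    if data.ndim == 2:              # stereo -> mono
        data = data.mean(axis=1)
    return data.astype(np.float32), int(sample_rate)
```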
     def detect_face_emotion(self, frame):
+        """Detect emotions from facial expressions with better error handling"""
         if not self.models_loaded:
             # Mock emotion detection for demo
             emotions = ['neutral', 'happy', 'sad', 'angry', 'fear', 'surprise', 'disgust']

             return dict(zip(emotions, scores))

         try:
+            # Convert frame to RGB format
             if isinstance(frame, np.ndarray):
+                if len(frame.shape) == 3:
+                    if frame.shape[2] == 4:  # RGBA
+                        rgb_frame = frame[:, :, :3]
+                    elif frame.shape[2] == 3:  # BGR or RGB?
+                        if CV2_AVAILABLE:
+                            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                        else:
+                            rgb_frame = frame[:, :, ::-1]  # Simple BGR to RGB
                     else:
                         rgb_frame = frame
                 else:
+                    # Grayscale to RGB
+                    if CV2_AVAILABLE:
+                        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
                     else:
+                        rgb_frame = np.stack((frame,) * 3, axis=-1)
             else:
                 rgb_frame = frame

             print(f"Face emotion detection error: {e}")
             return {'neutral': 1.0}

+    def detect_voice_emotion(self, audio_data):
+        """Detect emotions from voice tone with better audio handling"""
         if not self.models_loaded or audio_data is None:
             # Mock emotion detection
             emotions = ['neutral', 'happy', 'sad', 'angry', 'fear']

             return dict(zip(emotions, scores))

         try:
+            # Validate and standardize audio input
+            validated_audio = self.validate_audio_input(audio_data)
+            if validated_audio is None:
+                return {'neutral': 1.0}
+
+            audio_array, sample_rate = validated_audio
+
             # Process audio with the model
+            results = self.audio_emotion_pipeline({
+                "array": audio_array,
+                "sampling_rate": sample_rate
+            })

             emotion_scores = {}
             for result in results:

             print(f"Voice emotion detection error: {e}")
             return {'neutral': 1.0}

+    # [Rest of your existing methods...]
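The dict handed to audio_emotion_pipeline above ("array" plus "sampling_rate") is the raw-waveform input form that recent transformers releases accept for audio-classification pipelines. A minimal sketch of that call pattern, assuming transformers and torch are installed and the ehcalabres checkpoint downloads successfully; the sine wave is only a placeholder for real microphone audio:

```python
import numpy as np
from transformers import pipeline

# device=-1 keeps inference on CPU; pass a GPU index if one is available.
clf = pipeline(
    "audio-classification",
    model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
    device=-1,
)

# One second of a 220 Hz tone at 16 kHz, standing in for captured speech.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
waveform = (0.1 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)

results = clf({"array": waveform, "sampling_rate": sr})
for r in results:                      # list of {"label": ..., "score": ...}
    print(f"{r['label']}: {r['score']:.3f}")
```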

 def process_video_audio(video_frame, audio_data):
+    """Process video frame and audio data with better error handling"""
     if video_frame is None:
         return None, "No video input", "", ""

     try:
+        # Process the frame
+        validated_audio = emotion_system.validate_audio_input(audio_data)
+
+        # Get emotion analysis
+        emotion_record = emotion_system.process_frame(
+            video_frame,
+            validated_audio[0] if validated_audio else None,
+            validated_audio[1] if validated_audio else 16000
+        )

+        # Create visualization
+        annotated_frame = create_emotion_overlay(video_frame, emotion_record)
+
+        # Format results
+        clinical_text = format_clinical_emotions(emotion_record['clinical_emotions'])
+        alerts_text = "\n".join(emotion_record['alerts']) if emotion_record['alerts'] else "No alerts"
+        suggestions_text = "\n".join(emotion_record['suggestions']) if emotion_record['suggestions'] else "No suggestions"
+
+        return annotated_frame, clinical_text, alerts_text, suggestions_text

     except Exception as e:
+        print(f"Processing error: {e}")
+        traceback.print_exc()
+        return video_frame, "Processing error", "System error", "Please try again"

+# [Rest of your existing functions...]

 def create_interface():
     with gr.Blocks(title="Patient Emotion Recognition System", theme=gr.themes.Soft()) as demo:
+        # [Your existing interface code...]

+        # Add audio format info
         gr.Markdown("""
+        ### 🔊 Audio Input Notes:
+        - System works best with clear microphone input
+        - If you get audio errors, try:
+          - Checking microphone permissions
+          - Reducing background noise
+          - Using a different microphone
         """)

     return demo

 if __name__ == "__main__":
+    emotion_system = EmotionRecognitionSystem()
     demo = create_interface()
     demo.launch(
         share=True,
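One consequence of the new layout: emotion_system is created inside the __main__ guard, but process_video_audio still reads it as a module-level global, so the name only exists when app.py is run as a script (importing the module and calling the handler would raise a NameError). A hedged sketch of a lazy-accessor alternative; get_emotion_system is an illustrative helper, not part of this commit:

```python
_emotion_system = None

def get_emotion_system():
    """Illustrative lazy accessor (not in app.py): build the system on first
    use so merely importing the module does not trigger model downloads."""
    global _emotion_system
    if _emotion_system is None:
        _emotion_system = EmotionRecognitionSystem()
    return _emotion_system

# process_video_audio could then call get_emotion_system() instead of
# relying on the emotion_system global created under __main__.
```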