yunusajib committed
Commit ef2be41 · verified · 1 Parent(s): 22bda4b

app update

Files changed (1)
  1. app.py +92 -330
app.py CHANGED
@@ -1,350 +1,112 @@
- import gradio as gr
- import numpy as np
  import cv2
- import pandas as pd
- from datetime import datetime
- import time
- import librosa
- import joblib
- from python_speech_features import mfcc
  import onnxruntime as ort
- import requests
- import os
  from sklearn.preprocessing import StandardScaler

- # Constants - Updated with alternative model sources
- MODEL_URLS = [
-     "https://github.com/onnx/models/raw/main/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-8.onnx",
-     "https://www.dropbox.com/s/7mswy6h0k3f8ydo/emotion-ferplus-8.onnx?dl=1"
- ]
- MODEL_PATH = "emotion-ferplus-8.onnx"
- VOICE_MODEL_PATH = "voice_emotion_model.pkl"
- VOICE_SCALER_PATH = "voice_scaler.pkl"

- class EmotionModel:
-     def __init__(self):
-         self.session = None
-         self.labels = ['neutral', 'happy', 'surprise', 'sad', 'angry', 'disgust', 'fear', 'contempt']
-         self.emotion_buffer = []
-         self.load_model()
-
-     def download_model(self):
-         for url in MODEL_URLS:
-             try:
-                 print(f"Attempting to download model from: {url}")
-                 response = requests.get(url, stream=True, timeout=30)
-                 response.raise_for_status()
-
-                 with open(MODEL_PATH, "wb") as f:
-                     for chunk in response.iter_content(chunk_size=8192):
-                         if chunk:
-                             f.write(chunk)
-
-                 if os.path.exists(MODEL_PATH):
-                     print(f"Successfully downloaded model from {url}")
-                     return True
-             except Exception as e:
-                 print(f"Download attempt failed from {url}: {str(e)}")
-
-         return False
-
-     def load_model(self):
-         if not os.path.exists(MODEL_PATH):
-             if not self.download_model():
-                 print("Warning: Could not download emotion model. Using simple face detection only.")
-                 self.session = None
-                 return
-
-         try:
-             so = ort.SessionOptions()
-             so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-             self.session = ort.InferenceSession(MODEL_PATH, so)
-             print("Emotion model loaded successfully")
-         except Exception as e:
-             print(f"Failed to load ONNX model: {str(e)}")
-             self.session = None
-
-     def softmax(self, x):
-         e_x = np.exp(x - np.max(x))
-         return e_x / e_x.sum()
-
-     def predict(self, frame):
-         if self.session is None:
-             # Return dummy probabilities if model failed to load
-             base = np.array([0.7] + [0.1]*7)
-             variation = np.random.normal(0, 0.01, size=8)
-             return [np.clip(base + variation, 0, 1).reshape(1, -1)]
-
-         try:
-             raw_prediction = self.session.run(None, {'Input3': frame})[0][0]
-             self.emotion_buffer.append(raw_prediction)
-
-             if len(self.emotion_buffer) > 5:
-                 self.emotion_buffer = self.emotion_buffer[-5:]
-
-             smoothed_probs = np.mean(self.emotion_buffer, axis=0)
-             return self.softmax(smoothed_probs).reshape(1, -1)
-         except Exception as e:
-             print(f"Prediction error: {str(e)}")
-             return [np.array([[0.8] + [0.1]*7])] # Mostly neutral fallback

- class VoiceEmotionClassifier:
-     def __init__(self):
-         try:
-             if os.path.exists(VOICE_MODEL_PATH) and os.path.exists(VOICE_SCALER_PATH):
-                 self.model = joblib.load(VOICE_MODEL_PATH)
-                 self.scaler = joblib.load(VOICE_SCALER_PATH)
-                 self.labels = ['neutral', 'happy', 'sad', 'angry', 'fear']
-                 print("Loaded pretrained voice emotion model")
-             else:
-                 raise FileNotFoundError("Pretrained voice model not found")
-         except Exception as e:
-             print(f"Voice model loading failed: {str(e)}")
-             print("Using limited rule-based voice analysis")
-             self.model = None
-             self.scaler = StandardScaler()
-             dummy_features = np.random.randn(100, 18)
-             self.scaler.fit(dummy_features)
-             self.labels = ['neutral', 'happy', 'sad', 'angry', 'fear']
-
-     def extract_features(self, audio):
-         try:
-             y, sr = audio
-             features = []
-
-             if len(y.shape) > 1:
-                 y = np.mean(y, axis=0)
-
-             if sr != 16000:
-                 y = librosa.resample(y, orig_sr=sr, target_sr=16000)
-                 sr = 16000
-
-             mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-             features.extend(np.mean(mfccs, axis=1))
-             features.extend(np.std(mfccs, axis=1))
-
-             pitches = librosa.yin(y, fmin=80, fmax=400)
-             features.append(np.nanmean(pitches))
-             features.append(np.nanstd(pitches))
-
-             spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-             features.append(np.mean(spectral_centroid))
-
-             return np.array(features)
-         except Exception as e:
-             print(f"Feature extraction error: {str(e)}")
-             return np.zeros(18) if self.model else np.zeros(13)
-
-     def predict(self, audio):
-         try:
-             features = self.extract_features(audio).reshape(1, -1)
-             features = self.scaler.transform(features)
-
-             if self.model:
-                 probs = self.model.predict_proba(features)[0]
-                 emotion = self.labels[np.argmax(probs)]
-                 details = [{"label": l, "score": p} for l, p in zip(self.labels, probs)]
-             else:
-                 if features[0, 0] > 1.0:
-                     emotion = "happy"
-                     details = [{"label": "happy", "score": 0.8}]
-                 elif features[0, 0] < -1.0:
-                     emotion = "sad"
-                     details = [{"label": "sad", "score": 0.7}]
-                 elif abs(features[0, 1]) > 0.8:
-                     emotion = "angry"
-                     details = [{"label": "angry", "score": 0.6}]
-                 else:
-                     emotion = "neutral"
-                     details = [{"label": "neutral", "score": 0.9}]
-
-             return emotion, details
-         except Exception as e:
-             print(f"Voice prediction error: {str(e)}")
-             return "neutral", [{"label": "neutral", "score": 1.0}]

- # Initialize models
- emotion_model = EmotionModel()
- voice_classifier = VoiceEmotionClassifier()


- # Global variables to store results
- emotion_history = []
- current_emotions = {"face": "neutral", "voice": "neutral"}
- last_update_time = time.time()

- def analyze_face(frame):
-     """Analyze facial expressions in the frame using ONNX model"""
-     try:
-         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-         face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
-         faces = face_cascade.detectMultiScale(gray, 1.3, 5)
-
-         if len(faces) > 0:
-             x, y, w, h = faces[0]
-             face_roi = gray[y:y+h, x:x+w]
-
-             # Correct preprocessing for FER+ model
-             face_roi = cv2.resize(face_roi, (64, 64))
-             face_roi = face_roi.astype('float32')
-             face_roi = (face_roi - 127.5) / 127.5 # Normalize to [-1, 1] range
-             face_roi = np.expand_dims(face_roi, axis=(0, 1))
-
-             results = emotion_model.predict(face_roi)
-             emotion_probs = results[0]
-
-             # Only accept predictions with confidence > 0.5
-             if np.max(emotion_probs) < 0.5:
-                 return "uncertain", {label: 0.0 for label in emotion_model.labels}
-
-             dominant_emotion = emotion_model.labels[np.argmax(emotion_probs)]
-             emotions = {label: float(prob) for label, prob in zip(emotion_model.labels, emotion_probs)}
-             return dominant_emotion, emotions
-
-         return "neutral", {label: 0.0 for label in emotion_model.labels}
-     except Exception as e:
-         print(f"Face analysis error: {str(e)}")
-         return "neutral", {label: 0.0 for label in emotion_model.labels}

- def analyze_voice(audio):
-     """Analyze voice tone from audio"""
-     return voice_classifier.predict(audio)

- def update_emotion_history(face_emotion, voice_emotion):
-     """Update the emotion history and current emotions"""
-     global current_emotions, emotion_history, last_update_time
-
-     current_time = datetime.now().strftime("%H:%M:%S")
-     current_emotions = {
-         "face": face_emotion,
-         "voice": voice_emotion,
-         "timestamp": current_time
-     }
-
-     if (time.time() - last_update_time) > 5 or not emotion_history:
-         emotion_history.append(current_emotions.copy())
-         last_update_time = time.time()
-
-     if len(emotion_history) > 20:
-         emotion_history = emotion_history[-20:]

- def get_emotion_timeline():
-     """Create a timeline DataFrame for display"""
-     if not emotion_history:
-         return pd.DataFrame(columns=["Time", "Facial Emotion", "Voice Emotion"])
-
-     df = pd.DataFrame(emotion_history)
-     df = df.rename(columns={
-         "timestamp": "Time",
-         "face": "Facial Emotion",
-         "voice": "Voice Emotion"
-     })
-     return df

- def get_practitioner_advice(face_emotion, voice_emotion):
-     """Generate suggestions based on detected emotions"""
-     advice = []

-     # Facial emotion advice
-     if face_emotion in ["sad", "fear"]:
-         advice.append("Patient appears distressed. Consider speaking more slowly and with reassurance.")
-     elif face_emotion == "angry":
-         advice.append("Patient seems frustrated. Acknowledge their concerns and maintain calm demeanor.")
-     elif face_emotion == "disgust":
-         advice.append("Patient may be uncomfortable. Check if they're experiencing any discomfort.")
-     elif face_emotion == "surprise":
-         advice.append("Patient seems surprised. Ensure they understand all information.")
-     elif face_emotion == "uncertain":
-         advice.append("Facial expression unclear. Pay closer attention to verbal cues.")
-
-     # Voice emotion advice
-     if voice_emotion in ["sad", "fear"]:
-         advice.append("Patient's tone suggests anxiety. Provide clear explanations and emotional support.")
-     elif voice_emotion == "angry":
-         advice.append("Patient sounds upset. Practice active listening and validate their feelings.")
-     elif voice_emotion == "happy":
-         advice.append("Patient seems positive. This may be a good time to discuss treatment options.")
-
-     return "\n".join(advice) if advice else "Patient appears neutral. Continue with consultation."

- def process_input(video, audio):
-     """Process video and audio inputs to detect emotions"""
-     try:
-         # Process video frame
-         if video is not None:
-             frame = cv2.cvtColor(video, cv2.COLOR_RGB2BGR)
-             face_emotion, face_details = analyze_face(frame)
-         else:
-             face_emotion, face_details = "neutral", {}
-
-         # Process audio
-         if audio is not None:
-             voice_emotion, voice_details = analyze_voice(audio)
-         else:
-             voice_emotion, voice_details = "neutral", {}
-
-         update_emotion_history(face_emotion, voice_emotion)
-         timeline_df = get_emotion_timeline()
-         advice = get_practitioner_advice(face_emotion, voice_emotion)
-
-         return (
-             face_emotion,
-             voice_emotion,
-             timeline_df,
-             advice,
-             str(face_details),
-             str(voice_details)
-         )
-     except Exception as e:
-         print(f"Processing error: {str(e)}")
-         return (
-             "Error",
-             "Error",
-             pd.DataFrame(),
-             "System error occurred",
-             "",
-             ""
-         )

- # Gradio interface
- with gr.Blocks(title="Patient Emotion Recognition", theme="soft") as demo:
-     gr.Markdown("# Real-Time Patient Emotion Recognition")
-     gr.Markdown("Analyze facial expressions and voice tone during medical consultations")
-
-     with gr.Row():
-         with gr.Column():
-             video_input = gr.Image(label="Live Camera Feed", streaming=True)
-             audio_input = gr.Audio(label="Voice Input", sources=["microphone"], type="numpy")
-             submit_btn = gr.Button("Analyze Emotions")

-         with gr.Column():
-             current_face = gr.Textbox(label="Current Facial Emotion")
-             current_voice = gr.Textbox(label="Current Voice Emotion")
-             advice_output = gr.Textbox(label="Practitioner Suggestions", lines=3)
-             timeline_output = gr.Dataframe(label="Emotion Timeline", interactive=False)
-             face_details = gr.Textbox(label="Face Analysis Details", visible=False)
-             voice_details = gr.Textbox(label="Voice Analysis Details", visible=False)
-
-     # Live processing
-     video_input.change(
-         process_input,
-         inputs=[video_input, audio_input],
-         outputs=[current_face, current_voice, timeline_output, advice_output, face_details, voice_details],
-         show_progress="hidden"
-     )
-
-     audio_input.change(
-         process_input,
-         inputs=[video_input, audio_input],
-         outputs=[current_face, current_voice, timeline_output, advice_output, face_details, voice_details],
-         show_progress="hidden"
-     )
-
-     submit_btn.click(
-         process_input,
-         inputs=[video_input, audio_input],
-         outputs=[current_face, current_voice, timeline_output, advice_output, face_details, voice_details]
-     )

  if __name__ == "__main__":
-     demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)

  import cv2
+ import numpy as np
+ import pyttsx3
  import onnxruntime as ort
+ import librosa
+ import sounddevice as sd
+ import tempfile
+ import scipy.io.wavfile as wavfile
  from sklearn.preprocessing import StandardScaler
+ import time
+ import os

+ # ------------------- Speech Emotion Recognition Model -------------------
+ class SpeechEmotionRecognizer:
+     def __init__(self, model_path):
+         self.model = ort.InferenceSession(model_path)
+         self.input_name = self.model.get_inputs()[0].name
+         self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

+     def extract_features(self, y, sr):
+         mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
+         mfcc_mean = np.mean(mfcc.T, axis=0)
+         scaler = StandardScaler()
+         mfcc_scaled = scaler.fit_transform(mfcc_mean.reshape(-1, 1)).flatten()
+         return mfcc_scaled

+     def predict_emotion(self, audio_data, sr):
+         features = self.extract_features(audio_data, sr)
+         input_data = features.reshape(1, -1).astype(np.float32)
+         pred = self.model.run(None, {self.input_name: input_data})[0]
+         emotion_idx = np.argmax(pred)
+         return self.labels[emotion_idx]

+ # ------------------- Facial Emotion Recognition Model -------------------
+ class FacialEmotionRecognizer:
+     def __init__(self, model_path):
+         self.model = ort.InferenceSession(model_path)
+         self.input_name = self.model.get_inputs()[0].name
+         self.labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt']

+     def predict_emotion(self, face_img):
+         face_img = cv2.resize(face_img, (64, 64))
+         face_img = face_img.astype('float32') # FER+ expects float32 in [0,255]
+         face_img = np.expand_dims(face_img, axis=(0, 1)) # Shape: (1, 1, 64, 64)
+         pred = self.model.run(None, {self.input_name: face_img})[0]
+         emotion_idx = np.argmax(pred)
+         return self.labels[emotion_idx]

+ # ------------------- Utility Functions -------------------
+ def speak(text):
+     engine = pyttsx3.init()
+     engine.setProperty('rate', 150)
+     engine.say(text)
+     engine.runAndWait()

+ def record_audio(duration=3, fs=22050):
+     print("Recording audio...")
+     audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
+     sd.wait()
+     audio = audio.flatten()
+     print("Recording complete.")
+     return audio, fs

+ def analyze_face(face_roi, emotion_model):
+     emotion = emotion_model.predict_emotion(face_roi)
+     return emotion

+ # ------------------- Main Function -------------------
+ def main():
+     # Load models
+     face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
+     speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx") # Replace with your .onnx model

+     # Start webcam
+     cap = cv2.VideoCapture(0)
+     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

+     print("Press 's' to speak and 'q' to quit.")

+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             print("Failed to grab frame.")
+             break

+         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+         faces = face_cascade.detectMultiScale(gray, 1.3, 5)

+         for (x, y, w, h) in faces:
+             face_roi = gray[y:y+h, x:x+w]
+             emotion = analyze_face(face_roi, face_emotion_model)
+             label = f"Face: {emotion}"
+             cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
+             cv2.putText(frame, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)
+
+         cv2.imshow("Emotion Recognition", frame)
+         key = cv2.waitKey(1) & 0xFF
+
+         if key == ord('s'):
+             audio, sr = record_audio()
+             speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
+             print(f"Speech Emotion: {speech_emotion}")
+             speak(f"You sound {speech_emotion}")

+         elif key == ord('q'):
+             break
+
+     cap.release()
+     cv2.destroyAllWindows()

  if __name__ == "__main__":
+     main()