Spaces:

yunusajib
/

Real-Time-Emotional-Detection

Sleeping

App Files Files Community

yunusajib committed on Jun 11

Commit

9d66904

verified ·

1 Parent(s): 0513e21

update app

Browse files

Files changed (1) hide show

app.py +21 -23

app.py CHANGED Viewed

@@ -4,13 +4,13 @@ import pyttsx3
 import onnxruntime as ort
 import librosa
 import sounddevice as sd
-import tempfile
 import scipy.io.wavfile as wavfile
 from sklearn.preprocessing import StandardScaler
 import time
 import os
 from gtts import gTTS
 import gradio as gr
 # ------------------- Speech Emotion Recognition Model -------------------
 class SpeechEmotionRecognizer:
@@ -19,11 +19,14 @@ class SpeechEmotionRecognizer:
         self.input_name = self.model.get_inputs()[0].name
         self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
     def extract_features(self, y, sr):
         mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
         mfcc_mean = np.mean(mfcc.T, axis=0)
-        scaler = StandardScaler()
-        mfcc_scaled = scaler.fit_transform(mfcc_mean.reshape(-1, 1)).flatten()
         return mfcc_scaled
     def predict_emotion(self, audio_data, sr):
@@ -42,30 +45,23 @@ class FacialEmotionRecognizer:
     def predict_emotion(self, face_img):
         face_img = cv2.resize(face_img, (64, 64))
-        face_img = face_img.astype('float32')  # FER+ expects float32 in [0,255]
-        face_img = np.expand_dims(face_img, axis=(0, 1))  # Shape: (1, 1, 64, 64)
         pred = self.model.run(None, {self.input_name: face_img})[0]
         emotion_idx = np.argmax(pred)
         return self.labels[emotion_idx]
 # ------------------- Utility Functions -------------------
 def speak(text):
     if not text.strip():
         return None
-    tts = gTTS(text)
-    tts.save("output.mp3")
-    return "output.mp3"
-iface = gr.Interface(
-    fn=speak,
-    inputs=gr.Textbox(lines=2, label="Enter text"),
-    outputs=gr.Audio(type="filepath", label="Speech Output"),
-    title="Text to Speech"
-)
-iface.launch()
 def record_audio(duration=3, fs=22050):
     print("Recording audio...")
@@ -80,12 +76,11 @@ def analyze_face(face_roi, emotion_model):
     return emotion
 # ------------------- Main Function -------------------
 def main():
-    # Load models
     face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
-    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")  # Replace with your .onnx model
-    # Start webcam
     cap = cv2.VideoCapture(0)
     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
@@ -114,8 +109,11 @@ def main():
             audio, sr = record_audio()
             speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
             print(f"Speech Emotion: {speech_emotion}")
-            speak(f"You sound {speech_emotion}")
         elif key == ord('q'):
             break

 import onnxruntime as ort
 import librosa
 import sounddevice as sd
 import scipy.io.wavfile as wavfile
 from sklearn.preprocessing import StandardScaler
 import time
 import os
 from gtts import gTTS
 import gradio as gr
+import tempfile
 # ------------------- Speech Emotion Recognition Model -------------------
 class SpeechEmotionRecognizer:
         self.input_name = self.model.get_inputs()[0].name
         self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
+        # Load or create scaler here (fit on training data offline, then load)
+        self.scaler = StandardScaler()
     def extract_features(self, y, sr):
         """Return a standardized, time-averaged MFCC vector for one clip.

         Args:
             y: Audio samples, passed unchanged to ``librosa.feature.mfcc``.
             sr: Sampling rate of ``y``, passed unchanged to librosa.

         Returns:
             The per-coefficient MFCC means (``n_mfcc=40`` coefficients are
             requested), shifted to zero mean and, when possible, scaled to
             unit standard deviation.
         """
         mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
         # Collapse the time axis: one mean value per coefficient.
         mfcc_mean = np.mean(mfcc.T, axis=0)

         # NOTE: this normalizes each clip by its OWN statistics. A scaler
         # fitted offline on the training data would normally be applied
         # here instead.
         centered = mfcc_mean - np.mean(mfcc_mean)
         spread = np.std(mfcc_mean)
         if spread == 0:
             # All coefficient means are identical; dividing would yield
             # NaN/inf features. The centered vector is already all zeros.
             return centered
         return centered / spread
     def predict_emotion(self, audio_data, sr):
     def predict_emotion(self, face_img):
         """Classify the emotion shown in a face crop.

         The crop is resized to 64x64, cast to float32 and given two leading
         singleton axes before it is fed to the ONNX session. The label at
         the arg-max of the first model output is returned.
         """
         resized = cv2.resize(face_img, (64, 64))
         # FER+ expects float32 input shaped (1, 1, 64, 64); for a
         # single-channel crop the two new leading axes give exactly that.
         batch = resized.astype(np.float32)[np.newaxis, np.newaxis, ...]
         outputs = self.model.run(None, {self.input_name: batch})
         best = np.argmax(outputs[0])
         return self.labels[best]
 # ------------------- Utility Functions -------------------
 def speak(text):
     """Synthesize ``text`` to an MP3 file and return the file's path.

     Args:
         text: The sentence to speak. ``None``, empty, or whitespace-only
             input produces no audio.

     Returns:
         Path of a newly created ``.mp3`` temporary file, or ``None`` when
         there is nothing to say. The file is created with ``delete=False``,
         so the caller is responsible for removing it after use.

     Raises:
         Whatever ``gTTS`` raises when synthesis or saving fails; the
         temporary file is removed before the exception propagates.
     """
     if not text or not text.strip():
         return None

     # Reserve a unique path, then close the handle BEFORE gTTS writes to
     # it: reopening a still-open NamedTemporaryFile by name is not
     # portable (it fails on Windows).
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmpfile:
         mp3_path = tmpfile.name

     try:
         gTTS(text).save(mp3_path)
     except Exception:
         # delete=False means nothing else cleans this up; do not leave an
         # empty orphan behind when synthesis fails.
         if os.path.exists(mp3_path):
             os.remove(mp3_path)
         raise
     return mp3_path
 def record_audio(duration=3, fs=22050):
     print("Recording audio...")
     return emotion
 # ------------------- Main Function -------------------
 def main():
     face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
+    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")
     cap = cv2.VideoCapture(0)
     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
             audio, sr = record_audio()
             speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
             print(f"Speech Emotion: {speech_emotion}")
+            audio_file = speak(f"You sound {speech_emotion}")
+            if audio_file:
+                # Play the TTS audio using cv2 or other player if needed
+                pass
         elif key == ord('q'):
             break