yunusajib committed on
Commit 9d66904 · verified · 1 Parent(s): 0513e21

update app

Files changed (1)
  1. app.py +21 -23
app.py CHANGED
@@ -4,13 +4,13 @@ import pyttsx3
 import onnxruntime as ort
 import librosa
 import sounddevice as sd
-import tempfile
 import scipy.io.wavfile as wavfile
 from sklearn.preprocessing import StandardScaler
 import time
 import os
 from gtts import gTTS
 import gradio as gr
+import tempfile
 
 # ------------------- Speech Emotion Recognition Model -------------------
 class SpeechEmotionRecognizer:
@@ -19,11 +19,14 @@ class SpeechEmotionRecognizer:
         self.input_name = self.model.get_inputs()[0].name
         self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
 
+        # Load or create scaler here (fit on training data offline, then load)
+        self.scaler = StandardScaler()
+
     def extract_features(self, y, sr):
         mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
         mfcc_mean = np.mean(mfcc.T, axis=0)
-        scaler = StandardScaler()
-        mfcc_scaled = scaler.fit_transform(mfcc_mean.reshape(-1, 1)).flatten()
+        # Normally, scaler should be pre-fitted, here we just scale manually to zero mean, unit var
+        mfcc_scaled = (mfcc_mean - np.mean(mfcc_mean)) / np.std(mfcc_mean)
         return mfcc_scaled
 
     def predict_emotion(self, audio_data, sr):
@@ -42,30 +45,23 @@ class FacialEmotionRecognizer:
 
     def predict_emotion(self, face_img):
         face_img = cv2.resize(face_img, (64, 64))
-        face_img = face_img.astype('float32')  # FER+ expects float32 in [0,255]
-        face_img = np.expand_dims(face_img, axis=(0, 1))  # Shape: (1, 1, 64, 64)
+        face_img = face_img.astype('float32')  # FER+ expects float32
+        # FER+ model expects input shape (1, 1, 64, 64)
+        face_img = np.expand_dims(face_img, axis=0)  # (1, 64, 64)
+        face_img = np.expand_dims(face_img, axis=0)  # (1, 1, 64, 64)
         pred = self.model.run(None, {self.input_name: face_img})[0]
         emotion_idx = np.argmax(pred)
         return self.labels[emotion_idx]
 
 # ------------------- Utility Functions -------------------
 
-
 def speak(text):
     if not text.strip():
         return None
-    tts = gTTS(text)
-    tts.save("output.mp3")
-    return "output.mp3"
-
-iface = gr.Interface(
-    fn=speak,
-    inputs=gr.Textbox(lines=2, label="Enter text"),
-    outputs=gr.Audio(type="filepath", label="Speech Output"),
-    title="Text to Speech"
-)
-
-iface.launch()
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmpfile:
+        tts = gTTS(text)
+        tts.save(tmpfile.name)
+        return tmpfile.name
 
 def record_audio(duration=3, fs=22050):
     print("Recording audio...")
@@ -80,12 +76,11 @@ def analyze_face(face_roi, emotion_model):
     return emotion
 
 # ------------------- Main Function -------------------
+
 def main():
-    # Load models
     face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
-    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")  # Replace with your .onnx model
+    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")
 
-    # Start webcam
     cap = cv2.VideoCapture(0)
     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
 
@@ -114,8 +109,11 @@ def main():
             audio, sr = record_audio()
             speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
             print(f"Speech Emotion: {speech_emotion}")
-            speak(f"You sound {speech_emotion}")
-
+            audio_file = speak(f"You sound {speech_emotion}")
+            if audio_file:
+                # Play the TTS audio using cv2 or other player if needed
+                pass
+
         elif key == ord('q'):
             break
 
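
The constructor now carries the comment "fit on training data offline, then load", since fitting a StandardScaler on a single 40-dim MFCC vector at inference time is statistically meaningless. A minimal sketch of that offline step, assuming a joblib dependency and a hypothetical train_mfcc_means.npy file of training MFCC means (neither is part of this commit):

    # Hypothetical offline step (not in this commit): fit a StandardScaler on
    # training MFCC features and persist it so the app can load the same
    # statistics at inference time.
    import numpy as np
    import joblib
    from sklearn.preprocessing import StandardScaler

    # Assumed file: (n_samples, 40) array of per-clip MFCC means from the training set
    training_features = np.load("train_mfcc_means.npy")

    scaler = StandardScaler().fit(training_features)
    joblib.dump(scaler, "scaler.pkl")

    # SpeechEmotionRecognizer.__init__ could then use:
    #     self.scaler = joblib.load("scaler.pkl")
    # and extract_features:
    #     mfcc_scaled = self.scaler.transform(mfcc_mean.reshape(1, -1)).flatten()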
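
The new speak() saves the gTTS output into a NamedTemporaryFile that is still open; on Linux this works, but on Windows reopening an open temporary file for writing can raise a permission error. A more portable variant, shown only as a sketch rather than what the commit does, closes the descriptor from tempfile.mkstemp before saving:

    import os
    import tempfile
    from gtts import gTTS

    def speak(text):
        if not text.strip():
            return None
        # mkstemp returns an open OS-level descriptor plus a path; closing the
        # descriptor before gTTS writes avoids the Windows file-locking issue.
        fd, path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        gTTS(text).save(path)
        return path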
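
The playback branch in main() is left as a pass placeholder. One way to fill it using libraries the app already imports is to decode the MP3 with librosa and play it through sounddevice; this is only a sketch and assumes an audioread/ffmpeg backend is available for MP3 decoding:

    import librosa
    import sounddevice as sd

    def play_audio_file(path):
        # librosa decodes MP3 via its audioread/ffmpeg fallback, so this
        # assumes such a backend is installed on the system.
        y, sr = librosa.load(path, sr=None)
        sd.play(y, sr)
        sd.wait()  # block until playback finishes

With that helper, play_audio_file(audio_file) would stand in for the pass line inside the if audio_file: block.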