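"""Real-time multimodal emotion recognition demo.

Captures webcam frames, detects faces with an OpenCV Haar cascade, and
classifies each face with the FER+ ONNX model. Pressing 's' records a short
microphone clip, classifies the speaker's emotion with a speech-emotion ONNX
model fed 40 mean MFCC features, and speaks the result back via gTTS.
Press 'q' to quit.
"""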
import tempfile

import cv2
import numpy as np
import onnxruntime as ort
import librosa
import sounddevice as sd
from sklearn.preprocessing import StandardScaler
from gtts import gTTS

# ------------------- Speech Emotion Recognition Model -------------------
class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
        # Placeholder: in production, fit a StandardScaler on the training
        # features offline and load the fitted scaler here.
        self.scaler = StandardScaler()

    def extract_features(self, y, sr):
        # 40 MFCCs averaged over time -> a single 40-dimensional feature vector.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mfcc_mean = np.mean(mfcc.T, axis=0)
        # The scaler above should normally be pre-fitted; as a stand-in,
        # normalize the vector to zero mean and unit variance.
        mfcc_scaled = (mfcc_mean - np.mean(mfcc_mean)) / np.std(mfcc_mean)
        return mfcc_scaled

    def predict_emotion(self, audio_data, sr):
        features = self.extract_features(audio_data, sr)
        input_data = features.reshape(1, -1).astype(np.float32)
        pred = self.model.run(None, {self.input_name: input_data})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]

# ------------------- Facial Emotion Recognition Model -------------------
class FacialEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt']

    def predict_emotion(self, face_img):
        # FER+ expects a 64x64 grayscale face as float32, shaped (1, 1, 64, 64).
        face_img = cv2.resize(face_img, (64, 64))
        face_img = face_img.astype('float32')
        face_img = np.expand_dims(face_img, axis=0)  # (1, 64, 64)
        face_img = np.expand_dims(face_img, axis=0)  # (1, 1, 64, 64)
        pred = self.model.run(None, {self.input_name: face_img})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]

# ------------------- Utility Functions -------------------
def speak(text):
    """Synthesize `text` to a temporary MP3 file with gTTS and return its path."""
    if not text.strip():
        return None
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmpfile:
        tts = gTTS(text)
        tts.save(tmpfile.name)
    return tmpfile.name


def record_audio(duration=3, fs=22050):
    """Record `duration` seconds of mono audio from the default microphone."""
    print("Recording audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()
    audio = audio.flatten()
    print("Recording complete.")
    return audio, fs


def analyze_face(face_roi, emotion_model):
    """Classify the emotion in a grayscale face region."""
    return emotion_model.predict_emotion(face_roi)

# ------------------- Main Function -------------------
def main():
    face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")

    cap = cv2.VideoCapture(0)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    print("Press 's' to speak and 'q' to quit.")

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        for (x, y, w, h) in faces:
            face_roi = gray[y:y+h, x:x+w]
            emotion = analyze_face(face_roi, face_emotion_model)
            label = f"Face: {emotion}"
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

        cv2.imshow("Emotion Recognition", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('s'):
            audio, sr = record_audio()
            speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
            print(f"Speech Emotion: {speech_emotion}")
            audio_file = speak(f"You sound {speech_emotion}")
            if audio_file:
                # Play the saved TTS file with an external audio player if needed.
                pass
        elif key == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()