import cv2
import numpy as np
import pyttsx3
import onnxruntime as ort
import librosa
import sounddevice as sd


# ------------------- Speech Emotion Recognition Model -------------------
class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

    def extract_features(self, y, sr):
        # 40 MFCCs averaged over time yield a fixed-length feature vector.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mfcc_mean = np.mean(mfcc.T, axis=0)
        # Standardize to zero mean / unit variance (equivalent to fitting a
        # StandardScaler on this single vector). For a trained model, the
        # scaler statistics from training should be reused here instead of
        # per-utterance standardization.
        mfcc_scaled = (mfcc_mean - np.mean(mfcc_mean)) / (np.std(mfcc_mean) + 1e-8)
        return mfcc_scaled

    def predict_emotion(self, audio_data, sr):
        features = self.extract_features(audio_data, sr)
        input_data = features.reshape(1, -1).astype(np.float32)  # Shape: (1, 40)
        pred = self.model.run(None, {self.input_name: input_data})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]


# ------------------- Facial Emotion Recognition Model -------------------
class FacialEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['neutral', 'happiness', 'surprise', 'sadness',
                       'anger', 'disgust', 'fear', 'contempt']

    def predict_emotion(self, face_img):
        # FER+ expects a 64x64 grayscale image as float32 in [0, 255].
        face_img = cv2.resize(face_img, (64, 64))
        face_img = face_img.astype('float32')
        face_img = np.expand_dims(face_img, axis=(0, 1))  # Shape: (1, 1, 64, 64)
        pred = self.model.run(None, {self.input_name: face_img})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]


# ------------------- Utility Functions -------------------
def speak(text):
    engine = pyttsx3.init()
    engine.setProperty('rate', 150)
    engine.say(text)
    engine.runAndWait()


def record_audio(duration=3, fs=22050):
    print("Recording audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()  # Block until the recording is finished.
    audio = audio.flatten()
    print("Recording complete.")
    return audio, fs


def analyze_face(face_roi, emotion_model):
    return emotion_model.predict_emotion(face_roi)


# ------------------- Main Function -------------------
def main():
    # Load models
    face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")  # Replace with your .onnx model

    # Start webcam and load the Haar cascade face detector.
    cap = cv2.VideoCapture(0)
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    print("Press 's' to speak and 'q' to quit.")

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)

        for (x, y, w, h) in faces:
            face_roi = gray[y:y+h, x:x+w]
            emotion = analyze_face(face_roi, face_emotion_model)
            label = f"Face: {emotion}"
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, label, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

        cv2.imshow("Emotion Recognition", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('s'):
            audio, sr = record_audio()
            speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
            print(f"Speech Emotion: {speech_emotion}")
            speak(f"You sound {speech_emotion}")
        elif key == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
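

# ------------------- Optional: Saving Recordings (sketch) -------------------
# The original imports included tempfile and scipy.io.wavfile, suggesting an
# intent to persist recordings. This is a minimal sketch of that idea, not
# wired into main(): it writes a clip from record_audio() to a temporary WAV
# file for inspection or offline analysis. The int16 conversion and the
# NamedTemporaryFile usage are illustrative assumptions, not part of the
# pipeline above.
def save_recording(audio, sr):
    import tempfile
    import scipy.io.wavfile as wavfile

    # Convert float32 samples in [-1, 1] to int16 PCM, the most widely
    # supported WAV sample format.
    pcm = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    wavfile.write(tmp.name, sr, pcm)
    print(f"Saved recording to {tmp.name}")
    return tmp.name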