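"""Real-time multimodal emotion recognition demo.

Detects faces from the webcam and labels each with an emotion predicted by the
FER+ ONNX model; on demand it also records a short microphone clip, classifies
the speaker's emotion with a separate ONNX speech model, and speaks the result
back via text-to-speech.

Assumes "emotion-ferplus-8.onnx" and "speech_emotion_model.onnx" are present in
the working directory, and that opencv-python, numpy, pyttsx3, onnxruntime,
librosa, sounddevice, and scikit-learn are installed.

Controls: press 's' to record speech, 'q' to quit.
"""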
import cv2
import numpy as np
import pyttsx3
import onnxruntime as ort
import librosa
import sounddevice as sd
from sklearn.preprocessing import StandardScaler


class SpeechEmotionRecognizer:
    """Classifies emotion from a mono audio clip using an ONNX speech model."""

    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

    def extract_features(self, y, sr):
        # Compute 40 MFCCs and average them over time to get a fixed-length vector.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mfcc_mean = np.mean(mfcc, axis=1)
        # Standardize the 40 coefficients. Note that the scaler is refit on every
        # clip; ideally it should reuse the statistics from the model's training data.
        scaler = StandardScaler()
        mfcc_scaled = scaler.fit_transform(mfcc_mean.reshape(-1, 1)).flatten()
        return mfcc_scaled

    def predict_emotion(self, audio_data, sr):
        features = self.extract_features(audio_data, sr)
        input_data = features.reshape(1, -1).astype(np.float32)
        pred = self.model.run(None, {self.input_name: input_data})[0]
        emotion_idx = int(np.argmax(pred))
        return self.labels[emotion_idx]


class FacialEmotionRecognizer:
    """Classifies emotion from a grayscale face crop using the FER+ ONNX model."""

    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt']

    def predict_emotion(self, face_img):
        # FER+ expects a 1x1x64x64 float32 grayscale tensor.
        face_img = cv2.resize(face_img, (64, 64))
        face_img = face_img.astype('float32')
        face_img = np.expand_dims(face_img, axis=(0, 1))
        pred = self.model.run(None, {self.input_name: face_img})[0]
        emotion_idx = int(np.argmax(pred))
        return self.labels[emotion_idx]


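# --- Helper utilities: text-to-speech output, audio capture, and face analysis ---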
def speak(text):
    """Speak the given text aloud using the local text-to-speech engine."""
    engine = pyttsx3.init()
    engine.setProperty('rate', 150)
    engine.say(text)
    engine.runAndWait()


def record_audio(duration=3, fs=22050):
    """Record a mono clip from the default microphone and return (samples, rate)."""
    print("Recording audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()
    audio = audio.flatten()
    print("Recording complete.")
    return audio, fs


def analyze_face(face_roi, emotion_model):
    """Run facial emotion prediction on a cropped face region."""
    return emotion_model.predict_emotion(face_roi)


def main():
    # Load the two emotion models (ONNX files expected in the working directory).
    face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")

    # Open the default webcam and load OpenCV's bundled Haar face detector.
    cap = cv2.VideoCapture(0)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    print("Press 's' to speak and 'q' to quit.")
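    # Main loop: detect faces in each frame, draw the predicted emotion, and
    # watch for hotkeys ('s' records a speech clip, 'q' quits).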
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        # Face detection runs on the grayscale frame.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)

        for (x, y, w, h) in faces:
            face_roi = gray[y:y+h, x:x+w]
            emotion = analyze_face(face_roi, face_emotion_model)
            label = f"Face: {emotion}"
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

        cv2.imshow("Emotion Recognition", frame)
        key = cv2.waitKey(1) & 0xFF

        if key == ord('s'):
            # Record a short clip and classify the speaker's emotion.
            audio, sr = record_audio()
            speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
            print(f"Speech Emotion: {speech_emotion}")
            speak(f"You sound {speech_emotion}")
        elif key == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
if __name__ == "__main__":
    main()