import cv2
import numpy as np
import onnxruntime as ort
import librosa
import sounddevice as sd
from sklearn.preprocessing import StandardScaler
from gtts import gTTS
import tempfile

# ------------------- Speech Emotion Recognition Model -------------------
class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

        # Placeholder scaler: fit it on the training data offline, persist it,
        # and load the fitted scaler here so inference uses the same statistics.
        self.scaler = StandardScaler()
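        # A minimal sketch of loading a scaler fitted offline (the filename
        # "mfcc_scaler.pkl" is a hypothetical example, not part of this repo).
        # extract_features would then call self.scaler.transform(...) instead
        # of its manual per-utterance normalisation.
        try:
            import joblib  # optional dependency, imported locally
            self.scaler = joblib.load("mfcc_scaler.pkl")
        except Exception:
            pass  # fall back to the unfitted placeholder above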

    def extract_features(self, y, sr):
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mfcc_mean = np.mean(mfcc.T, axis=0)
        # The scaler would normally be pre-fitted; here we simply normalise the
        # 40-dim MFCC vector to zero mean and unit variance (the epsilon guards
        # against division by zero on silent input).
        mfcc_scaled = (mfcc_mean - np.mean(mfcc_mean)) / (np.std(mfcc_mean) + 1e-8)
        return mfcc_scaled

    def predict_emotion(self, audio_data, sr):
        features = self.extract_features(audio_data, sr)
        input_data = features.reshape(1, -1).astype(np.float32)
        pred = self.model.run(None, {self.input_name: input_data})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]

# ------------------- Facial Emotion Recognition Model -------------------
class FacialEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt']

    def predict_emotion(self, face_img):
        # face_img is expected to be a single-channel grayscale crop (see main()).
        face_img = cv2.resize(face_img, (64, 64))
        face_img = face_img.astype('float32')  # FER+ expects float32
        # FER+ model expects input shape (1, 1, 64, 64)
        face_img = np.expand_dims(face_img, axis=0)  # (1, 64, 64)
        face_img = np.expand_dims(face_img, axis=0)  # (1, 1, 64, 64)
        pred = self.model.run(None, {self.input_name: face_img})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]

# ------------------- Utility Functions -------------------

def speak(text):
    if not text.strip():
        return None
    # Create the temp file and close it before gTTS writes to its path; keeping
    # the handle open while saving can fail on Windows.
    tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmpfile.close()
    gTTS(text).save(tmpfile.name)
    return tmpfile.name
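
# Optional helper (an addition, not part of the original script): plays the MP3
# returned by speak() so the spoken feedback is audible. Assumes the third-party
# "playsound" package is installed (pip install playsound); substitute any other
# audio player if preferred.
def play_audio(path):
    try:
        from playsound import playsound  # optional dependency, imported locally
        playsound(path)
    except Exception as exc:
        print(f"Could not play audio: {exc}")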

def record_audio(duration=3, fs=22050):
    print("Recording audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()
    audio = audio.flatten()
    print("Recording complete.")
    return audio, fs

def analyze_face(face_roi, emotion_model):
    emotion = emotion_model.predict_emotion(face_roi)
    return emotion

# ------------------- Main Function -------------------

def main():
    face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")
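    # emotion-ferplus-8.onnx is the FER+ facial-emotion model (available from
    # the ONNX Model Zoo); speech_emotion_model.onnx is assumed to be a model
    # trained offline on 40-dim MFCC features. Both files must be present in
    # the working directory, since the paths above are relative.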

    cap = cv2.VideoCapture(0)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    print("Press 's' to speak and 'q' to quit.")
    
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        for (x, y, w, h) in faces:
            face_roi = gray[y:y+h, x:x+w]
            emotion = analyze_face(face_roi, face_emotion_model)
            label = f"Face: {emotion}"
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

        cv2.imshow("Emotion Recognition", frame)
        key = cv2.waitKey(1) & 0xFF

        if key == ord('s'):
            audio, sr = record_audio()
            speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
            print(f"Speech Emotion: {speech_emotion}")
            audio_file = speak(f"You sound {speech_emotion}")
            if audio_file:
                # Play the spoken feedback via the optional play_audio helper above.
                play_audio(audio_file)

        elif key == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()