import cv2
import numpy as np
import pyttsx3
import onnxruntime as ort
import librosa
import sounddevice as sd


# ------------------- Speech Emotion Recognition Model -------------------
class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

    def extract_features(self, y, sr):
        # 40 MFCCs averaged over time yield a fixed-length feature vector.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mfcc_mean = np.mean(mfcc.T, axis=0)
        # Standardize to zero mean / unit variance (equivalent to fitting a
        # StandardScaler on this single vector). For a trained model, the
        # scaler statistics from training should be reused here instead of
        # per-utterance standardization.
        mfcc_scaled = (mfcc_mean - np.mean(mfcc_mean)) / (np.std(mfcc_mean) + 1e-8)
        return mfcc_scaled

    def predict_emotion(self, audio_data, sr):
        features = self.extract_features(audio_data, sr)
        input_data = features.reshape(1, -1).astype(np.float32)  # Shape: (1, 40)
        pred = self.model.run(None, {self.input_name: input_data})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]


# ------------------- Facial Emotion Recognition Model -------------------
class FacialEmotionRecognizer:
    def __init__(self, model_path):
        self.model = ort.InferenceSession(model_path)
        self.input_name = self.model.get_inputs()[0].name
        self.labels = ['neutral', 'happiness', 'surprise', 'sadness',
                       'anger', 'disgust', 'fear', 'contempt']

    def predict_emotion(self, face_img):
        # FER+ expects a 64x64 grayscale image as float32 in [0, 255].
        face_img = cv2.resize(face_img, (64, 64))
        face_img = face_img.astype('float32')
        face_img = np.expand_dims(face_img, axis=(0, 1))  # Shape: (1, 1, 64, 64)
        pred = self.model.run(None, {self.input_name: face_img})[0]
        emotion_idx = np.argmax(pred)
        return self.labels[emotion_idx]


# ------------------- Utility Functions -------------------
def speak(text):
    engine = pyttsx3.init()
    engine.setProperty('rate', 150)
    engine.say(text)
    engine.runAndWait()


def record_audio(duration=3, fs=22050):
    print("Recording audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()  # Block until the recording is finished.
    audio = audio.flatten()
    print("Recording complete.")
    return audio, fs


def analyze_face(face_roi, emotion_model):
    return emotion_model.predict_emotion(face_roi)


# ------------------- Main Function -------------------
def main():
    # Load models
    face_emotion_model = FacialEmotionRecognizer("emotion-ferplus-8.onnx")
    speech_emotion_model = SpeechEmotionRecognizer("speech_emotion_model.onnx")  # Replace with your .onnx model

    # Start webcam and load the Haar cascade face detector.
    cap = cv2.VideoCapture(0)
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    print("Press 's' to speak and 'q' to quit.")

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)

        for (x, y, w, h) in faces:
            face_roi = gray[y:y+h, x:x+w]
            emotion = analyze_face(face_roi, face_emotion_model)
            label = f"Face: {emotion}"
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, label, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

        cv2.imshow("Emotion Recognition", frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('s'):
            audio, sr = record_audio()
            speech_emotion = speech_emotion_model.predict_emotion(audio, sr)
            print(f"Speech Emotion: {speech_emotion}")
            speak(f"You sound {speech_emotion}")
        elif key == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
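

# ------------------- Optional: Saving Recordings (sketch) -------------------
# The original imports included tempfile and scipy.io.wavfile, suggesting an
# intent to persist recordings. This is a minimal sketch of that idea, not
# wired into main(): it writes a clip from record_audio() to a temporary WAV
# file for inspection or offline analysis. The int16 conversion and the
# NamedTemporaryFile usage are illustrative assumptions, not part of the
# pipeline above.
def save_recording(audio, sr):
    import tempfile
    import scipy.io.wavfile as wavfile

    # Convert float32 samples in [-1, 1] to int16 PCM, the most widely
    # supported WAV sample format.
    pcm = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    wavfile.write(tmp.name, sr, pcm)
    print(f"Saved recording to {tmp.name}")
    return tmp.name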