import gradio as gr
import numpy as np
import cv2
import pandas as pd
from datetime import datetime
import time
import librosa
from python_speech_features import mfcc
import onnxruntime as ort
import requests
import os
from sklearn.preprocessing import StandardScaler
import joblib

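# Download the pretrained FER+ facial-emotion ONNX model on first run and cache it locally.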
MODEL_URL = "https://github.com/onnx/models/raw/main/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-8.onnx"
MODEL_PATH = "emotion-ferplus-8.onnx"

if not os.path.exists(MODEL_PATH):
    print("Downloading emotion recognition model...")
    response = requests.get(MODEL_URL)
    response.raise_for_status()  # fail loudly rather than saving an error page as the model file
    with open(MODEL_PATH, "wb") as f:
        f.write(response.content)

emotion_session = ort.InferenceSession(MODEL_PATH)
emotion_labels = ['neutral', 'happy', 'surprise', 'sad', 'angry', 'disgust', 'fear', 'contempt']


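# Lightweight voice-emotion stand-in: it summarizes MFCC features and applies crude
# heuristic thresholds rather than a trained classifier.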
class VoiceEmotionClassifier:
    def __init__(self):
        self.scaler = StandardScaler()  # placeholder; never fitted on training data

    def extract_features(self, audio):
        sr, y = audio
        y = y.astype(np.float32)

        # Gradio supplies stereo audio as (samples, channels); average across
        # channels to obtain a mono signal.
        if len(y.shape) > 1:
            y = np.mean(y, axis=1)

        if sr != 16000:
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
            sr = 16000

        mfcc_features = mfcc(y, sr, numcep=13)
        return np.mean(mfcc_features, axis=0)

    def predict(self, audio):
        try:
            features = self.extract_features(audio).reshape(1, -1)
            # The scaler is never fitted, so calling transform() here would raise
            # NotFittedError; use the raw MFCC means with heuristic thresholds instead.
            if features[0, 0] > 0.5:
                return "happy", [{"label": "happy", "score": 0.8}]
            elif features[0, 0] < -0.5:
                return "sad", [{"label": "sad", "score": 0.7}]
            else:
                return "neutral", [{"label": "neutral", "score": 0.9}]
        except Exception as e:
            print(f"Voice analysis error: {e}")
            return "neutral", [{"label": "neutral", "score": 1.0}]


voice_classifier = VoiceEmotionClassifier()

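# Shared state for the app: latest readings, a rolling history, and a throttle timestamp.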
emotion_history = []
current_emotions = {"face": "neutral", "voice": "neutral"}
last_update_time = time.time()


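# Face pipeline: a Haar cascade finds the face, the FER+ model classifies the cropped region.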
def analyze_face(frame):
    """Analyze facial expressions in the frame using the FER+ ONNX model"""
    try:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        if len(faces) > 0:
            # Crop the first detected face and match the model's expected input:
            # a (1, 1, 64, 64) float32 tensor of raw grayscale values (0-255, not rescaled).
            x, y, w, h = faces[0]
            face_roi = gray[y:y+h, x:x+w]
            face_roi = cv2.resize(face_roi, (64, 64))
            face_roi = face_roi.astype('float32')
            face_roi = np.expand_dims(face_roi, axis=0)
            face_roi = np.expand_dims(face_roi, axis=0)

            input_name = emotion_session.get_inputs()[0].name
            output_name = emotion_session.get_outputs()[0].name
            results = emotion_session.run([output_name], {input_name: face_roi})[0]

            # The model outputs raw scores; apply softmax to turn them into probabilities.
            scores = results[0]
            exp_scores = np.exp(scores - np.max(scores))
            emotion_probs = exp_scores / exp_scores.sum()
            dominant_emotion = emotion_labels[np.argmax(emotion_probs)]

            emotions = {label: float(prob) for label, prob in zip(emotion_labels, emotion_probs)}
            return dominant_emotion, emotions

        return "neutral", {label: 0.0 for label in emotion_labels}
    except Exception as e:
        print(f"Face analysis error: {e}")
        return "neutral", {label: 0.0 for label in emotion_labels}


def analyze_voice(audio):
    """Analyze voice tone from audio"""
    return voice_classifier.predict(audio)


def update_emotion_history(face_emotion, voice_emotion):
    """Update the emotion history and current emotions"""
    global current_emotions, emotion_history, last_update_time

    current_time = datetime.now().strftime("%H:%M:%S")

    current_emotions = {
        "face": face_emotion,
        "voice": voice_emotion,
        "timestamp": current_time
    }

    # Only append to the history every 5 seconds so the timeline stays readable.
    if (time.time() - last_update_time) > 5 or not emotion_history:
        emotion_history.append({
            "timestamp": current_time,
            "face": face_emotion,
            "voice": voice_emotion
        })
        last_update_time = time.time()

    # Keep only the 20 most recent entries.
    if len(emotion_history) > 20:
        emotion_history = emotion_history[-20:]


def get_emotion_timeline():
    """Create a timeline DataFrame for display"""
    if not emotion_history:
        return pd.DataFrame(columns=["Time", "Facial Emotion", "Voice Emotion"])

    df = pd.DataFrame(emotion_history)
    df = df.rename(columns={
        "timestamp": "Time",
        "face": "Facial Emotion",
        "voice": "Voice Emotion"
    })
    return df


def get_practitioner_advice(face_emotion, voice_emotion):
    """Generate suggestions based on detected emotions"""
    advice = []

    if face_emotion in ["sad", "fear"]:
        advice.append("Patient appears distressed. Consider speaking more slowly and with reassurance.")
    elif face_emotion == "angry":
        advice.append("Patient seems frustrated. Acknowledge their concerns and maintain a calm demeanor.")
    elif face_emotion == "disgust":
        advice.append("Patient may be uncomfortable. Check if they're experiencing any discomfort.")
    elif face_emotion == "surprise":
        advice.append("Patient seems surprised. Ensure they understand all information.")

    if voice_emotion in ["sad", "fear"]:
        advice.append("Patient's tone suggests anxiety. Provide clear explanations and emotional support.")
    elif voice_emotion == "angry":
        advice.append("Patient sounds upset. Practice active listening and validate their feelings.")
    elif voice_emotion == "happy":
        advice.append("Patient seems positive. This may be a good time to discuss treatment options.")

    return "\n".join(advice) if advice else "Patient appears neutral. Continue with consultation."


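# Combined entry point: analyze the current frame and audio clip, then update shared state.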
def process_input(video, audio):
    """Process video and audio inputs to detect emotions"""
    try:
        if video is not None:
            # Gradio delivers frames as RGB; convert to BGR for OpenCV processing.
            frame = cv2.cvtColor(video, cv2.COLOR_RGB2BGR)
            face_emotion, face_details = analyze_face(frame)
        else:
            face_emotion, face_details = "neutral", {}

        if audio is not None:
            voice_emotion, voice_details = analyze_voice(audio)
        else:
            voice_emotion, voice_details = "neutral", {}

        update_emotion_history(face_emotion, voice_emotion)