import joblib
import numpy as np
import librosa
import gradio as gr
from huggingface_hub import hf_hub_download
from deepface import DeepFace
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

print("Downloading SVM model from Hugging Face Hub...")
|
|
model_path = hf_hub_download(repo_id="GCLing/emotion-svm-model", filename="svm_emotion_model.joblib")
|
|
print(f"SVM model downloaded to: {model_path}")
|
|
svm_model = joblib.load(model_path)
|
|
print("SVM model loaded.")
print("Loading text sentiment model...")
|
|
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
|
|
model_txt = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
|
|
text_emotion = pipeline("sentiment-analysis", model=model_txt, tokenizer=tokenizer)
|
|
print("Text sentiment model loaded.")
def extract_feature(signal: np.ndarray, sr: int) -> np.ndarray:
    """
    Compute 13 MFCC coefficients from an audio signal (numpy array) at sample
    rate sr, and return a feature vector made of their per-coefficient mean
    and variance (26 dimensions in total).
    """
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    # Collapse the time axis: 13 means + 13 variances -> shape (26,)
    return np.concatenate([np.mean(mfcc, axis=1), np.var(mfcc, axis=1)])
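# For example (hypothetical input), one second of 16 kHz mono audio yields the
# same fixed-length vector as any other duration:
#   extract_feature(np.random.randn(16000).astype(np.float32), 16000).shape == (26,)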
def predict_face(img: np.ndarray):
    """
    Facial emotion analysis: run DeepFace on a single image (numpy array, HxWx3).
    The OpenCV detector backend is forced to avoid retinaface/tf version conflicts.
    Returns a dict such as {"happy": 0.80, "sad": 0.05, ...}.
    """
    result = DeepFace.analyze(img, actions=["emotion"], detector_backend="opencv")
    # Recent DeepFace versions return a list with one entry per detected face.
    if isinstance(result, list):
        result = result[0]
    # DeepFace reports emotion scores as percentages; rescale to 0-1 for gr.Label.
    return {label: float(score) / 100.0 for label, score in result["emotion"].items()}

def predict_voice(audio):
    """
    Speech emotion analysis: Gradio passes `audio` as a temp-file path (str).
    Load it with librosa.load, extract MFCC features, then score them with the
    SVM's predict_proba. Returns a dict such as {"angry": 0.1, "happy": 0.7, ...}.
    """
    if audio is None:
        return {}
    signal, sr = librosa.load(audio, sr=None)
    feat = extract_feature(signal, sr)
    probs = svm_model.predict_proba([feat])[0]
    labels = svm_model.classes_
    return {labels[i]: float(probs[i]) for i in range(len(labels))}
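# librosa.load(..., sr=None) keeps each file's native sample rate; if the SVM
# was trained on audio resampled to a fixed rate, pass that rate instead of None.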
def predict_text(text: str):
    """
    Text emotion analysis: run the transformers pipeline on a Chinese input
    string and return a dict of the predicted label and its confidence score,
    e.g. {"POSITIVE": 0.95} (the exact labels depend on the model).
    """
    if not text or text.strip() == "":
        return {}
    pred = text_emotion(text)[0]
    return {pred["label"]: float(pred["score"])}
def build_interface():
    """
    Build a TabbedInterface with three sub-interfaces:
    - facial emotion (webcam capture or upload)
    - speech emotion (microphone recording or audio-file upload)
    - text emotion (free-text input)
    """
    face_interface = gr.Interface(
        fn=predict_face,
        inputs=gr.Image(sources=["webcam"], streaming=True, type="numpy"),
        outputs=gr.Label(num_top_classes=1),
        live=True,
        title="Facial emotion (live webcam)",
        description="Grant webcam access and the current expression's emotion distribution is analyzed automatically."
    )

    voice_interface = gr.Interface(
        fn=predict_voice,
        inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
        outputs=gr.Label(num_top_classes=1),
        title="Speech emotion",
        description="Record speech or upload an audio file; the model returns probabilities for five emotions: surprise/anger/happiness/sadness/fear."
    )

    text_interface = gr.Interface(
        fn=predict_text,
        inputs=gr.Textbox(lines=3, placeholder="Enter Chinese text here..."),
        outputs=gr.Label(num_top_classes=1),
        title="Text emotion",
        description="Enter Chinese text to get the predicted emotion label and confidence score in real time."
    )

    app = gr.TabbedInterface(
        interface_list=[face_interface, voice_interface, text_interface],
        tab_names=["Facial emotion", "Speech emotion", "Text emotion"]
    )
    return app
if __name__ == "__main__":
    demo = build_interface()
    demo.launch(server_name="0.0.0.0", server_port=7861, share=True)
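    # share=True requests a temporary public *.gradio.live URL in addition to
    # the local server on port 7861.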