import joblib
import numpy as np
import librosa
import gradio as gr
from huggingface_hub import hf_hub_download
from deepface import DeepFace
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
print("Downloading SVM model from Hugging Face Hub...") |
|
model_path = hf_hub_download(repo_id="GCLing/emotion-svm-model", filename="svm_emotion_model.joblib") |
|
print(f"SVM model downloaded to: {model_path}") |
|
svm_model = joblib.load(model_path) |
|
print("SVM model loaded.") |

# Load the Chinese text model and wrap it in a sentiment-analysis pipeline.
print("Loading text sentiment model...")
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
model_txt = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
text_emotion = pipeline("sentiment-analysis", model=model_txt, tokenizer=tokenizer)
print("Text sentiment model loaded.")


def extract_feature(signal: np.ndarray, sr: int) -> np.ndarray:
    """Compute 13 MFCCs from an audio signal (numpy array) at sample rate sr
    and return the per-coefficient means and variances concatenated into a
    single feature vector (26 dimensions in total).
    """
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    return np.concatenate([np.mean(mfcc, axis=1), np.var(mfcc, axis=1)])
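# Shape sanity check (synthetic 440 Hz tone, not real speech; the 16 kHz rate
# is an arbitrary choice for illustration):
#   sig = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
#   extract_feature(sig, 16000).shape  # -> (26,)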


def predict_face(img: np.ndarray):
    """Facial emotion analysis on a single webcam frame; returns {} on failure."""
    print("predict_face called, img is None?", img is None)
    if img is None:  # streaming can deliver empty frames
        return {}
    try:
        # enforce_detection=False keeps the stream alive on frames with no face.
        result = DeepFace.analyze(img, actions=["emotion"], detector_backend="opencv", enforce_detection=False)
        # Recent DeepFace versions return a list of per-face dicts; older ones
        # return a single dict. Use the first detected face either way.
        if isinstance(result, list):
            result = result[0]
        emo = result.get("emotion", {})
        print("DeepFace result:", emo)
        # Cast numpy floats so gr.Label can serialize the scores.
        return {k: float(v) for k, v in emo.items()}
    except Exception as e:
        print("DeepFace.analyze error:", e)
        return {}
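# Note: DeepFace reports emotion scores as percentages (summing to roughly 100),
# so the Label component displays raw percentages rather than 0-1 probabilities.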


def predict_voice(audio):
    """Speech emotion analysis.

    Gradio passes `audio` in as a temp-file path (str). Load it with
    librosa.load, extract MFCC features, then score with the SVM's
    predict_proba. Returns a dict such as {"angry": 0.1, "happy": 0.7, ...}.
    """
    if audio is None:  # the change event also fires when the recording is cleared
        return {}
    signal, sr = librosa.load(audio, sr=None)
    feat = extract_feature(signal, sr)
    probs = svm_model.predict_proba([feat])[0]
    return {label: float(p) for label, p in zip(svm_model.classes_, probs)}
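# Example call (hypothetical file path):
#   predict_voice("/tmp/clip.wav") -> {"angry": 0.05, "happy": 0.82, ...}
# with keys taken from svm_model.classes_.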


def predict_text(text: str):
    """Chinese text sentiment analysis; returns a label -> score dict."""
    print("predict_text called, text:", text)
    if not text or not text.strip():
        return {}
    try:
        pred = text_emotion(text)[0]
        result = {pred["label"]: float(pred["score"])}
        print("Text sentiment result:", result)
        return result
    except Exception as e:
        print("predict_text error:", e)
        return {}


with gr.Blocks() as demo:
    gr.Markdown("## Multimodal Real-Time Emotion Analysis")
    with gr.Tabs():

        with gr.TabItem("Facial Emotion"):
            gr.Markdown("### Facial emotion (live webcam streaming analysis)")
            with gr.Row():
                # Gradio 4.x takes `sources` as a list of input modes.
                webcam = gr.Image(sources=["webcam"], streaming=True, type="numpy", label="Webcam feed")
                emotion_output = gr.Label(label="Emotion distribution")
            # Re-run the face analysis on every streamed frame.
            webcam.stream(fn=predict_face, inputs=webcam, outputs=emotion_output)

        with gr.TabItem("Voice Emotion"):
            audio = gr.Audio(sources=["microphone"], streaming=False, type="filepath", label="Recording")
            audio_output = gr.Label(label="Voice emotion result")
            # Fires once the recording is finalized (and when it is cleared).
            audio.change(fn=predict_voice, inputs=audio, outputs=audio_output)

        with gr.TabItem("Text Emotion"):
            text = gr.Textbox(lines=3, placeholder="Enter Chinese text…")
            text_output = gr.Label(label="Text emotion result")
            btn = gr.Button("Analyze Text")
            btn.click(fn=predict_text, inputs=text, outputs=text_output)


if __name__ == "__main__":
    demo.launch()