#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import joblib
import numpy as np
import librosa
import gradio as gr
from huggingface_hub import hf_hub_download
from deepface import DeepFace
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# --- 1. Download and load the SVM model ---
# repo_id is the path of your model repository, e.g. "GCLing/emotion-svm-model";
# filename is the file uploaded to that repo, e.g. "svm_emotion_model.joblib".
print("Downloading SVM model from Hugging Face Hub...")
model_path = hf_hub_download(repo_id="GCLing/emotion-svm-model", filename="svm_emotion_model.joblib")
print(f"SVM model downloaded to: {model_path}")
svm_model = joblib.load(model_path)
print("SVM model loaded.")

# --- 2. Load the text sentiment model ---
# uer/roberta-base-finetuned-chinanews-chinese is used as an example; it can
# be swapped for any other suitable Chinese sentiment-classification model.
print("Loading text sentiment model...")
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
model_txt = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
text_emotion = pipeline("sentiment-analysis", model=model_txt, tokenizer=tokenizer)
print("Text sentiment model loaded.")

# --- 3. Audio feature extraction ---
def extract_feature(signal: np.ndarray, sr: int) -> np.ndarray:
    """
    Compute 13 MFCCs from an audio signal (numpy array) at sample rate sr,
    and return a 26-dimensional feature vector made of the per-coefficient
    mean and variance.
    """
    # librosa.load returns the signal as a float numpy array
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    # axis=1: mean and variance of each MFCC coefficient over time
    return np.concatenate([np.mean(mfcc, axis=1), np.var(mfcc, axis=1)])
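
# Usage sketch: one second of silence at an illustrative 16 kHz sample rate
# yields a (26,)-shaped feature vector (13 means + 13 variances):
# _feat = extract_feature(np.zeros(16000, dtype=np.float32), 16000)
# assert _feat.shape == (26,)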

# --- 4. The three prediction functions ---

def predict_face(img: np.ndarray):
    if img is None:
        return {}  # no frame yet: return an empty label dict
    try:
        # enforce_detection=False keeps frames without a detectable face from
        # raising; recent DeepFace versions return a list of result dicts.
        result = DeepFace.analyze(img, actions=["emotion"],
                                  detector_backend="opencv", enforce_detection=False)
        if isinstance(result, list):
            result = result[0] if result else {}
        emotions = result.get("emotion", {})
        # DeepFace reports emotion scores as percentages; scale to [0, 1]
        # so gr.Label renders them as confidences.
        return {k: float(v) / 100.0 for k, v in emotions.items()}
    except Exception as e:
        # On error, log the exception and return an empty dict
        print("DeepFace analysis error:", e)
        return {}

def predict_voice(audio):
    """
    Speech emotion analysis. Gradio passes `audio` as a temp-file path (str):
    load it with librosa, extract the MFCC features, then call the SVM's
    predict_proba. Returns a dict such as {"angry": 0.1, "happy": 0.7, ...}.
    """
    if audio is None:
        return {}  # the change event also fires when the recording is cleared
    # `audio` is the file path supplied by the Gradio Audio component
    signal, sr = librosa.load(audio, sr=None)
    feat = extract_feature(signal, sr)
    probs = svm_model.predict_proba([feat])[0]
    labels = svm_model.classes_
    return {labels[i]: float(probs[i]) for i in range(len(labels))}
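
# Quick offline test, a sketch: "sample.wav" is a placeholder path, not a
# file shipped with this repo.
# print(predict_voice("sample.wav"))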

def predict_text(text: str):
    """
    Text emotion analysis via the transformers pipeline: takes a Chinese
    string and returns a dict mapping the model's label to its confidence,
    e.g. {"POSITIVE": 0.95}.
    """
    if not text or text.strip() == "":
        return {}
    pred = text_emotion(text)[0]
    # pred looks like {"label": "...", "score": ...}
    return {pred["label"]: float(pred["score"])}
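
# To return the full label distribution rather than only the top class, a
# sketch (top_k=None is accepted by newer transformers versions; older
# releases used return_all_scores=True instead):
# preds = text_emotion(text, top_k=None)  # list of {"label", "score"} dicts
# return {p["label"]: float(p["score"]) for p in preds}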

# --- 5. Build the Gradio interface ---
with gr.Blocks() as demo:
    gr.Markdown("## Multimodal Real-Time Emotion Analysis")
    with gr.Tabs():
        # Facial emotion tab
        with gr.TabItem("Facial Emotion"):
            gr.Markdown("### Facial emotion (live webcam streaming analysis)")
            with gr.Row():
                webcam = gr.Image(sources=["webcam"], streaming=True, type="numpy", label="Webcam feed")
                emotion_output = gr.Label(label="Emotion distribution")
            # Key step: stream() calls predict_face on each arriving frame
            # and updates emotion_output.
            webcam.stream(fn=predict_face, inputs=webcam, outputs=emotion_output)

        # The remaining tabs can keep the original layout, or use the same
        # Blocks style.
        with gr.TabItem("Speech Emotion"):
            audio = gr.Audio(sources=["microphone"], streaming=False, type="filepath", label="Recording")
            audio_output = gr.Label(label="Speech emotion result")
            # Trigger on change: predict_voice runs once recording finishes
            audio.change(fn=predict_voice, inputs=audio, outputs=audio_output)

        with gr.TabItem("Text Emotion"):
            text = gr.Textbox(lines=3, placeholder="Enter Chinese text…")
            text_output = gr.Label(label="Text emotion result")
            text.submit(fn=predict_text, inputs=text, outputs=text_output)

    # Alternative "three-in-one" layout: the tabs could instead be built as
    # separate gr.Interface objects and combined with gr.TabbedInterface
    # (face_interface etc. would first have to be defined as gr.Interface
    # instances; they are not defined in this file):
    # app = gr.TabbedInterface(
    #     interface_list=[face_interface, voice_interface, text_interface],
    #     tab_names=["Facial Emotion", "Speech Emotion", "Text Emotion"],
    # )

if __name__ == "__main__":
    # Do not pass server_name or server_port; Spaces configures these itself.
    demo.launch(share=False)
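
# Assumed runtime dependencies (a sketch of requirements.txt; the exact
# package list and versions were not stated in the original file):
#   gradio
#   huggingface_hub
#   transformers
#   torch
#   librosa
#   scikit-learn
#   joblib
#   deepface
#   numpy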