hi
- .gradio/certificate.pem +31 -0
- __pycache__/feature.cpython-312.pyc +0 -0
- app-origin.py +42 -0
- app.py +87 -0
- apt.txt +1 -0
- feature.py +48 -0
- model_weights.pth +3 -0
- requirements.txt +9 -0
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
__pycache__/feature.cpython-312.pyc
ADDED
Binary file (3.63 kB)
app-origin.py
ADDED
@@ -0,0 +1,42 @@
+import gradio as gr
+import whisper
+from feature import AudioTextEmotionModel, extract_audio_features
+import torch
+
+# Select the device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Load the emotion recognition model
+emotion_model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
+emotion_model.load_state_dict(torch.load("model_weights.pth", map_location=device))
+emotion_model.to(device)
+emotion_model.eval()
+
+# Load the Whisper model for speech-to-text
+whisper_model = whisper.load_model("base")
+EMOTION_LABELS = {0: '正面', 1: '中性', 2: '負面'}
+
+def predict_emotion(audio_path):
+    result = whisper_model.transcribe(audio_path, language="zh")
+    text = result["text"]
+    audio_feat = extract_audio_features(audio_path)
+    audio_tensor = torch.tensor(audio_feat, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
+
+    with torch.no_grad():
+        output = emotion_model(audio_tensor, torch.zeros(1, 1, 768).to(device))  # dummy text input
+        pred = torch.argmax(output, dim=1).item()
+
+    return f"語音轉文字結果:{text}\n預測情緒:{EMOTION_LABELS[pred]}"
+
+def create_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("### 🎧 中文語音情緒辨識(EATD)\n說一段話,我會判斷你的情緒(正面 / 中性 / 負面)")
+        audio_input = gr.Audio(sources=["microphone"], type="filepath", label="請錄音")
+        output = gr.Textbox()
+        btn = gr.Button("分析")
+        btn.click(fn=predict_emotion, inputs=audio_input, outputs=output)
+    return demo
+
+demo = create_interface()
+demo.launch(share=True)
+
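app-origin.py is the earlier single-input version of the app: the text branch of the model always receives a zero tensor, so the prediction is driven by the audio features alone. The following is a minimal offline sketch of that same path outside Gradio, for illustration only; it assumes the files from this commit sit in the working directory, that sample.wav is a hypothetical local recording, and that importing feature also downloads bert-base-chinese on first use.

import torch
import whisper
from feature import AudioTextEmotionModel, extract_audio_features

device = torch.device("cpu")
model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
model.load_state_dict(torch.load("model_weights.pth", map_location=device))
model.eval()

# sample.wav is an assumed test file, not part of this commit
text = whisper.load_model("base").transcribe("sample.wav", language="zh")["text"]
audio = torch.tensor(extract_audio_features("sample.wav"), dtype=torch.float32).view(1, 1, -1)
with torch.no_grad():
    pred = torch.argmax(model(audio, torch.zeros(1, 1, 768)), dim=1).item()  # zero tensor = unused text branch
print(text, {0: "正面", 1: "中性", 2: "負面"}[pred])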
app.py
ADDED
@@ -0,0 +1,87 @@
+import gradio as gr
+import whisper
+import torch
+import numpy as np
+from feature import (
+    AudioTextEmotionModel,
+    extract_audio_features,
+    extract_text_features
+)
+
+# Select the device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Load the emotion recognition model
+emotion_model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
+emotion_model.load_state_dict(torch.load("model_weights.pth", map_location=device))
+emotion_model.to(device)
+emotion_model.eval()
+
+# Whisper model
+whisper_model = whisper.load_model("base")
+EMOTION_LABELS = {0: '正面', 1: '中性', 2: '負面'}
+
+# Main emotion prediction function (supports audio / text / both)
+def analyze_input(audio, text_input):
+    audio_feat = None
+    text_feat = None
+    result_text = ""
+
+    # If audio input is provided
+    if audio:
+        result = whisper_model.transcribe(audio, language="zh")
+        transcribed_text = result["text"]
+        result_text += f"🎧 語音轉文字:「{transcribed_text}」\n"
+        audio_feat = extract_audio_features(audio)
+    else:
+        transcribed_text = None
+
+    # If text is available (typed by the user or transcribed from audio)
+    text = text_input or transcribed_text
+    if text:
+        text_feat = extract_text_features(text)
+        result_text += f"✏️ 文字內容:「{text}」\n"
+
+    if audio_feat is None and text_feat is None:
+        return "請提供語音或文字輸入進行情緒辨識。"
+
+    # Build tensor inputs, falling back to zero tensors for a missing modality
+    audio_tensor = (
+        torch.tensor(audio_feat, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
+        if audio_feat is not None else
+        torch.zeros(1, 1, 180).to(device)
+    )
+    text_tensor = (
+        torch.tensor(text_feat, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
+        if text_feat is not None else
+        torch.zeros(1, 1, 768).to(device)
+    )
+
+    with torch.no_grad():
+        output = emotion_model(audio_tensor, text_tensor)
+        pred = torch.argmax(output, dim=1).item()
+
+    result_text += f"📊 預測情緒:{EMOTION_LABELS[pred]}"
+    return result_text
+
+# Gradio Chat UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 🎧 中文語音情緒辨識聊天機器人\n支援語音輸入、文字輸入,或兩者結合分析")
+
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="語音")
+        text_input = gr.Textbox(lines=2, placeholder="輸入文字內容...", label="文字")
+    send_btn = gr.Button("送出分析")
+
+    def chat_handler(audio, text, history):
+        response = analyze_input(audio, text)
+        history = history or []
+        history.append(("👤", response))
+        return history, None, ""
+
+    send_btn.click(fn=chat_handler,
+                   inputs=[audio_input, text_input, chatbot],
+                   outputs=[chatbot, audio_input, text_input])
+
+demo.launch(share=True)
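The tensor-building block in analyze_input is what lets one model handle audio-only, text-only, or combined input: whichever modality is missing is replaced by a zero placeholder of the matching shape (1, 1, 180 for audio, 1, 1, 768 for text). A small standalone sketch of that fallback, with hypothetical names, just to make the shapes explicit:

import numpy as np
import torch

def to_model_inputs(audio_feat, text_feat, device="cpu"):
    # Mirrors analyze_input: use the real feature vector when present, otherwise a zero tensor.
    audio_tensor = (torch.tensor(audio_feat, dtype=torch.float32).view(1, 1, -1)
                    if audio_feat is not None else torch.zeros(1, 1, 180)).to(device)
    text_tensor = (torch.tensor(text_feat, dtype=torch.float32).view(1, 1, -1)
                   if text_feat is not None else torch.zeros(1, 1, 768)).to(device)
    return audio_tensor, text_tensor

a, t = to_model_inputs(np.random.rand(180), None)  # audio-only case
print(a.shape, t.shape)                            # torch.Size([1, 1, 180]) torch.Size([1, 1, 768])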
apt.txt
ADDED
@@ -0,0 +1 @@
+ffmpeg
feature.py
ADDED
@@ -0,0 +1,48 @@
+# feature.py
+
+import torch
+import torch.nn as nn
+import numpy as np
+import librosa
+from transformers import BertTokenizer, BertModel
+
+# === Model architecture ===
+class AudioTextEmotionModel(nn.Module):
+    def __init__(self, audio_input_dim, text_input_dim, hidden_dim, output_dim):
+        super(AudioTextEmotionModel, self).__init__()
+        self.audio_gru = nn.GRU(audio_input_dim, hidden_dim, batch_first=True)
+        self.audio_bilstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.text_bilstm = nn.LSTM(text_input_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.fc = nn.Linear(hidden_dim * 4, output_dim)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, audio_input, text_input):
+        audio_out, _ = self.audio_gru(audio_input)
+        audio_out, _ = self.audio_bilstm(audio_out)
+        text_out, _ = self.text_bilstm(text_input)
+        combined = torch.cat((audio_out[:, -1, :], text_out[:, -1, :]), dim=1)
+        output = self.fc(combined)
+        return self.softmax(output)
+
+# === Audio feature extraction ===
+def extract_audio_features(file_path):
+    y, sr = librosa.load(file_path, sr=None)
+    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
+    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
+    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
+    features = np.concatenate((
+        np.mean(mfcc, axis=1),
+        np.mean(chroma, axis=1),
+        np.mean(spec, axis=1)
+    ))
+    return features
+
+# === Text feature extraction (using BERT) ===
+tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
+bert_model = BertModel.from_pretrained("bert-base-chinese")
+
+def extract_text_features(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+    outputs = bert_model(**inputs)
+    cls_embedding = outputs.last_hidden_state[:, 0, :]
+    return cls_embedding.squeeze().detach().numpy()
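The audio_input_dim of 180 used by both apps follows directly from extract_audio_features: 40 MFCC means + 12 chroma means + 128 mel-band means. A minimal shape check, assuming feature.py is importable from the current directory (importing it downloads bert-base-chinese the first time):

import torch
from feature import AudioTextEmotionModel

model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
dummy_audio = torch.randn(1, 1, 180)  # (batch, seq_len=1, 40 MFCC + 12 chroma + 128 mel bands)
dummy_text = torch.randn(1, 1, 768)   # (batch, seq_len=1, BERT [CLS] embedding size)
with torch.no_grad():
    probs = model(dummy_audio, dummy_text)
print(probs.shape, probs.sum().item())  # torch.Size([1, 3]); each row sums to ~1.0 because of the softmax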
model_weights.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02f3ffdd54b161379089ddfb318f3b231de4e6754a186962459c38178d305627
+size 5225033
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+torch
+torchaudio
+git+https://github.com/openai/whisper.git
+gradio>=4.44.0
+librosa
+numpy
+transformers
+pydantic>=2.0.0
+