pcreem committed on
Commit 2b7c233 · 1 Parent(s): b5ff674
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
__pycache__/feature.cpython-312.pyc ADDED
Binary file (3.63 kB).
 
app-origin.py ADDED
@@ -0,0 +1,42 @@
+ import gradio as gr
+ import whisper
+ from feature import AudioTextEmotionModel, extract_audio_features
+ import torch
+
+ # Select the device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load the emotion recognition model
+ emotion_model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
+ emotion_model.load_state_dict(torch.load("model_weights.pth", map_location=device))
+ emotion_model.to(device)
+ emotion_model.eval()
+
+ # Load the Whisper model for speech-to-text
+ whisper_model = whisper.load_model("base")
+ EMOTION_LABELS = {0: '正面', 1: '中性', 2: '負面'}
+
+ def predict_emotion(audio_path):
+     result = whisper_model.transcribe(audio_path, language="zh")
+     text = result["text"]
+     audio_feat = extract_audio_features(audio_path)
+     audio_tensor = torch.tensor(audio_feat, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
+
+     with torch.no_grad():
+         output = emotion_model(audio_tensor, torch.zeros(1, 1, 768).to(device))  # dummy text input
+         pred = torch.argmax(output, dim=1).item()
+
+     return f"語音轉文字結果:{text}\n預測情緒:{EMOTION_LABELS[pred]}"
+
+ def create_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown("### 🎧 中文語音情緒辨識(EATD)\n說一段話,我會判斷你的情緒(正面 / 中性 / 負面)")
+         audio_input = gr.Audio(sources=["microphone"], type="filepath", label="請錄音")
+         output = gr.Textbox()
+         btn = gr.Button("分析")
+         btn.click(fn=predict_emotion, inputs=audio_input, outputs=output)
+     return demo
+
+ demo = create_interface()
+ demo.launch(share=True)
+
app.py ADDED
@@ -0,0 +1,87 @@
+ import gradio as gr
+ import whisper
+ import torch
+ import numpy as np
+ from feature import (
+     AudioTextEmotionModel,
+     extract_audio_features,
+     extract_text_features
+ )
+
+ # Select the device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load the emotion recognition model
+ emotion_model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
+ emotion_model.load_state_dict(torch.load("model_weights.pth", map_location=device))
+ emotion_model.to(device)
+ emotion_model.eval()
+
+ # Whisper model
+ whisper_model = whisper.load_model("base")
+ EMOTION_LABELS = {0: '正面', 1: '中性', 2: '負面'}
+
+ # Main emotion prediction function (supports audio / text / both)
+ def analyze_input(audio, text_input):
+     audio_feat = None
+     text_feat = None
+     result_text = ""
+
+     # If audio input is provided
+     if audio:
+         result = whisper_model.transcribe(audio, language="zh")
+         transcribed_text = result["text"]
+         result_text += f"🎧 語音轉文字:「{transcribed_text}」\n"
+         audio_feat = extract_audio_features(audio)
+     else:
+         transcribed_text = None
+
+     # If text is available (typed by the user or transcribed from audio)
+     text = text_input or transcribed_text
+     if text:
+         text_feat = extract_text_features(text)
+         result_text += f"✏️ 文字內容:「{text}」\n"
+
+     if audio_feat is None and text_feat is None:
+         return "請提供語音或文字輸入進行情緒辨識。"
+
+     # Build the input tensors, falling back to zeros for a missing modality
+     audio_tensor = (
+         torch.tensor(audio_feat, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
+         if audio_feat is not None else
+         torch.zeros(1, 1, 180).to(device)
+     )
+     text_tensor = (
+         torch.tensor(text_feat, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
+         if text_feat is not None else
+         torch.zeros(1, 1, 768).to(device)
+     )
+
+     with torch.no_grad():
+         output = emotion_model(audio_tensor, text_tensor)
+         pred = torch.argmax(output, dim=1).item()
+
+     result_text += f"📊 預測情緒:{EMOTION_LABELS[pred]}"
+     return result_text
+
+ # Gradio Chat UI
+ with gr.Blocks() as demo:
+     gr.Markdown("## 🎧 中文語音情緒辨識聊天機器人\n支援語音輸入、文字輸入,或兩者結合分析")
+
+     chatbot = gr.Chatbot()
+     with gr.Row():
+         audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="語音")
+         text_input = gr.Textbox(lines=2, placeholder="輸入文字內容...", label="文字")
+     send_btn = gr.Button("送出分析")
+
+     def chat_handler(audio, text, history):
+         response = analyze_input(audio, text)
+         history = history or []
+         history.append(("👤", response))
+         # Returning None and "" clears the audio and text inputs after each turn
+         return history, None, ""
+
+     send_btn.click(fn=chat_handler,
+                    inputs=[audio_input, text_input, chatbot],
+                    outputs=[chatbot, audio_input, text_input])
+
+ demo.launch(share=True)
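
For a quick smoke test of the analysis path without the chat UI, analyze_input can be called directly. A minimal sketch, assuming it runs in the same interpreter where the definitions above are already in scope (importing app as a module would also trigger demo.launch()); the sample sentence and file name below are illustrative only:

# Sketch: exercise analyze_input directly; assumes app.py's definitions are in scope.
print(analyze_input(None, "今天心情很好"))   # text-only: audio branch falls back to torch.zeros(1, 1, 180)
print(analyze_input("sample.wav", ""))        # audio-only (hypothetical path): Whisper transcribes, then both branches run
print(analyze_input(None, ""))                # no input: returns the prompt asking for audio or text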
apt.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
feature.py ADDED
@@ -0,0 +1,48 @@
+ # feature.py
+
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ import librosa
+ from transformers import BertTokenizer, BertModel
+
+ # === Model architecture ===
+ class AudioTextEmotionModel(nn.Module):
+     def __init__(self, audio_input_dim, text_input_dim, hidden_dim, output_dim):
+         super(AudioTextEmotionModel, self).__init__()
+         self.audio_gru = nn.GRU(audio_input_dim, hidden_dim, batch_first=True)
+         self.audio_bilstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
+         self.text_bilstm = nn.LSTM(text_input_dim, hidden_dim, batch_first=True, bidirectional=True)
+         self.fc = nn.Linear(hidden_dim * 4, output_dim)
+         self.softmax = nn.Softmax(dim=1)
+
+     def forward(self, audio_input, text_input):
+         audio_out, _ = self.audio_gru(audio_input)
+         audio_out, _ = self.audio_bilstm(audio_out)
+         text_out, _ = self.text_bilstm(text_input)
+         combined = torch.cat((audio_out[:, -1, :], text_out[:, -1, :]), dim=1)
+         output = self.fc(combined)
+         return self.softmax(output)
+
+ # === Audio feature extraction ===
+ def extract_audio_features(file_path):
+     y, sr = librosa.load(file_path, sr=None)
+     mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
+     chroma = librosa.feature.chroma_stft(y=y, sr=sr)
+     spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
+     features = np.concatenate((
+         np.mean(mfcc, axis=1),
+         np.mean(chroma, axis=1),
+         np.mean(spec, axis=1)
+     ))
+     return features
+
+ # === Text feature extraction (using BERT) ===
+ tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
+ bert_model = BertModel.from_pretrained("bert-base-chinese")
+
+ def extract_text_features(text):
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+     outputs = bert_model(**inputs)
+     cls_embedding = outputs.last_hidden_state[:, 0, :]
+     return cls_embedding.squeeze().detach().numpy()
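
The audio feature vector concatenates 40 MFCC means, 12 chroma means and 128 mel-band means, which is where the model's audio_input_dim=180 comes from, while the BERT [CLS] embedding supplies the 768-dim text input. A minimal sanity-check sketch, assuming feature.py and model_weights.pth sit in the working directory; example.wav and the test sentence are placeholders:

import torch
from feature import AudioTextEmotionModel, extract_audio_features, extract_text_features

feat = extract_audio_features("example.wav")          # hypothetical recording
assert feat.shape == (180,)                           # 40 MFCC + 12 chroma + 128 mel bands

model = AudioTextEmotionModel(audio_input_dim=180, text_input_dim=768, hidden_dim=128, output_dim=3)
model.load_state_dict(torch.load("model_weights.pth", map_location="cpu"))
model.eval()

audio = torch.tensor(feat, dtype=torch.float32).reshape(1, 1, 180)
text = torch.tensor(extract_text_features("測試句子"), dtype=torch.float32).reshape(1, 1, 768)
with torch.no_grad():
    probs = model(audio, text)                        # softmax over the three emotion classes
print(probs.argmax(dim=1).item())                     # 0 = 正面, 1 = 中性, 2 = 負面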
model_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02f3ffdd54b161379089ddfb318f3b231de4e6754a186962459c38178d305627
+ size 5225033
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ torchaudio
+ git+https://github.com/openai/whisper.git
+ gradio>=4.44.0
+ librosa
+ numpy
+ transformers
+ pydantic>=2.0.0
+