ducdatit2002 committed
Commit a4d00d9 (verified) · 1 parent: 6380398

Upload 6 files
app.py ADDED
@@ -0,0 +1,70 @@
+ import os
+ import torch
+ import torchaudio
+ import numpy as np
+ import gradio as gr
+ from transformers import AutoFeatureExtractor, HubertForSequenceClassification
+
+ # ==== 1. Path and device configuration ====
+ MODEL_PATH = "./voice_emotion_checkpoint"  # Change if needed
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # ==== 2. Load the feature extractor and model ====
+ feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
+ model = HubertForSequenceClassification.from_pretrained(MODEL_PATH).to(DEVICE)
+ model.eval()
+
+ # If you have an id2label.json file:
+ # import json
+ # with open(os.path.join(MODEL_PATH, "id2label.json"), "r", encoding="utf-8") as f:
+ #     id2label = json.load(f)
+ # Otherwise:
+ id2label = {int(k): v for k, v in model.config.id2label.items()}
+
+ # ==== 3. Preprocessing and prediction ====
+ def predict_emotion(audio_filepath):
+     # 1) Load the file and convert to numpy
+     waveform, sr = torchaudio.load(audio_filepath)  # waveform: Tensor[channels][time]
+     waveform = waveform.numpy()  # -> numpy array
+     # 2) Stereo -> mono
+     if waveform.ndim > 1:
+         waveform = np.mean(waveform, axis=0)
+     # 3) Resample to 16 kHz if needed
+     target_sr = feature_extractor.sampling_rate
+     if sr != target_sr:
+         waveform = torchaudio.functional.resample(
+             torch.from_numpy(waveform), orig_freq=sr, new_freq=target_sr
+         ).numpy()
+         sr = target_sr
+     # 4) Feature extraction
+     inputs = feature_extractor(
+         waveform,
+         sampling_rate=sr,
+         return_tensors="pt",
+         padding=True
+     )
+     input_values = inputs.input_values.to(DEVICE)
+     # 5) Inference
+     with torch.no_grad():
+         logits = model(input_values).logits.cpu().numpy()[0]
+     probs = torch.softmax(torch.from_numpy(logits), dim=-1).numpy()
+     pred_id = int(np.argmax(probs))
+     # 6) Prepare the output
+     pred_label = id2label[pred_id]
+     label_probs = {id2label[i]: float(probs[i]) for i in range(len(probs))}
+     return pred_label, label_probs
+
+ # ==== 4. Build the Gradio interface ====
+ demo = gr.Interface(
+     fn=predict_emotion,
+     inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
+     outputs=[
+         gr.Label(num_top_classes=1, label="Predicted Emotion"),
+         gr.Label(num_top_classes=len(id2label), label="All Probabilities"),
+     ],
+     title="Vietnamese Speech Emotion Recognition",
+     description="Upload or record audio; the model will predict the emotion (angry, happy, sad, …).",
+ )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
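
For a quick sanity check outside the web UI, predict_emotion can be called directly. A minimal sketch, assuming app.py is importable from the working directory and that "sample.wav" is a hypothetical local audio file (not part of this commit):

    # Sketch: invoke the prediction function on one file and print the result.
    # Importing app loads the checkpoint; only demo.launch() is guarded by __main__.
    from app import predict_emotion

    label, probs = predict_emotion("sample.wav")
    print("Predicted:", label)
    for name, p in sorted(probs.items(), key=lambda kv: -kv[1]):
        print(f"  {name}: {p:.3f}")

Running python app.py instead serves the Gradio demo on port 7860, per the launch call above.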
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch>=1.12.0
+ torchaudio>=0.12.0
+ transformers>=4.21.0
+ datasets>=2.0.0
+ evaluate>=0.4.0
+ numpy>=1.21.0
+ scikit-learn>=1.0.0
+ gradio>=3.0
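
The pins above cover both the runtime app (torch, torchaudio, transformers, numpy, gradio) and training-time tooling (datasets, evaluate, scikit-learn). A small sketch to confirm the packages app.py needs at runtime resolve, assuming pip install -r requirements.txt has already run:

    # Sketch: import the runtime dependencies and print the installed versions.
    import torch, torchaudio, transformers, gradio

    for mod in (torch, torchaudio, transformers, gradio):
        print(mod.__name__, mod.__version__)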
voice_emotion_checkpoint/.DS_Store ADDED
Binary file (6.15 kB).
voice_emotion_checkpoint/config.json ADDED
@@ -0,0 +1,92 @@
+ {
+   "activation_dropout": 0.1,
+   "apply_spec_augment": true,
+   "architectures": [
+     "HubertForSequenceClassification"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_pos_batch_norm": false,
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.1,
+   "feat_proj_layer_norm": true,
+   "final_dropout": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "angry",
+     "1": "fearful",
+     "2": "happy",
+     "3": "neutral",
+     "4": "sad",
+     "5": "surprised"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "angry": 0,
+     "fearful": 1,
+     "happy": 2,
+     "neutral": 3,
+     "sad": 4,
+     "surprised": 5
+   },
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "hubert",
+   "num_attention_heads": 12,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32
+ }
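
The id2label / label2id maps in this config are what app.py falls back to when no id2label.json is present. A minimal sketch for inspecting them without loading the full weights, assuming the checkpoint directory sits next to the script:

    # Sketch: read only config.json and list the six emotion classes it defines.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("./voice_emotion_checkpoint")
    for idx in sorted(config.id2label, key=int):
        print(idx, "->", config.id2label[idx])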
voice_emotion_checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2174f5b573a35c131479df49f27896bf8bf00748a09a68388b1011e6986ed56
+ size 378306056
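
This is a Git LFS pointer, not the weights themselves; the checkpoint only loads after the real object is fetched (for example with git lfs pull). A small sketch to verify the fetched file against the size and sha256 recorded above:

    # Sketch: check that model.safetensors is the real object, not the LFS pointer.
    import hashlib
    import os

    path = "./voice_emotion_checkpoint/model.safetensors"
    assert os.path.getsize(path) == 378306056, "pointer only -- run `git lfs pull`"
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    print(digest.hexdigest())  # should match the oid above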
voice_emotion_checkpoint/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
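
These settings mirror the preprocessing in app.py: mono float audio at 16 kHz, zero-mean/unit-variance normalization (do_normalize), and no attention mask. A minimal sketch with synthetic input to watch the extractor apply them:

    # Sketch: push one second of synthetic 16 kHz audio through the extractor
    # configured above and confirm the normalization it performs.
    import numpy as np
    from transformers import AutoFeatureExtractor

    fe = AutoFeatureExtractor.from_pretrained("./voice_emotion_checkpoint")
    audio = np.random.randn(16000).astype(np.float32)  # 1 s of noise at 16 kHz
    inputs = fe(audio, sampling_rate=fe.sampling_rate, return_tensors="pt")
    x = inputs.input_values[0]
    print(x.shape, float(x.mean()), float(x.std()))  # mean ~ 0, std ~ 1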