# tabs/speech_emotion_recognition.py

import gradio as gr
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from transformers import pipeline
import torch
import tempfile
import warnings
import os

# Suppress specific warnings from transformers if needed
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

# Determine the device
def get_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS device for inference.")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA device for inference.")
    else:
        device = torch.device("cpu")
        print("Using CPU for inference.")
    return device

device = get_device()

# transformers pipelines take the device as an int (CUDA ordinal), -1 for CPU,
# or the string "mps" for Apple Silicon, so map the torch.device once here.
pipeline_device = 0 if device.type == "cuda" else ("mps" if device.type == "mps" else -1)

# Initialize the pipelines on the selected device
try:
    emotion_model = pipeline(
        "audio-classification",
        model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
        device=pipeline_device
    )
    print("Emotion model loaded successfully.")
except Exception as e:
    print(f"Error loading emotion model: {e}")
    emotion_model = None

try:
    transcription_model = pipeline(
        "automatic-speech-recognition",
        model="facebook/wav2vec2-base-960h",
        device=pipeline_device
    )
    print("Transcription model loaded successfully.")
except Exception as e:
    print(f"Error loading transcription model: {e}")
    transcription_model = None

# Emotion -> (arousal, dominance, valence) mapping.
# Heuristic coordinates in the VAD affect space, each roughly in [-1, 1]:
# arousal = energy level, dominance = degree of control, valence = positivity.
emotion_mapping = {
    "angry": (0.8, 0.8, -0.5),
    "happy": (0.6, 0.6, 0.8),
    "sad": (-0.6, -0.4, -0.6),
    "neutral": (0, 0, 0),
    "fear": (0.3, -0.3, -0.7),
    "fearful": (0.3, -0.3, -0.7),    # alias: some emotion models label this class "fearful"
    "surprise": (0.4, 0.2, 0.2),
    "surprised": (0.4, 0.2, 0.2),    # alias: some emotion models label this class "surprised"
    "disgust": (0.2, 0.5, -0.6),
    "calm": (-0.2, 0.1, 0.3),
    "excited": (0.7, 0.5, 0.7),
    "frustrated": (0.6, 0.5, -0.4)
}
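# Example: emotion_mapping.get("happy", (0.0, 0.0, 0.0)) returns (0.6, 0.6, 0.8);
# labels not in the table fall back to the neutral origin (0.0, 0.0, 0.0),
# matching the .get() default used in process_audio_emotion below.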

def process_audio_emotion(audio_file):
    """
    Processes the input audio file to perform transcription and emotion recognition.
    Generates waveform and mel spectrogram plots.

    Returns:
        A tuple containing:
        - Transcription (str)
        - Emotion (str)
        - Confidence (%) (float)
        - Arousal (float)
        - Dominance (float)
        - Valence (float)
        - Waveform Plot (str: filepath)
        - Mel Spectrogram Plot (str: filepath)
    """
    if not audio_file:
        return (
            "No audio file provided.",  # Transcription (textbox)
            None,                       # Emotion (textbox)
            None,                       # Confidence (%) (number)
            None,                       # Arousal (number)
            None,                       # Dominance (number)
            None,                       # Valence (number)
            None,                       # Waveform Plot (image)
            None                        # Mel Spectrogram Plot (image)
        )

    try:
        y, sr = librosa.load(audio_file, sr=None)

        # Transcription
        if transcription_model:
            transcription_result = transcription_model(audio_file)
            transcription = transcription_result.get("text", "N/A")
        else:
            transcription = "Transcription model not loaded."

        # Emotion Recognition
        if emotion_model:
            emotion_results = emotion_model(audio_file)
            if emotion_results:
                emotion_result = emotion_results[0]
                emotion = emotion_result.get("label", "Unknown").lower()
                confidence = emotion_result.get("score", 0.0) * 100  # Convert to percentage
                arousal, dominance, valence = emotion_mapping.get(emotion, (0.0, 0.0, 0.0))
            else:
                emotion = "No emotion detected."
                confidence = 0.0
                arousal, dominance, valence = 0.0, 0.0, 0.0
        else:
            emotion = "Emotion model not loaded."
            confidence = 0.0
            arousal, dominance, valence = 0.0, 0.0, 0.0

        # Plotting Waveform
        plt.figure(figsize=(10, 4))
        librosa.display.waveshow(y, sr=sr)
        plt.title("Waveform")
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_waveform:
            plt.savefig(tmp_waveform.name, bbox_inches='tight')
            waveform_plot_path = tmp_waveform.name
        plt.close()

        # Plotting Mel Spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(librosa.power_to_db(mel_spec, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
        plt.title("Mel Spectrogram")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_mel:
            plt.savefig(tmp_mel.name, bbox_inches='tight')
            mel_spec_plot_path = tmp_mel.name
        plt.close()

        return (
            transcription,                  # Transcription (textbox)
            emotion.capitalize(),           # Emotion (textbox)
            confidence,                     # Confidence (%) (number)
            arousal,                        # Arousal (number)
            dominance,                      # Dominance (number)
            valence,                        # Valence (number)
            waveform_plot_path,             # Waveform Plot (image)
            mel_spec_plot_path              # Mel Spectrogram Plot (image)
        )
    except Exception as e:
        return (
            f"Error: {str(e)}",  # Transcription (textbox)
            None,                 # Emotion (textbox)
            None,                 # Confidence (%) (number)
            None,                 # Arousal (number)
            None,                 # Dominance (number)
            None,                 # Valence (number)
            None,                 # Waveform Plot (image)
            None                  # Mel Spectrogram Plot (image)
        )
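
# A quick way to sanity-check this function outside Gradio is to call it
# directly on the bundled example clip (assuming both models loaded above):
#   transcription, emotion, confidence, *_ = process_audio_emotion("./assets/audio/fitness.wav")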

def create_emotion_recognition_tab():
    """
    Creates the Emotion Recognition tab in the Gradio interface.
    """
    with gr.Row():
        with gr.Column(scale=2):
            input_audio = gr.Audio(label="Input Audio", type="filepath")
            gr.Examples(
                examples=["./assets/audio/fitness.wav"],
                inputs=[input_audio],
                label="Examples"
            )
        with gr.Column(scale=1):
            transcription_output = gr.Textbox(label="Transcription", interactive=False)
            emotion_output = gr.Textbox(label="Emotion", interactive=False)
            confidence_output = gr.Number(label="Confidence (%)", interactive=False)
            arousal_output = gr.Number(label="Arousal (Level of Energy)", interactive=False)
            dominance_output = gr.Number(label="Dominance (Degree of Control)", interactive=False)
            valence_output = gr.Number(label="Valence (Positivity/Negativity)", interactive=False)
        with gr.Column(scale=1):
            waveform_plot = gr.Image(label="Waveform")
            mel_spec_plot = gr.Image(label="Mel Spectrogram")

    input_audio.change(
        fn=process_audio_emotion,
        inputs=[input_audio],
        outputs=[
            transcription_output,
            emotion_output,
            confidence_output,
            arousal_output,
            dominance_output,
            valence_output,
            waveform_plot,
            mel_spec_plot
        ]
    )

# create_emotion_recognition_tab() is expected to be called by the main
# application when assembling the full Gradio interface.
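
# A minimal sketch for running this tab on its own during local development;
# the full application is assumed to compose this tab alongside its other tabs
# in its own entry-point module (not shown here).
if __name__ == "__main__":
    with gr.Blocks(title="Speech Emotion Recognition") as demo:
        create_emotion_recognition_tab()
    demo.launch()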