gaur3009 commited on
Commit
2ec0359
·
verified ·
1 Parent(s): 9a1202a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import gradio as gr
4
+ import numpy as np
5
+ import librosa
6
+ import librosa.display
7
+ import matplotlib.pyplot as plt
8
+ import scipy.signal
9
+ from transformers import pipeline
10
+
11
# Load the ASR pipeline once at module import so every request reuses the
# same model instance (facebook/wav2vec2-large-960h — an English Wav2Vec2
# checkpoint; loading per-call would re-download/re-initialize each time).
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
13
+
14
+ # === DSP & feature extraction functions ===
15
+
16
def lowpass_filter(y, sr, cutoff=7000):
    """Apply a 6th-order Butterworth low-pass filter to an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (1-D).
    sr : int
        Sample rate in Hz.
    cutoff : float, optional
        Cutoff frequency in Hz (default 7000).

    Returns
    -------
    np.ndarray
        Filtered signal with the same length as ``y``.
    """
    nyq = 0.5 * sr
    # Clamp the normalized cutoff below 1.0: scipy.signal.butter raises
    # ValueError for cutoffs >= Nyquist, which the default 7 kHz cutoff
    # hits for any sample rate <= 14 kHz (e.g. 8 kHz telephone audio).
    norm_cutoff = min(cutoff / nyq, 0.99)
    b, a = scipy.signal.butter(6, norm_cutoff, btype='low', analog=False)
    # lfilter is causal (introduces phase delay); acceptable for this demo.
    return scipy.signal.lfilter(b, a, y)
22
+
23
def compute_mfcc(y, sr, n_mfcc=13):
    """Extract mean/variance-normalized MFCCs plus delta and delta-delta features.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (1-D, float).
    sr : int
        Sample rate in Hz.
    n_mfcc : int, optional
        Number of MFCC coefficients (default 13).

    Returns
    -------
    tuple[np.ndarray, np.ndarray, np.ndarray]
        ``(mfcc_norm, delta, delta2)`` — normalized MFCCs and the (raw)
        first- and second-order deltas, each of shape ``(n_mfcc, frames)``.
    """
    # Pre-emphasis: first-order high-pass (y[n] - 0.97*y[n-1]), a standard
    # ASR front-end step that boosts high-frequency content.
    y = scipy.signal.lfilter([1, -0.97], 1, y)
    # 10 ms hop / 25 ms analysis window, the conventional ASR framing.
    hop_length = int(0.010 * sr)
    # Clamp to n_fft=512: librosa requires win_length <= n_fft, and 25 ms
    # exceeds 512 samples for sample rates above ~20.5 kHz.
    win_length = min(int(0.025 * sr), 512)
    # BUGFIX: the original computed win_length but never passed it, so
    # frames were windowed at the full n_fft length instead of 25 ms.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                hop_length=hop_length,
                                win_length=win_length, n_fft=512)
    # Deltas are computed from the raw (un-normalized) MFCCs, matching the
    # original ordering.
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Cepstral mean-variance normalization, per coefficient across time;
    # the epsilon guards against division by zero for constant coefficients.
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)
    return mfcc_norm, delta, delta2
40
+
41
def plot_features(y, sr, mfcc, delta, delta2):
    """Render waveform, mel spectrogram, MFCC, Δ and ΔΔ panels in one figure.

    Parameters
    ----------
    y : np.ndarray
        (Filtered) audio samples.
    sr : int
        Sample rate in Hz.
    mfcc, delta, delta2 : np.ndarray
        Feature matrices of shape ``(n_mfcc, frames)`` from ``compute_mfcc``.

    Returns
    -------
    matplotlib.figure.Figure
        The assembled figure (suitable for ``gr.Plot``).
    """
    # BUGFIX: the original accepted delta2 but never plotted it — use a
    # fifth panel so the ΔΔ features are actually shown.
    fig, axs = plt.subplots(5, 1, figsize=(10, 12))
    # Waveform
    axs[0].set(title="Waveform")
    librosa.display.waveshow(y, sr=sr, ax=axs[0])
    # Mel spectrogram in dB, referenced to the peak power.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, hop_length=256, x_axis='time', y_axis='mel', ax=axs[1])
    axs[1].set(title="Mel Spectrogram")
    # Normalized MFCCs
    librosa.display.specshow(mfcc, x_axis='time', ax=axs[2])
    axs[2].set(title="MFCC (normalized)")
    # First-order deltas
    librosa.display.specshow(delta, x_axis='time', ax=axs[3])
    axs[3].set(title="Delta features")
    # Second-order deltas (previously dropped)
    librosa.display.specshow(delta2, x_axis='time', ax=axs[4])
    axs[4].set(title="Delta-Delta features")
    plt.tight_layout()
    return fig
59
+
60
def process(audio):
    """Full pipeline: normalize audio, filter, extract features, transcribe, plot.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` as delivered by ``gr.Audio(type="numpy")``.
        Samples may be integer PCM (e.g. int16) and may be stereo
        ``(n_samples, channels)``.

    Returns
    -------
    tuple[str, matplotlib.figure.Figure]
        The Wav2Vec2 transcription and the feature-visualization figure.
    """
    sr, y = audio

    # Downmix stereo to mono — librosa and the ASR pipeline expect 1-D audio;
    # a 2-D array would break the downstream feature/plot calls.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # BUGFIX: gradio hands back raw integer PCM; a bare astype(float32)
    # leaves amplitudes up to ±32768, which produces garbage MFCCs and
    # transcriptions. Scale integer input into [-1, 1].
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)

    # Lowpass filter (anti-noise / band-limiting)
    y_filt = lowpass_filter(y, sr)

    # MFCC + Δ + ΔΔ features
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)

    # Run ASR (Wav2Vec2); the HF pipeline resamples from `sr` as needed.
    text = asr({"array": y_filt, "sampling_rate": sr})["text"]

    # Feature visualization
    fig = plot_features(y_filt, sr, mfcc, delta, delta2)

    return text, fig
77
+
78
# === Gradio UI ===

# Single-function interface: one audio input -> (transcription, plot) outputs,
# all produced by `process` above.
demo = gr.Interface(
    fn=process,
    # Accept either a live microphone recording or an uploaded file;
    # type="numpy" delivers a (sample_rate, samples) tuple to `process`.
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Features Visualization")
    ],
    title="🧠 Advanced Speech AI Demo with Wav2Vec2",
    description="Upload or record audio → filters + MFCC+Δ+ΔΔ → transcription with Wav2Vec2 → visual plots"
)

# Launch the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()