Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import numpy as np
|
5 |
+
import librosa
|
6 |
+
import librosa.display
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import scipy.signal
|
9 |
+
from transformers import pipeline
|
10 |
+
|
11 |
+
# Load the Wav2Vec2-Large (960 h LibriSpeech) ASR pipeline once at module
# import so every request reuses the same weights instead of reloading them.
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
|
13 |
+
|
14 |
+
# === DSP & feature extraction functions ===
|
15 |
+
|
16 |
+
def lowpass_filter(y, sr, cutoff=7000):
    """Apply a 6th-order Butterworth low-pass filter to an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (1-D).
    sr : int
        Sample rate in Hz.
    cutoff : float, optional
        Cutoff frequency in Hz (default 7000).

    Returns
    -------
    np.ndarray
        Filtered signal with the same length as ``y``.
    """
    nyq = 0.5 * sr
    # Guard: at low sample rates (e.g. 8 kHz microphone input) the default
    # cutoff reaches or exceeds Nyquist, which makes scipy.signal.butter
    # raise a ValueError. The filter would be a no-op there anyway, so
    # return the input unchanged instead of crashing.
    if cutoff >= nyq:
        return y
    norm_cutoff = cutoff / nyq
    b, a = scipy.signal.butter(6, norm_cutoff, btype='low', analog=False)
    return scipy.signal.lfilter(b, a, y)
|
22 |
+
|
23 |
+
def compute_mfcc(y, sr, n_mfcc=13):
    """Extract mean/variance-normalized MFCCs plus first/second derivatives.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (1-D, float).
    sr : int
        Sample rate in Hz.
    n_mfcc : int, optional
        Number of MFCC coefficients (default 13).

    Returns
    -------
    tuple[np.ndarray, np.ndarray, np.ndarray]
        (normalized MFCCs, delta, delta-delta), each shaped (n_mfcc, frames).
        Note: the deltas are computed from the *raw* (un-normalized) MFCCs.
    """
    # Pre-emphasis: first-order high-pass with the standard 0.97 coefficient.
    y = scipy.signal.lfilter([1, -0.97], 1, y)
    # 10 ms hop; FFT size fixed at 512 samples. (The original code also
    # computed a 25 ms win_length but never passed it to librosa, so the
    # effective window has always been librosa's default win_length = n_fft;
    # the dead local is removed to match actual behavior.)
    hop_length = int(0.010 * sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                hop_length=hop_length, n_fft=512)
    # First- and second-order temporal derivatives.
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Per-coefficient mean/variance normalization (CMVN); the epsilon avoids
    # division by zero on constant/silent input.
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)
    return mfcc_norm, delta, delta2
|
40 |
+
|
41 |
+
def plot_features(y, sr, mfcc, delta, delta2):
    """Render waveform, mel spectrogram, MFCCs, and deltas as one figure.

    Parameters
    ----------
    y : np.ndarray
        Audio samples.
    sr : int
        Sample rate in Hz.
    mfcc, delta, delta2 : np.ndarray
        Feature matrices from compute_mfcc. ``delta2`` is accepted for
        interface symmetry but is not drawn.

    Returns
    -------
    matplotlib.figure.Figure
        A 4-panel figure.
    """
    fig, (ax_wave, ax_mel, ax_mfcc, ax_delta) = plt.subplots(4, 1, figsize=(10, 10))

    # Panel 1: raw waveform.
    ax_wave.set(title="Waveform")
    librosa.display.waveshow(y, sr=sr, ax=ax_wave)

    # Panel 2: mel-scaled power spectrogram converted to dB.
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    librosa.display.specshow(mel_db, sr=sr, hop_length=256,
                             x_axis='time', y_axis='mel', ax=ax_mel)
    ax_mel.set(title="Mel Spectrogram")

    # Panel 3: normalized MFCC matrix.
    librosa.display.specshow(mfcc, x_axis='time', ax=ax_mfcc)
    ax_mfcc.set(title="MFCC (normalized)")

    # Panel 4: first-order delta features.
    librosa.display.specshow(delta, x_axis='time', ax=ax_delta)
    ax_delta.set(title="Delta features")

    plt.tight_layout()
    return fig
|
59 |
+
|
60 |
+
def process(audio):
    """Gradio callback: filter the audio, extract features, transcribe, plot.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``.
        Samples may arrive as integer PCM (e.g. int16) and/or stereo.

    Returns
    -------
    tuple[str, matplotlib.figure.Figure]
        The Wav2Vec2 transcription and the feature-visualization figure.
    """
    sr, y = audio

    # Gradio delivers integer PCM for most inputs; scale to [-1, 1] floats as
    # expected by librosa and the Wav2Vec2 feature extractor. (A bare
    # astype(float32) would leave amplitudes in the +/-32768 range and break
    # both the features and the transcription.)
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)

    # Stereo recordings arrive as (samples, channels); mix down to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Low-pass filter before feature extraction and ASR.
    y_filt = lowpass_filter(y, sr)

    # MFCC + temporal derivatives.
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)

    # Run ASR (Wav2Vec2) on the filtered signal.
    text = asr({"array": y_filt, "sampling_rate": sr})["text"]

    # Build the 4-panel visualization.
    fig = plot_features(y_filt, sr, mfcc, delta, delta2)

    return text, fig
|
77 |
+
|
78 |
+
# === Gradio UI ===
# Wire the processing callback into a simple one-function interface:
# one audio input, two outputs (transcript text + feature plot).

demo = gr.Interface(
    fn=process,
    # type="numpy" hands process() a (sample_rate, np.ndarray) tuple.
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Features Visualization")
    ],
    title="🧠 Advanced Speech AI Demo with Wav2Vec2",
    description="Upload or record audio → filters + MFCC+Δ+ΔΔ → transcription with Wav2Vec2 → visual plots"
)
|
90 |
+
|
91 |
+
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|