Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import numpy as np
|
5 |
+
import librosa
|
6 |
+
import librosa.display
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import scipy.signal
|
9 |
+
from transformers import pipeline
|
10 |
+
|
11 |
+
# Load the Wav2Vec2-Large (960 h LibriSpeech) ASR pipeline once at module
# import so every request reuses the same weights instead of reloading them.
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
|
13 |
+
|
14 |
+
# === DSP & feature extraction functions ===
|
15 |
+
|
16 |
+
def lowpass_filter(y, sr, cutoff=7000):
    """Apply a 6th-order Butterworth low-pass filter to an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (1-D).
    sr : int
        Sample rate in Hz.
    cutoff : float, optional
        Cutoff frequency in Hz (default 7000).

    Returns
    -------
    np.ndarray
        Filtered signal with the same length as ``y``.
    """
    nyq = 0.5 * sr
    # Guard: at low sample rates (e.g. 8 kHz microphone input) the default
    # cutoff reaches or exceeds Nyquist, which makes scipy.signal.butter
    # raise a ValueError. The filter would be a no-op there anyway, so
    # return the input unchanged instead of crashing.
    if cutoff >= nyq:
        return y
    norm_cutoff = cutoff / nyq
    b, a = scipy.signal.butter(6, norm_cutoff, btype='low', analog=False)
    return scipy.signal.lfilter(b, a, y)
|
22 |
+
|
23 |
+
def compute_mfcc(y, sr, n_mfcc=13):
    """Extract mean/variance-normalized MFCCs plus first/second derivatives.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (1-D, float).
    sr : int
        Sample rate in Hz.
    n_mfcc : int, optional
        Number of MFCC coefficients (default 13).

    Returns
    -------
    tuple[np.ndarray, np.ndarray, np.ndarray]
        (normalized MFCCs, delta, delta-delta), each shaped (n_mfcc, frames).
        Note: the deltas are computed from the *raw* (un-normalized) MFCCs.
    """
    # Pre-emphasis: first-order high-pass with the standard 0.97 coefficient.
    y = scipy.signal.lfilter([1, -0.97], 1, y)
    # 10 ms hop; FFT size fixed at 512 samples. (The original code also
    # computed a 25 ms win_length but never passed it to librosa, so the
    # effective window has always been librosa's default win_length = n_fft;
    # the dead local is removed to match actual behavior.)
    hop_length = int(0.010 * sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                hop_length=hop_length, n_fft=512)
    # First- and second-order temporal derivatives.
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Per-coefficient mean/variance normalization (CMVN); the epsilon avoids
    # division by zero on constant/silent input.
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)
    return mfcc_norm, delta, delta2
|
40 |
+
|
41 |
+
def plot_features(y, sr, mfcc, delta, delta2):
    """Render waveform, mel spectrogram, MFCCs, and deltas as one figure.

    Parameters
    ----------
    y : np.ndarray
        Audio samples.
    sr : int
        Sample rate in Hz.
    mfcc, delta, delta2 : np.ndarray
        Feature matrices from compute_mfcc. ``delta2`` is accepted for
        interface symmetry but is not drawn.

    Returns
    -------
    matplotlib.figure.Figure
        A 4-panel figure.
    """
    fig, (ax_wave, ax_mel, ax_mfcc, ax_delta) = plt.subplots(4, 1, figsize=(10, 10))

    # Panel 1: raw waveform.
    ax_wave.set(title="Waveform")
    librosa.display.waveshow(y, sr=sr, ax=ax_wave)

    # Panel 2: mel-scaled power spectrogram converted to dB.
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    librosa.display.specshow(mel_db, sr=sr, hop_length=256,
                             x_axis='time', y_axis='mel', ax=ax_mel)
    ax_mel.set(title="Mel Spectrogram")

    # Panel 3: normalized MFCC matrix.
    librosa.display.specshow(mfcc, x_axis='time', ax=ax_mfcc)
    ax_mfcc.set(title="MFCC (normalized)")

    # Panel 4: first-order delta features.
    librosa.display.specshow(delta, x_axis='time', ax=ax_delta)
    ax_delta.set(title="Delta features")

    plt.tight_layout()
    return fig
|
59 |
+
|
60 |
+
def process(audio):
    """Gradio callback: filter the audio, extract features, transcribe, plot.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``.
        Samples may arrive as integer PCM (e.g. int16) and/or stereo.

    Returns
    -------
    tuple[str, matplotlib.figure.Figure]
        The Wav2Vec2 transcription and the feature-visualization figure.
    """
    sr, y = audio

    # Gradio delivers integer PCM for most inputs; scale to [-1, 1] floats as
    # expected by librosa and the Wav2Vec2 feature extractor. (A bare
    # astype(float32) would leave amplitudes in the +/-32768 range and break
    # both the features and the transcription.)
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)

    # Stereo recordings arrive as (samples, channels); mix down to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Low-pass filter before feature extraction and ASR.
    y_filt = lowpass_filter(y, sr)

    # MFCC + temporal derivatives.
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)

    # Run ASR (Wav2Vec2) on the filtered signal.
    text = asr({"array": y_filt, "sampling_rate": sr})["text"]

    # Build the 4-panel visualization.
    fig = plot_features(y_filt, sr, mfcc, delta, delta2)

    return text, fig
|
77 |
+
|
78 |
+
# === Gradio UI ===
# Wire the processing callback into a simple one-function interface:
# one audio input, two outputs (transcript text + feature plot).

demo = gr.Interface(
    fn=process,
    # type="numpy" hands process() a (sample_rate, np.ndarray) tuple.
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Features Visualization")
    ],
    title="🧠 Advanced Speech AI Demo with Wav2Vec2",
    description="Upload or record audio → filters + MFCC+Δ+ΔΔ → transcription with Wav2Vec2 → visual plots"
)
|
90 |
+
|
91 |
+
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|