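"""Streamlit speech-detection demo: upload an audio clip, view its waveform,
denoise it, transcribe it with Vosk, and run emotion analysis on it."""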
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
import os
from pydub import AudioSegment
import noisereduce as nr
# Load Vosk model
MODEL_PATH = "vosk-model-small-en-us-0.15"
if not os.path.exists(MODEL_PATH):
    st.error("Vosk model not found! Please download and extract it.")
    st.stop()
model = Model(MODEL_PATH)
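
# Emotion-recognition pipeline, cached with st.cache_resource so it is not
# reloaded on every Streamlit rerun. Assumption: the SUPERB emotion-recognition
# checkpoint "superb/wav2vec2-base-superb-er" is used here, since an
# audio-classification pipeline needs a model with a classification head and a
# bare XLSR encoder would fail under this task.
@st.cache_resource
def load_emotion_model():
    return pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")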
# Streamlit UI
st.title("ποΈ Speech Detection System using Mozilla Common Voice")
st.write("Upload an audio file and get real-time speech-to-text, noise filtering, and emotion analysis.")
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
if uploaded_file:
    # Save the upload under temp/, converting MP3 to WAV if needed
    os.makedirs("temp", exist_ok=True)
    file_path = f"temp/{uploaded_file.name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio resampled to 16 kHz mono, the rate the Vosk model expects
    y, sr = librosa.load(file_path, sr=16000)
    # Display waveform
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)

    # Noise reduction via noisereduce (spectral gating); soundfile writes
    # 16-bit PCM WAV by default, which is the format Vosk reads
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
    # Speech-to-text using Vosk: feed the WAV in chunks and join the text of
    # every recognized segment, including the final partial one
    def transcribe_audio(audio_path):
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result())["text"])
        segments.append(json.loads(rec.FinalResult())["text"])
        wf.close()
        return " ".join(s for s in segments if s)

    # Transcribe the denoised copy: it is guaranteed 16 kHz mono 16-bit PCM,
    # while the raw upload may be stereo or at another sample rate
    transcription = transcribe_audio(denoised_path)
st.subheader("π Transcribed Text:")
st.write(transcription)
    # Emotion detection (uses the cached pipeline defined above)
    emotion_model = load_emotion_model()
    emotion_result = emotion_model(file_path)
    st.subheader("Emotion Analysis:")
    st.write(emotion_result)
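    # Optional display sketch, assuming the standard audio-classification
    # output shape (a list of {"label": ..., "score": ...} dicts):
    # for r in emotion_result:
    #     st.write(f"{r['label']}: {r['score']:.2%}")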
    # Play original and denoised audio
    st.audio(file_path, format="audio/wav", start_time=0)
    st.subheader("Denoised Audio:")
    st.audio(denoised_path, format="audio/wav", start_time=0)