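"""Streamlit demo: speech detection with transcription and emotion analysis.

Pipeline: upload an MP3/WAV file -> convert to WAV if needed -> plot the
waveform -> reduce noise (noisereduce) -> transcribe speech (Vosk) ->
classify emotion (a transformers audio-classification pipeline).
"""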
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
import os
from pydub import AudioSegment
import noisereduce as nr

# Load the Vosk model once and cache it across Streamlit reruns
MODEL_PATH = "vosk-model-small-en-us-0.15"
if not os.path.exists(MODEL_PATH):
    st.error("Vosk model not found! Download it from https://alphacephei.com/vosk/models and extract it next to this script.")
    st.stop()

@st.cache_resource
def load_vosk_model(path):
    return Model(path)

model = load_vosk_model(MODEL_PATH)

# Streamlit UI
st.title("πŸŽ™οΈ Speech Detection System using Mozilla Common Voice")
st.write("Upload an audio file and get real-time speech-to-text, noise filtering, and emotion analysis.")

uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Save the upload to a temp file so the audio libraries can open it by path
    file_path = f"temp/{uploaded_file.name}"
    os.makedirs("temp", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV (pydub needs ffmpeg on the PATH to decode MP3)
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio, resampled to 16 kHz mono
    y, sr = librosa.load(file_path, sr=16000)

    # Display waveform
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)
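    # Optionally show a log-frequency spectrogram as well; speech structure
    # (harmonics, formants) is often easier to read here than in the raw
    # waveform. A sketch using librosa's STFT helpers.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    fig_spec, ax_spec = plt.subplots(figsize=(10, 4))
    img = librosa.display.specshow(D, sr=sr, x_axis="time", y_axis="log", ax=ax_spec)
    fig_spec.colorbar(img, ax=ax_spec, format="%+2.0f dB")
    st.pyplot(fig_spec)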

    # Noise reduction: noisereduce uses spectral gating to suppress
    # stationary background noise
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
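    # Quick sanity check on the denoising step: compare RMS levels before and
    # after (a rough indicator, not a true signal-to-noise measurement)
    rms_before = float(np.sqrt(np.mean(y ** 2)))
    rms_after = float(np.sqrt(np.mean(y_denoised ** 2)))
    st.write(f"RMS level: {rms_before:.4f} (original) β†’ {rms_after:.4f} (denoised)")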

    # Speech-to-text using Vosk
    def transcribe_audio(audio_path):
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())

        # Collect every finalized segment rather than returning on the first one
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                segments.append(result.get("text", ""))

        # Flush whatever audio is still buffered in the recognizer
        final = json.loads(rec.FinalResult())
        segments.append(final.get("text", ""))
        wf.close()
        return " ".join(s for s in segments if s)

    # Transcribe the denoised file: soundfile wrote it as 16 kHz mono PCM_16,
    # the format Vosk expects (the raw upload may be stereo or 44.1 kHz)
    transcription = transcribe_audio(denoised_path)
    st.subheader("πŸ“ Transcribed Text:")
    st.write(transcription)
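    # Offer the transcript as a download; the file name here is arbitrary
    if transcription:
        st.download_button("⬇️ Download transcript", transcription, file_name="transcript.txt")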

    # Emotion detection via a SUPERB emotion-recognition checkpoint
    emotion_model = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")
    emotion_result = emotion_model(file_path)

    st.subheader("😊 Emotion Analysis:")
    st.write(emotion_result)
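    # The pipeline returns {"label", "score"} dicts sorted by score, so the
    # first entry is the most likely emotion
    if emotion_result:
        top = emotion_result[0]
        st.write(f"Most likely emotion: **{top['label']}** ({top['score']:.1%})")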

    # Play original and denoised audio
    st.subheader("πŸ”Š Original Audio:")
    st.audio(file_path, format="audio/wav", start_time=0)
    st.subheader("πŸ”Š Denoised Audio:")
    st.audio(denoised_path, format="audio/wav", start_time=0)