Spaces:

ojas121
/

speech_emotion_project

Running

File size: 4,587 Bytes

a9f8ee6
 
9e5fc19
 
 
 
 
 
 
 
 
 
3867db1
9e5fc19
 
 
3867db1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656615f
a9f8ee6
 
 
3867db1
a9f8ee6
 
 
2ee5365
a9f8ee6
 
2ee5365
a9f8ee6
656615f
a9f8ee6
3867db1
a9f8ee6
2ee5365
a9f8ee6
 
9e5fc19
a9f8ee6
3867db1
 
9e5fc19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3867db1
 
9e5fc19
 
 
 
a9f8ee6
3867db1
9e5fc19
 
 
 
a9f8ee6
9e5fc19
 
 
 
 
 
 
 
 
 
 
 
 
3867db1
 
 
9e5fc19
a9f8ee6
3867db1
9e5fc19
 
 
a9f8ee6
3867db1
9e5fc19
3867db1
 
9e5fc19

import os
import subprocess
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr

# 🎨 Apply Custom CSS Styling
# The stylesheet lives in a named constant and is injected once as raw HTML.
# It colours the app background, defines the `.title` / `.subheader` classes
# used by the headings this script renders, and restyles buttons, audio
# players and markdown text.
_PAGE_CSS = """
    <style>
        .stApp {
            background-color: #f0f2f6;
        }
        .title {
            font-size: 32px;
            text-align: center;
            color: #4A90E2;
            font-weight: bold;
        }
        .subheader {
            font-size: 20px;
            font-weight: bold;
            color: #333;
        }
        .stButton>button {
            background-color: #4A90E2 !important;
            color: white !important;
            font-size: 18px !important;
            padding: 10px 24px !important;
            border-radius: 10px !important;
            border: none !important;
        }
        .stAudio {
            width: 100% !important;
        }
        .stMarkdown {
            font-size: 16px;
            color: #333;
        }
    </style>
    """
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# ✅ Auto-Download Vosk Model (Speech-to-Text)
# The model is fetched and unpacked only when its directory is missing from
# the working directory; later starts reuse the extracted copy.
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk Model...")
    try:
        # check=True makes a failed download or extraction raise right here
        # (CalledProcessError) instead of surfacing later as an opaque
        # failure inside Model().
        subprocess.run(
            ["wget", "-O", "vosk.zip", f"https://alphacephei.com/vosk/models/{VOSK_MODEL}.zip"],
            check=True,
        )
        subprocess.run(["unzip", "vosk.zip"], check=True)
    finally:
        # Remove the archive whether or not the steps above succeeded, so a
        # partial download is never left behind. os.remove avoids spawning
        # an external `rm` process.
        if os.path.exists("vosk.zip"):
            os.remove("vosk.zip")

# Load Vosk model
model = Model(VOSK_MODEL)

# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
# The Hub repo id is also used as the local directory name: when that
# directory is absent, a snapshot of the repo is downloaded into it.
# NOTE(review): this id looks like a general pretrained checkpoint rather than
# one fine-tuned for emotion labels — confirm the classifier output is
# meaningful.
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
_wav2vec_on_disk = os.path.exists(WAV2VEC_MODEL)
if not _wav2vec_on_disk:
    st.write("📥 Downloading {}...".format(WAV2VEC_MODEL))
    snapshot_download(local_dir=WAV2VEC_MODEL, repo_id=WAV2VEC_MODEL)

# Load emotion detection model
emotion_model = pipeline(task="audio-classification", model=WAV2VEC_MODEL)

# ✅ Streamlit UI
# Page heading and one-line instructions, rendered as raw HTML so the custom
# `title` / `subheader` CSS classes apply.
_TITLE_HTML = "<div class='title'>🎙️ Speech Detection System</div>"
_INTRO_HTML = "<div class='subheader'>🔍 Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>"
for _banner_html in (_TITLE_HTML, _INTRO_HTML):
    st.markdown(_banner_html, unsafe_allow_html=True)

# Falsy until the user provides a file; restricted to MP3 and WAV uploads.
uploaded_file = st.file_uploader(label="Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Persist the upload under temp/. Only the base name of the client-supplied
    # file name is used so it cannot point outside that directory.
    os.makedirs("temp", exist_ok=True)
    file_path = f"temp/{os.path.basename(uploaded_file.name)}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed.
    # The extension is split off once and compared case-insensitively, so
    # "clip.MP3" is converted too and only the real suffix is rewritten
    # (str.replace would touch every ".mp3" occurrence in the path).
    path_stem, path_ext = os.path.splitext(file_path)
    if path_ext.lower() == ".mp3":
        wav_path = f"{path_stem}.wav"
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio (resampled to 16 kHz)
    y, sr = librosa.load(file_path, sr=16000)

    # 🎵 Display waveform
    st.markdown("<div class='subheader'>🎼 Audio Waveform:</div>", unsafe_allow_html=True)
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; release
    # this one so repeated uploads do not accumulate open figures.
    plt.close(fig)

    # ✅ Noise Reduction
    st.markdown("<div class='subheader'>🔇 Applying Noise Reduction...</div>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    # Build the output name from the stem: with an upper-case ".WAV" upload a
    # ".wav" string replacement is a no-op, which made the denoised file
    # overwrite the original upload.
    denoised_path = f"{path_stem}_denoised.wav"
    sf.write(denoised_path, y_denoised, sr)

    # ✅ Speech-to-Text using Vosk
    def transcribe_audio(audio_path):
        """Transcribe a WAV file with the module-level Vosk ``model``.

        The file is streamed to the recognizer in 4000-frame chunks. Every
        utterance the recognizer finalizes along the way is collected, and
        the recognizer's final result is added once the file is exhausted,
        so audio after the first pause and short clips that never trigger an
        intermediate result are no longer lost.

        Args:
            audio_path: Path to a WAV file readable by the ``wave`` module.

        Returns:
            The recognized text, utterances joined by single spaces; an empty
            string when nothing was recognized.

        NOTE(review): the sample width and channel count of the file are not
        checked here — confirm the input matches what the Vosk model expects.
        """
        utterances = []
        # The context manager closes the wave file even if recognition raises.
        with wave.open(audio_path, "rb") as wf:
            rec = KaldiRecognizer(model, wf.getframerate())
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                # True means an utterance boundary was reached and Result()
                # holds that utterance's text.
                if rec.AcceptWaveform(data):
                    utterances.append(json.loads(rec.Result()).get("text", ""))
            # Flush whatever is still buffered after the last chunk.
            utterances.append(json.loads(rec.FinalResult()).get("text", ""))
        return " ".join(part for part in utterances if part)

    # Transcribe the (possibly converted) upload.
    transcription = transcribe_audio(file_path)

    st.markdown("<div class='subheader'>📝 Transcribed Text:</div>", unsafe_allow_html=True)
    # The transcriber may yield nothing (None or an empty string) for silent
    # or very short clips; show an explicit placeholder instead of rendering
    # the literal text "None" or an empty box.
    shown_text = transcription if transcription else "(no speech recognized)"
    st.markdown(f"<div class='stMarkdown'>{shown_text}</div>", unsafe_allow_html=True)

    # ✅ Emotion Detection
    # The classification pipeline is run on the file path and its raw output
    # is displayed as-is.
    st.markdown("<div class='subheader'>😊 Emotion Analysis:</div>", unsafe_allow_html=True)
    emotion_result = emotion_model(file_path)
    st.write(emotion_result)

    # ✅ Play Original & Denoised Audio
    st.markdown("<div class='subheader'>🔊 Play Audio:</div>", unsafe_allow_html=True)
    st.audio(file_path, format="audio/wav", start_time=0)

    st.markdown("<div class='subheader'>🔇 Denoised Audio:</div>", unsafe_allow_html=True)
    st.audio(denoised_path, format="audio/wav", start_time=0)