import os
import requests
import tempfile
import gradio as gr
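# moviepy >= 2.0 exposes VideoFileClip at the top level (v1.x used moviepy.editor)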
from moviepy import VideoFileClip
from speechbrain.inference.interfaces import foreign_class
import whisper
from together import Together

# Initialize Whisper once
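# "base" trades accuracy for speed; larger checkpoints ("small", "medium") transcribe better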
_whisper_model = whisper.load_model("base")

# Initialize SpeechBrain classifier once
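# foreign_class fetches custom_interface.py from the Hub repo and instantiates
# the wav2vec2-based accent classifier it defines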
_classifier = foreign_class(
    source="warisqr7/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier"
)

# Helper to download a direct MP4 URL to a temp file
def download_video(url: str) -> str:
    resp = requests.get(url, stream=True, timeout=30)
    resp.raise_for_status()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    for chunk in resp.iter_content(8192):
        tmp.write(chunk)
    tmp.close()
    return tmp.name

# Helper to extract audio to a temp file
def extract_audio(video_path: str) -> str:
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.close()  # release the open handle so MoviePy can reopen the path (matters on Windows)
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(tmp.name, logger=None)
    clip.close()
    return tmp.name

# Main pipeline
def analyze_url(video_url):
    vid = aud = None
    try:
        # 1. Download & extract
        vid = download_video(video_url)
        aud = extract_audio(vid)

        # 2. Accent classification
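        # classify_file returns (posterior probabilities, best score, predicted index, label list)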
        out_prob, score, idx, lab = _classifier.classify_file(aud)
        accent = lab[0]
        conf_pct = round(float(score) * 100, 2)

        # 3. Transcription
        result = _whisper_model.transcribe(aud)
        transcript = result["text"]

        # 4. LLM analysis
        api_key = os.getenv("API_KEY")
        if not api_key:
            raise RuntimeError("API_KEY environment variable is not set")
        client = Together(api_key=api_key)
        prompt = f"""
You are an English-speaking coach. Given this transcript of spoken English audio, classified as a {accent} accent with {conf_pct}% confidence:
\"\"\"{transcript}\"\"\"

Evaluate how confident the speaker sounds for a job interview based on fluency, clarity, filler-word usage, professional English, and pacing.
Provide:
- A proficiency score between 0 and 100
- A brief explanation
- Bullet points only, with nothing in bold
"""
        resp = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            messages=[{"role": "user", "content": prompt}]
        )
        analysis = resp.choices[0].message.content.strip()

        return accent, f"{conf_pct}%", transcript, analysis

    except Exception as e:
        return "Error", "", "", str(e)
    finally:
        # Remove temp files even when a step above failed
        for path in (vid, aud):
            if path and os.path.exists(path):
                os.remove(path)

# Build Gradio interface
with gr.Blocks(title="English Accent & Proficiency Analyzer") as demo:
    gr.Markdown("## 🎙️ English Accent Detection & Proficiency Analysis")
    with gr.Row():
        inp = gr.Textbox(label="Direct MP4 Video URL", placeholder="https://...")
        run = gr.Button("Analyze")
    with gr.Row():
        out1 = gr.Textbox(label="Detected Accent")
        out2 = gr.Textbox(label="Accent Classification Confidence Score")
    out3 = gr.Textbox(label="Transcript", lines=5)
    out4 = gr.Textbox(label="Proficiency Analysis", lines=10)

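    # api_name exposes this handler at /analyze for programmatic (e.g. gradio_client) calls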
    run.click(
        fn=analyze_url,
        inputs=inp,
        outputs=[out1, out2, out3, out4],
        api_name="analyze"
    )

if __name__ == "__main__":
    demo.launch()