Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| """ | |
| English Accent Detector - Analyzes speaker's accent from video URLs | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import random | |
| import tempfile | |
| from collections import Counter | |
| from pathlib import Path | |
| import time | |
| import torch | |
| import torchaudio | |
| import gradio as gr | |
| from speechbrain.inference.classifiers import EncoderClassifier | |
| from yt_dlp import YoutubeDL | |
| from huggingface_hub.utils import LocalEntryNotFoundError | |
# ─────────────── Model setup (with retry) ───────────────
# Model identifiers passed as ``source`` to EncoderClassifier.from_hparams.
ACCENT_MODEL_ID = "Jzuluaga/accent-id-commonaccent_ecapa"
LANG_MODEL_ID = "speechbrain/lang-id-voxlingua107-ecapa"
# Device handed to SpeechBrain through ``run_opts``.
DEVICE = "cpu" # force CPU; Spaces' free tier has no GPU
def load_with_retry(model_id: str, tries: int = 5, backoff: int = 5):
    """Load a SpeechBrain ``EncoderClassifier``, retrying failed downloads.

    Only ``LocalEntryNotFoundError`` (from ``huggingface_hub``) is treated as
    retryable; any other exception propagates immediately.  The pause between
    attempts grows linearly: ``backoff * attempt`` seconds.

    Args:
        model_id: Model source passed to ``EncoderClassifier.from_hparams``.
        tries: Maximum number of attempts; must be at least 1.
        backoff: Base delay in seconds, multiplied by the attempt number.

    Returns:
        The classifier returned by ``EncoderClassifier.from_hparams``.

    Raises:
        ValueError: If ``tries`` is less than 1.
        LocalEntryNotFoundError: If the final attempt fails as well.
    """
    if tries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of a classifier.
        raise ValueError(f"tries must be at least 1, got {tries}")

    for attempt in range(1, tries + 1):
        try:
            return EncoderClassifier.from_hparams(
                source=model_id,
                run_opts={"device": DEVICE},
            )
        except LocalEntryNotFoundError:
            if attempt == tries:
                # Out of attempts: let the caller see the original error.
                raise
            wait = backoff * attempt
            print(f"[{model_id}] download failed (try {attempt}/{tries}), retrying in {wait}s")
            time.sleep(wait)
# Load both classifiers once at import time; the classify_* helpers below
# read these module-level objects.
accent_clf = load_with_retry(ACCENT_MODEL_ID)
lang_clf = load_with_retry(LANG_MODEL_ID)
# ─────────────── Helpers ───────────────
def sec_to_hms(sec: int) -> str:
    """Format a whole number of seconds as a zero-padded ``HH:MM:SS`` string."""
    hours, remainder = divmod(sec, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
def download_audio(url: str, out_path: Path) -> Path:
    """Download the best available audio stream of ``url`` with yt-dlp.

    Args:
        url: Address of the video or media file to fetch.
        out_path: Destination path; its suffix is replaced by yt-dlp's
            ``%(ext)s`` output-template field, so the final extension is
            whatever format yt-dlp actually downloads.

    Returns:
        The file path yt-dlp reports for the downloaded media.
    """
    opts = {
        "format": "bestaudio/best",
        "outtmpl": str(out_path.with_suffix(".%(ext)s")),
        "postprocessors": [],
        "quiet": True,
        # A URL that also carries a playlist id would otherwise expand to the
        # whole playlist, and prepare_filename() on the playlist info does not
        # name a downloaded media file.
        "noplaylist": True,
    }
    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
    return Path(filename)
def extract_wav(src: Path, dst: Path, start: int, dur: int = 8) -> None:
    """Cut a clip out of ``src`` and save it as a 16 kHz, 16-bit PCM WAV file.

    Args:
        src: Audio file to read from.
        dst: Path of the WAV file to write.
        start: Clip start position, in seconds from the beginning of ``src``.
        dur: Clip length in seconds.
    """
    target_sr = 16000
    # frame_offset / num_frames are counted in frames of the *source* file, so
    # they must be derived from its native sample rate.  Using the 16 kHz
    # target rate here would start too early and read too little for e.g.
    # 44.1 kHz or 48 kHz input.
    native_sr = torchaudio.info(str(src)).sample_rate
    wav, orig_sr = torchaudio.load(
        str(src),
        frame_offset=start * native_sr,
        num_frames=dur * native_sr,
    )
    if orig_sr != target_sr:
        wav = torchaudio.transforms.Resample(orig_sr, target_sr)(wav)
    torchaudio.save(str(dst), wav, target_sr, encoding="PCM_S", bits_per_sample=16)
def pick_random_offsets(total_s: int, n: int, clip_len: int = 8) -> list[int]:
    """Choose up to ``n`` distinct clip start times, in whole seconds, at random.

    Valid starts are ``0 .. total_s - clip_len`` inclusive, so a clip of
    ``clip_len`` seconds beginning at any returned offset fits inside the
    audio.

    Args:
        total_s: Total audio length in seconds.
        n: Number of offsets requested.
        clip_len: Length in seconds of each clip to be cut.

    Returns:
        Distinct offsets in random order.  Fewer than ``n`` are returned when
        not enough valid starts exist, and an empty list when the audio is
        shorter than ``clip_len``.
    """
    last_start = total_s - clip_len
    # A range is a sequence, so it can be sampled directly without building a
    # list; it is empty when last_start is negative.
    candidates = range(last_start + 1)
    return random.sample(candidates, min(n, len(candidates)))
# ─────────────── Classification ───────────────
def classify_language(wav: Path) -> tuple[str, float]:
    """Run the language-ID classifier on the audio file ``wav``.

    Returns:
        ``(label, confidence)``: the first predicted label and
        ``exp(score) * 100``.  NOTE(review): this treats the classifier's
        score as a log-probability; confirm against the model card.
    """
    signal = lang_clf.load_audio(str(wav))
    _, score, _, labels = lang_clf.classify_batch(signal)
    probability = score.exp().item()
    return labels[0], float(probability) * 100
def classify_accent(wav: Path) -> tuple[str, float]:
    """Run the accent classifier on the audio file ``wav``.

    Returns:
        ``(label, confidence)``: the first predicted label and the raw
        classifier score multiplied by 100.  NOTE(review): the score is used
        as-is (no ``exp``); confirm what scale this model's score is on before
        reading the value as a percentage.
    """
    signal = accent_clf.load_audio(str(wav))
    _, score, _, labels = accent_clf.classify_batch(signal)
    raw_score = score.item()
    return labels[0], float(raw_score) * 100
def calculate_english_confidence(lang: str, lang_conf: float, accent_conf: float) -> float:
    """Blend language and accent confidence into one 0-100 English score.

    Args:
        lang: Detected language label; counts as English when it starts with
            ``"en"`` (case-insensitive).
        lang_conf: Language-detection confidence.
        accent_conf: Accent-classification confidence.

    Returns:
        ``0.0`` for non-English labels, otherwise
        ``0.7 * lang_conf + 0.3 * accent_conf`` clamped to ``[0, 100]``.
    """
    speaks_english = lang.lower().startswith("en")
    if speaks_english:
        weighted = (lang_conf * 0.7) + (accent_conf * 0.3)
        # Clamp into the percentage range.
        return min(100.0, max(0.0, weighted))
    return 0.0
# ─────────────── Core pipeline ───────────────
def analyse_accent(url: str, n_samples: int = 4) -> dict:
    """Download a video's audio and report the speaker's language and accent.

    Steps: download the audio, run language detection on an 8-second clip
    from the middle, and, if the language is English, classify the accent of
    up to ``n_samples`` randomly placed 8-second clips and take the majority
    label.

    Args:
        url: Public video URL.
        n_samples: Number of clips to classify.  Numeric values such as
            ``4.0`` are accepted and truncated to an integer.

    Returns:
        A dict.  On failure it holds a single ``"error"`` key.  For
        non-English audio it reports the detected language with
        ``"is_english_speaker": False``.  Otherwise it holds the majority
        accent, averaged confidence, per-clip details and a summary string.
        Exceptions raised while processing are caught and reported through
        the ``"error"`` key rather than propagated.
    """
    if not url or not url.strip():
        return {"error": "Please provide a video URL."}
    # Callers such as UI widgets may hand the count over as a float, which
    # random.sample() rejects; normalise it once up front.
    n_samples = int(n_samples)
    if n_samples < 1:
        return {"error": "Number of samples must be at least 1."}
    url = url.strip()

    clip_s = 8  # clip length in seconds; equals extract_wav's default ``dur``

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        try:
            # 1) Download audio
            audio_file = download_audio(url, workdir / "audio")
            info = torchaudio.info(str(audio_file))
            total_s = int(info.num_frames / info.sample_rate)
            if total_s < clip_s:
                return {"error": "Audio shorter than 8 seconds."}

            # 2) Language detection on a clip centred in the recording
            mid_start = max(0, total_s // 2 - clip_s // 2)
            lang_wav = workdir / "lang_check.wav"
            extract_wav(audio_file, lang_wav, start=mid_start)
            lang, lang_conf = classify_language(lang_wav)
            if not lang.lower().startswith("en"):
                return {
                    "is_english_speaker": False,
                    "detected_language": lang,
                    "language_confidence": round(lang_conf, 1),
                    "accent_classification": "N/A",
                    "english_confidence_score": 0.0,
                    "summary": f"Non-English language detected: {lang} ({lang_conf:.1f}%)",
                }

            # 3) Accent analysis on randomly placed clips, in chronological order
            offsets = pick_random_offsets(total_s, n_samples)
            accent_results = []
            for i, start in enumerate(sorted(offsets)):
                clip_wav = workdir / f"clip_{i}.wav"
                extract_wav(audio_file, clip_wav, start=start)
                acc, conf = classify_accent(clip_wav)
                accent_results.append({
                    "clip": i + 1,
                    "time_range": f"{sec_to_hms(start)} - {sec_to_hms(start + clip_s)}",
                    "accent": acc,
                    "confidence": round(conf, 1),
                })

            # 4) Aggregate: majority label, confidence averaged over its clips
            labels = [r["accent"] for r in accent_results]
            most_common_accent, count = Counter(labels).most_common(1)[0]
            confs = [
                r["confidence"]
                for r in accent_results
                if r["accent"] == most_common_accent
            ]
            avg_conf = sum(confs) / len(confs)
            eng_conf = calculate_english_confidence(lang, lang_conf, avg_conf)
            analysed = len(accent_results)
            return {
                "is_english_speaker": True,
                "detected_language": "English",
                "language_confidence": round(lang_conf, 1),
                "accent_classification": most_common_accent,
                "accent_confidence": round(avg_conf, 1),
                "english_confidence_score": round(eng_conf, 1),
                "samples_analyzed": analysed,
                # Short audio can yield fewer clips than requested, so the
                # ratio is taken over the clips actually analysed.
                "consensus": f"{count}/{analysed} samples",
                "detailed_results": accent_results,
                "summary": (
                    f"English speaker detected with {most_common_accent} accent "
                    f"(confidence: {eng_conf:.1f}%)"
                ),
            }
        except Exception as e:
            # UI boundary: surface any failure as a message instead of a traceback.
            return {"error": f"Processing failed: {e}"}
# ─────────────── Gradio UI ───────────────
def app():
    """Build the Gradio Blocks UI for the accent detector.

    The page shows a heading with usage notes, then a row with an input
    column (URL box, sample-count slider, analyse button) and an output
    column (JSON result viewer), followed by clickable example inputs.  The
    button runs ``analyse_accent`` on the URL and slider values.

    Returns:
        The assembled ``gr.Blocks`` object; launching it is left to the caller.
    """
    with gr.Blocks(title="English Accent Detector") as demo:
        gr.Markdown(
            "# 🎙️ English Accent Detector\n"
            "**Analyze speaker's accent from video URLs**\n\n"
            "This tool:\n"
            "1. Accepts public video URLs (YouTube, Loom, direct MP4 links)\n"
            "2. Extracts audio from the video\n"
            "3. Analyzes if the speaker is an English language candidate\n"
            "4. Classifies the accent type and provides confidence scores\n"
        )
        with gr.Row():
            with gr.Column():
                url_input = gr.Text(
                    label="Video URL",
                    placeholder="Enter public video URL (YouTube, Loom, etc.)",
                    lines=1,
                )
                samples_input = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=4,
                    step=1,
                    label="Number of audio samples to analyze",
                    info="More samples = more accurate but slower",
                )
                analyze_btn = gr.Button("🔍 Analyze Accent", variant="primary")
            with gr.Column():
                result_output = gr.JSON(label="Analysis Results")

        gr.Markdown("### Example URLs to try:")
        gr.Examples(
            examples=[
                ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", 4],
                ["https://www.youtube.com/shorts/VO6n9GTzSqU", 4],
            ],
            inputs=[url_input, samples_input],
            label="Click to load example",
        )

        # Wire the button to the analysis pipeline.
        analyze_btn.click(
            fn=analyse_accent,
            inputs=[url_input, samples_input],
            outputs=result_output,
        )
    return demo
if __name__ == "__main__":
    # Script entry point: read the port from the command line, build the UI
    # and start the server.
    cli_parser = argparse.ArgumentParser(description="English Accent Detector")
    cli_parser.add_argument("--port", type=int, default=7860, help="Port to run the server on")
    cli_args = cli_parser.parse_args()

    # On Hugging Face Spaces, a public URL is provided automatically
    demo = app()
    demo.launch(server_port=cli_args.port)