Aditigo committed on
Commit
da1806c
·
verified ·
1 Parent(s): b24597e

Added Gradio app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -0
app.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ English Accent Detector - Analyzes speaker's accent from video URLs
4
+ """
5
+ from __future__ import annotations
6
+ import argparse
7
+ import random
8
+ import tempfile
9
+ from collections import Counter
10
+ from pathlib import Path
11
+ import time
12
+
13
+ import torch
14
+ import torchaudio
15
+ import gradio as gr
16
+ from speechbrain.inference.classifiers import EncoderClassifier
17
+ from yt_dlp import YoutubeDL
18
+ from huggingface_hub.utils import LocalEntryNotFoundError
19
+
20
+ # ─────────────── Model setup (with retry) ───────────────
21
# Model identifiers handed to EncoderClassifier.from_hparams as `source`.
ACCENT_MODEL_ID = "Jzuluaga/accent-id-commonaccent_ecapa"
LANG_MODEL_ID = "speechbrain/lang-id-voxlingua107-ecapa"

# Inference is pinned to the CPU: the free tier of Spaces has no GPU.
DEVICE = "cpu"
24
+
25
def load_with_retry(model_id: str, tries: int = 5, backoff: int = 5):
    """Load an ``EncoderClassifier``, retrying when the download fails.

    The pause between attempts grows linearly: ``backoff * attempt`` seconds
    (5 s, 10 s, 15 s, ... with the defaults).

    Args:
        model_id: Value passed as ``source`` to
            ``EncoderClassifier.from_hparams``.
        tries: Maximum number of attempts; must be at least 1.
        backoff: Base delay in seconds, multiplied by the attempt number.

    Returns:
        The classifier returned by ``EncoderClassifier.from_hparams``.

    Raises:
        ValueError: If ``tries`` is smaller than 1.
        LocalEntryNotFoundError: If the final attempt still fails.
    """
    if tries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of a classifier.
        raise ValueError(f"tries must be at least 1, got {tries}")

    for attempt in range(1, tries + 1):
        try:
            return EncoderClassifier.from_hparams(
                source=model_id,
                run_opts={"device": DEVICE},
            )
        except LocalEntryNotFoundError:
            if attempt == tries:
                # Out of attempts: let the caller see the original error.
                raise
            wait = backoff * attempt
            print(f"[{model_id}] download failed (try {attempt}/{tries}), retrying in {wait}s")
            time.sleep(wait)
39
+
40
# Both classifiers are created once, at import time, and reused by the
# classify_* helpers below.
accent_clf = load_with_retry(ACCENT_MODEL_ID)
lang_clf = load_with_retry(LANG_MODEL_ID)
42
+
43
+ # ─────────────── Helpers ───────────────
44
def sec_to_hms(sec: int) -> str:
    """Format a whole number of seconds as a zero-padded ``HH:MM:SS`` string."""
    hours, remainder = divmod(sec, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
49
+
50
def download_audio(url: str, out_path: Path) -> Path:
    """Download the best available audio stream of ``url`` with yt-dlp.

    Args:
        url: Address of the media to fetch.
        out_path: Destination path; its suffix is replaced by yt-dlp's
            ``%(ext)s`` template field in the output template.

    Returns:
        The path yt-dlp reports (via ``prepare_filename``) for the download.
    """
    output_template = str(out_path.with_suffix(".%(ext)s"))
    ydl_options = {
        "format": "bestaudio/best",
        "outtmpl": output_template,
        "postprocessors": [],
        "quiet": True,
    }
    with YoutubeDL(ydl_options) as downloader:
        metadata = downloader.extract_info(url, download=True)
        downloaded_name = downloader.prepare_filename(metadata)
    return Path(downloaded_name)
61
+
62
def extract_wav(src: Path, dst: Path, start: int, dur: int = 8) -> None:
    """Cut a clip out of ``src`` and save it as a 16 kHz, 16-bit PCM WAV.

    Args:
        src: Audio file to read from.
        dst: Path of the WAV file to write.
        start: Start of the clip, in seconds from the beginning of ``src``.
        dur: Length of the clip in seconds.
    """
    target_sr = 16000

    # torchaudio.load takes frame_offset / num_frames in frames of the
    # *source* file, so the window has to be computed with the source sample
    # rate. Using the 16 kHz target rate here would pick the wrong start and
    # a shorter clip for any source that is not already 16 kHz.
    src_sr = torchaudio.info(str(src)).sample_rate
    wav, orig_sr = torchaudio.load(
        str(src),
        frame_offset=int(start * src_sr),
        num_frames=int(dur * src_sr),
    )

    if orig_sr != target_sr:
        wav = torchaudio.transforms.Resample(orig_sr, target_sr)(wav)
    torchaudio.save(str(dst), wav, target_sr, encoding="PCM_S", bits_per_sample=16)
70
+
71
def pick_random_offsets(total_s: int, n: int, clip_s: int = 8) -> list[int]:
    """Pick up to ``n`` distinct random clip start times, in whole seconds.

    Every returned start leaves room for a full clip, i.e. it lies in
    ``0 .. total_s - clip_s`` inclusive.

    Args:
        total_s: Total audio length in seconds.
        n: Number of start times wanted. Non-integer values are truncated and
            negative values are treated as zero.
        clip_s: Length of one clip in seconds.

    Returns:
        A list of distinct start offsets in random order. It is shorter than
        ``n`` when fewer starts are possible, and empty when the audio is
        shorter than one clip.
    """
    # The count may arrive as a float (e.g. from a UI slider); random.sample
    # only accepts a non-negative int.
    wanted = max(0, int(n))
    max_start = int(total_s) - clip_s
    if wanted == 0 or max_start < 0:
        return []

    # random.sample works directly on a range, so no list has to be built.
    candidates = range(max_start + 1)
    return random.sample(candidates, min(wanted, len(candidates)))
77
+
78
+ # ─────────────── Classification ───────────────
79
def classify_language(wav: Path) -> tuple[str, float]:
    """Run the language-ID classifier on one audio file.

    Returns:
        The first predicted label and ``exp(score) * 100``, where ``score``
        is the second element returned by ``classify_batch``.
    """
    signal = lang_clf.load_audio(str(wav))
    _scores, top_score, _index, labels = lang_clf.classify_batch(signal)
    # The score is exponentiated before scaling, i.e. it is treated as a
    # log-probability — presumably matching this model's output; confirm.
    probability = float(top_score.exp().item())
    return labels[0], probability * 100
83
+
84
def classify_accent(wav: Path) -> tuple[str, float]:
    """Run the accent classifier on one audio file.

    Returns:
        The first predicted label and the raw ``classify_batch`` score
        multiplied by 100.
    """
    signal = accent_clf.load_audio(str(wav))
    _scores, top_score, _index, labels = accent_clf.classify_batch(signal)
    # NOTE(review): unlike classify_language, no exp() is applied to the score
    # here — confirm which scale this model's score is on.
    raw_score = float(top_score.item())
    return labels[0], raw_score * 100
88
+
89
def calculate_english_confidence(lang: str, lang_conf: float, accent_conf: float) -> float:
    """Blend language and accent confidence into one 0-100 English score.

    The score is 0.0 unless ``lang`` starts with "en" (case-insensitive).
    Otherwise it is 70 % language confidence plus 30 % accent confidence,
    clamped to the range 0.0 .. 100.0.
    """
    looks_english = lang.lower().startswith("en")
    if looks_english:
        weighted = (lang_conf * 0.7) + (accent_conf * 0.3)
        floored = max(0.0, weighted)
        return min(100.0, floored)
    return 0.0
94
+
95
+ # ─────────────── Core pipeline ───────────────
96
def analyse_accent(url: str, n_samples: int = 4) -> dict:
    """Download a video's audio and report the speaker's language and accent.

    Steps:
      1. Download the audio track and measure its length.
      2. Classify the language of an 8-second clip from the middle.
      3. If it is English, classify the accent of up to ``n_samples`` random
         8-second clips.
      4. Report the most common accent and a blended confidence score.

    Args:
        url: Address of the video to analyse.
        n_samples: Number of random clips to classify; must be at least 1.
            Non-integer values are truncated.

    Returns:
        A result dictionary. On any failure it has a single ``"error"`` key;
        exceptions raised during processing are reported this way instead of
        being propagated.
    """
    # A whitespace-only URL is as useless as an empty one.
    url = (url or "").strip()
    if not url:
        return {"error": "Please provide a video URL."}

    # The count may arrive as a float (e.g. from a UI slider); normalise it
    # so the sampling helper receives an int.
    try:
        n_samples = int(n_samples)
    except (TypeError, ValueError):
        return {"error": "Number of samples must be at least 1."}
    if n_samples < 1:
        return {"error": "Number of samples must be at least 1."}

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        try:
            # 1) Download audio
            audio_file = download_audio(url, workdir / "audio")
            info = torchaudio.info(str(audio_file))
            total_s = int(info.num_frames / info.sample_rate)
            if total_s < 8:
                return {"error": "Audio shorter than 8 seconds."}

            # 2) Language detection on a clip centred on the middle of the audio
            mid_start = max(0, total_s // 2 - 4)
            lang_wav = workdir / "lang_check.wav"
            extract_wav(audio_file, lang_wav, start=mid_start)
            lang, lang_conf = classify_language(lang_wav)
            is_english = lang.lower().startswith("en")

            if not is_english:
                return {
                    "is_english_speaker": False,
                    "detected_language": lang,
                    "language_confidence": round(lang_conf, 1),
                    "accent_classification": "N/A",
                    "english_confidence_score": 0.0,
                    "summary": f"Non-English language detected: {lang} ({lang_conf:.1f}%)"
                }

            # 3) Accent analysis on random clips, reported in chronological order
            offsets = pick_random_offsets(total_s, n_samples)
            accent_results = []
            for i, start in enumerate(sorted(offsets)):
                clip_wav = workdir / f"clip_{i}.wav"
                extract_wav(audio_file, clip_wav, start=start)
                acc, conf = classify_accent(clip_wav)
                accent_results.append({
                    "clip": i + 1,
                    "time_range": f"{sec_to_hms(start)} - {sec_to_hms(start + 8)}",
                    "accent": acc,
                    "confidence": round(conf, 1),
                })

            # 4) Aggregate results: majority accent, averaged over its own clips
            labels = [r["accent"] for r in accent_results]
            most_common_accent, count = Counter(labels).most_common(1)[0]
            confs = [r["confidence"] for r in accent_results if r["accent"] == most_common_accent]
            avg_conf = sum(confs) / len(confs)
            eng_conf = calculate_english_confidence(lang, lang_conf, avg_conf)

            return {
                "is_english_speaker": True,
                "detected_language": "English",
                "language_confidence": round(lang_conf, 1),
                "accent_classification": most_common_accent,
                "accent_confidence": round(avg_conf, 1),
                "english_confidence_score": round(eng_conf, 1),
                "samples_analyzed": len(accent_results),
                # Short audio can yield fewer clips than requested, so the
                # denominator is the number of clips actually analysed.
                "consensus": f"{count}/{len(accent_results)} samples",
                "detailed_results": accent_results,
                "summary": (
                    f"English speaker detected with {most_common_accent} accent "
                    f"(confidence: {eng_conf:.1f}%)"
                )
            }

        except Exception as e:
            # UI boundary: report the failure in the result instead of raising.
            return {"error": f"Processing failed: {e}"}
168
+
169
+ # ─────────────── Gradio UI ───────────────
170
def app():
    """Build the Gradio Blocks interface for the accent detector.

    The layout has a URL box, a sample-count slider and an analyse button on
    one side and a JSON result panel on the other, followed by clickable
    example inputs. Clicking the button calls ``analyse_accent``.

    Returns:
        The assembled ``gr.Blocks`` object; the caller launches it.
    """
    with gr.Blocks(title="English Accent Detector") as demo:
        gr.Markdown(
            "# 🎙️ English Accent Detector\n"
            "**Analyze speaker's accent from video URLs**\n\n"
            "This tool:\n"
            "1. Accepts public video URLs (YouTube, Loom, direct MP4 links)\n"
            "2. Extracts audio from the video\n"
            "3. Analyzes if the speaker is an English language candidate\n"
            "4. Classifies the accent type and provides confidence scores\n"
        )

        with gr.Row():
            with gr.Column():
                url_input = gr.Text(
                    label="Video URL",
                    placeholder="Enter public video URL (YouTube, Loom, etc.)",
                    lines=1
                )
                samples_input = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=4,
                    step=1,
                    label="Number of audio samples to analyze",
                    info="More samples = more accurate but slower"
                )
                analyze_btn = gr.Button("🔍 Analyze Accent", variant="primary")

            with gr.Column():
                result_output = gr.JSON(label="Analysis Results")

        gr.Markdown("### Example URLs to try:")
        gr.Examples(
            examples=[
                ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", 4],
                ["https://www.youtube.com/shorts/VO6n9GTzSqU", 4],
            ],
            inputs=[url_input, samples_input],
            label="Click to load example"
        )

        # Wire the button to the analysis pipeline.
        analyze_btn.click(
            fn=analyse_accent,
            inputs=[url_input, samples_input],
            outputs=result_output
        )

    return demo
219
+
220
if __name__ == "__main__":
    # Command-line entry point; the only option is the port to serve on.
    cli = argparse.ArgumentParser(description="English Accent Detector")
    cli.add_argument("--port", type=int, default=7860, help="Port to run the server on")
    port = cli.parse_args().port

    demo = app()
    # On Hugging Face Spaces, a public URL is provided automatically
    demo.launch(server_port=port)