aish31 commited on
Commit
ce70a28
·
1 Parent(s): acfc86d

Add application file

Browse files
Files changed (3) hide show
  1. README.md +27 -6
  2. app.py +253 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1,34 @@
1
  ---
2
- title: AccentDetect
3
- emoji: 🐠
4
- colorFrom: gray
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
- short_description: 'tool for detecting and classifying English accents '
11
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: English Accent Detector
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Analyze speaker's accent from video URLs with English detection
11
  ---
12
+ # English Accent Detector
13
+
14
+ A Gradio-based tool for detecting and classifying English accents from public video URLs (e.g., YouTube, Loom). It first determines if the speaker is speaking English, then analyzes accent patterns with confidence scores for American, British, Australian, and other English variants.
15
+
16
+ ---
17
+
18
+ ## 🚀 Features
19
+
20
+ * **Public URL support**: Download audio from YouTube, Loom, or direct MP4 links via `yt_dlp`.
21
+ * **English language detection**: Uses SpeechBrain's language-ID model to filter for English speakers.
22
+ * **Random-slice sampling**: Analyzes multiple random 8-second windows for robust classification.
23
+ * **Accent classification**: Classifies each slice using a pretrained ECAPA model and aggregates results.
24
+ * **Comprehensive confidence scoring**: Returns English detection confidence and accent classification scores.
25
+ * **Interactive UI**: Simple Gradio interface—paste URL, choose sample count, click *Analyze Accent*.
26
+
27
+ ---
28
+
29
+ ## 🛠️ Requirements
30
+
31
+ * Python 3.8+ (tested on 3.10)
32
+ * Dependencies listed in `requirements.txt`
33
+
34
 
 
app.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ English Accent Detector - Analyzes speaker's accent from video URLs
4
+ """
5
+ from __future__ import annotations
6
+ import argparse, random, tempfile
7
+ from collections import Counter
8
+ from pathlib import Path
9
+
10
+ import torch
11
+ import torchaudio
12
+ import gradio as gr
13
+ from speechbrain.inference.classifiers import EncoderClassifier
14
+ from yt_dlp import YoutubeDL
15
+
16
# ─────────────── Model setup ───────────────
# Hugging Face Hub IDs of the two pretrained SpeechBrain classifiers:
# accent identification (CommonAccent ECAPA) and spoken-language ID (VoxLingua107).
ACCENT_MODEL_ID = "Jzuluaga/accent-id-commonaccent_ecapa"
LANG_MODEL_ID = "speechbrain/lang-id-voxlingua107-ecapa"

# Force CPU — no GPU is assumed for inference.
DEVICE = "cpu"

# Both models are downloaded (on first run) and loaded once at import time,
# so the Gradio handlers below can reuse them across requests.
accent_clf = EncoderClassifier.from_hparams(
    source=ACCENT_MODEL_ID,
    run_opts={"device": DEVICE}
)
lang_clf = EncoderClassifier.from_hparams(
    source=LANG_MODEL_ID,
    run_opts={"device": DEVICE}
)
32
+ # ─────────────── Helpers ───────────────
33
def sec_to_hms(sec: int) -> str:
    """Format a non-negative second count as an HH:MM:SS timestamp."""
    total_minutes, seconds = divmod(sec, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
38
+
39
def download_audio(url: str, out_path: Path) -> Path:
    """
    Fetch the best available audio-only stream for *url* via the yt_dlp API.

    The file is written next to *out_path* with whatever extension the
    source provides (.m4a, .webm, ...); the path actually saved is returned.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": str(out_path.with_suffix(".%(ext)s")),
        "postprocessors": [],
        "quiet": True,
    }
    with YoutubeDL(ydl_opts) as downloader:
        media_info = downloader.extract_info(url, download=True)
        saved_name = downloader.prepare_filename(media_info)
    return Path(saved_name)
54
+
55
def extract_wav(src: Path, dst: Path, start: int, dur: int = 8) -> None:
    """Write a `dur`-second, 16 kHz, 16-bit PCM WAV slice of *src* to *dst*.

    Args:
        src: audio file to slice (any format torchaudio can decode).
        dst: output WAV path.
        start: slice start, in seconds.
        dur: slice length, in seconds (default 8, matching the sampling windows).

    Bug fix: ``torchaudio.load`` interprets ``frame_offset``/``num_frames``
    in the SOURCE file's native sample rate, but the original code computed
    both from the 16 kHz target rate — so any download that wasn't already
    16 kHz was sliced at the wrong position and wrong length. We now read
    the source sample rate first and compute frame counts from it.
    """
    target_sr = 16000

    # Frame offsets must be expressed in the source file's own sample rate.
    orig_sr = torchaudio.info(str(src)).sample_rate
    offset = start * orig_sr
    frames = dur * orig_sr

    wav, sr = torchaudio.load(str(src),
                              frame_offset=offset,
                              num_frames=frames)
    if sr != target_sr:
        # Resample the slice down/up to the 16 kHz rate the models expect.
        wav = torchaudio.transforms.Resample(sr, target_sr)(wav)

    torchaudio.save(str(dst), wav, target_sr,
                    encoding="PCM_S", bits_per_sample=16)
68
+
69
def pick_random_offsets(total_s: int, n: int) -> list[int]:
    """Pick up to *n* distinct start offsets (seconds) for 8-second windows.

    Valid starts are 0 .. total_s - 8 inclusive; if fewer than *n* exist,
    all of them are eligible.
    """
    candidates = list(range(total_s - 8 + 1))
    count = min(n, len(candidates))
    return random.sample(candidates, count)
75
+
76
+ # ─────────────── Classification ───────────────
77
def classify_language(wav: Path) -> tuple[str, float]:
    """Run the VoxLingua language-ID model on *wav*; return (label, confidence %)."""
    signal = lang_clf.load_audio(str(wav))
    _, log_prob, _, labels = lang_clf.classify_batch(signal)
    confidence = float(log_prob.exp().item()) * 100
    return labels[0], confidence
81
+
82
def classify_accent(wav: Path) -> tuple[str, float]:
    """Run the CommonAccent ECAPA model on *wav*; return (accent label, confidence %).

    Fix: ``classify_batch`` returns a log-probability score; the original
    multiplied the raw log value by 100, which yields a non-percentage
    (possibly negative) "confidence". Apply ``.exp()`` first — exactly as
    the sibling ``classify_language`` already does — so the result is a
    genuine 0-100 percentage.
    """
    sig = accent_clf.load_audio(str(wav))
    _, log_p, _, label = accent_clf.classify_batch(sig)
    return label[0], float(log_p.exp().item()) * 100
86
+
87
def calculate_english_confidence(lang: str, lang_conf: float, accent_conf: float) -> float:
    """Blend language-ID and accent confidences into one 0-100 English score.

    Returns 0.0 outright for any non-English label; otherwise a 70/30
    weighted mix of the two confidences, clamped to [0, 100]. Language
    detection carries more weight because it is the primary filter.
    """
    is_english = lang.lower().startswith("en")
    if not is_english:
        return 0.0

    blended = (lang_conf * 0.7) + (accent_conf * 0.3)
    # Clamp into the displayable percentage range.
    return max(0.0, min(100.0, blended))
98
+
99
+ # ─────────────── Core pipeline ───────────────
100
def analyse_accent(url: str, n_samples: int = 4) -> dict:
    """
    Main function to analyze accent from video URL

    Pipeline: download audio -> check duration -> language-ID on a middle
    8 s slice -> (only if English) accent-ID on ``n_samples`` random 8 s
    slices -> majority vote plus a blended 0-100 confidence.

    Always returns a JSON-serializable dict; any failure is reported as
    ``{"error": ...}`` rather than raised, so the Gradio JSON widget can
    render it directly.
    """
    # Guard clauses for empty URL / nonsensical sample count.
    if not url:
        return {"error": "Please provide a video URL."}
    if n_samples < 1:
        return {"error": "Number of samples must be at least 1."}
    # NOTE(review): the Gradio slider may deliver n_samples as a float;
    # random.sample() inside pick_random_offsets requires an int — confirm.

    # All intermediate audio files live in a temp dir that is deleted
    # automatically when the with-block exits (success or failure).
    with tempfile.TemporaryDirectory() as td:
        td = Path(td)

        try:
            # 1) Download audio from video (yt_dlp picks the real extension)
            audio_file = td / "audio"
            audio_file = download_audio(url, audio_file)

            # 2) Read metadata for total seconds
            info = torchaudio.info(str(audio_file))
            total_s = int(info.num_frames / info.sample_rate)
            if total_s < 8:
                # Too short for even one 8-second analysis window.
                return {"error": "Audio shorter than 8 seconds."}

            # 3) Language detection on middle slice (8 s centered on midpoint)
            mid_start = max(0, total_s // 2 - 4)
            lang_wav = td / "lang_check.wav"
            extract_wav(audio_file, lang_wav, start=mid_start)
            lang, lang_conf = classify_language(lang_wav)

            # 4) Check if English is detected (any label starting "en" counts)
            is_english = lang.lower().startswith("en")

            if not is_english:
                # Short-circuit: skip accent analysis for non-English speech.
                return {
                    "is_english_speaker": False,
                    "detected_language": lang,
                    "language_confidence": round(lang_conf, 1),
                    "accent_classification": "N/A",
                    "english_confidence_score": 0.0,
                    "summary": f"Non-English language detected: {lang} ({lang_conf:.1f}%)"
                }

            # 5) Accent analysis on multiple random slices
            offsets = pick_random_offsets(total_s, n_samples)
            accent_results = []

            # Process slices in chronological order so clip numbering is stable.
            for i, start in enumerate(sorted(offsets)):
                clip_wav = td / f"clip_{i}.wav"
                extract_wav(audio_file, clip_wav, start=start)
                acc, conf = classify_accent(clip_wav)
                accent_results.append({
                    "clip": i + 1,
                    "time_range": f"{sec_to_hms(start)} - {sec_to_hms(start + 8)}",
                    "accent": acc,
                    "confidence": round(conf, 1),
                })

            # 6) Determine overall accent classification by majority vote
            accent_labels = [r["accent"] for r in accent_results]
            accent_counter = Counter(accent_labels)
            most_common_accent, accent_count = accent_counter.most_common(1)[0]

            # Calculate average confidence over the winning accent's clips only
            matching_confidences = [r["confidence"] for r in accent_results
                                    if r["accent"] == most_common_accent]
            avg_accent_conf = sum(matching_confidences) / len(matching_confidences)

            # Calculate overall English confidence score (70/30 blend, clamped)
            english_confidence = calculate_english_confidence(lang, lang_conf, avg_accent_conf)

            return {
                "is_english_speaker": True,
                "detected_language": "English",
                "language_confidence": round(lang_conf, 1),
                "accent_classification": most_common_accent,
                "accent_confidence": round(avg_accent_conf, 1),
                "english_confidence_score": round(english_confidence, 1),
                "samples_analyzed": len(accent_results),
                "consensus": f"{accent_count}/{n_samples} samples",
                "detailed_results": accent_results,
                "summary": (
                    f"English speaker detected with {most_common_accent} accent "
                    f"(confidence: {english_confidence:.1f}%)"
                )
            }

        except Exception as e:
            # Broad catch is deliberate: any failure (download, decode,
            # model inference) becomes a user-visible error payload.
            return {"error": f"Processing failed: {str(e)}"}
188
+
189
+ # ─────────────── Gradio UI ───────────────
190
def app():
    """Build and return the Gradio Blocks UI for the accent detector.

    Layout: a header, a left column with URL input / sample-count slider /
    analyze button, a right column with a JSON results panel, clickable
    example URLs, and the button wired to ``analyse_accent``.

    Fix: the emoji in the page heading and the analyze button were
    mojibake (UTF-8 bytes mis-decoded as Latin-1); restored the intended
    🎙️ and 🔍 glyphs.
    """
    with gr.Blocks(title="English Accent Detector") as demo:
        # Header / usage instructions shown above the controls.
        gr.Markdown(
            "# 🎙️ English Accent Detector\n"
            "**Analyze speaker's accent from video URLs**\n\n"
            "This tool:\n"
            "1. Accepts public video URLs (YouTube, Loom, direct MP4 links)\n"
            "2. Extracts audio from the video\n"
            "3. Analyzes if the speaker is an English language candidate\n"
            "4. Classifies the accent type and provides confidence scores\n"
        )

        with gr.Row():
            with gr.Column():
                url_input = gr.Text(
                    label="Video URL",
                    placeholder="Enter public video URL (YouTube, Loom, etc.)",
                    lines=1
                )
                samples_input = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=4,
                    step=1,
                    label="Number of audio samples to analyze",
                    info="More samples = more accurate but slower"
                )
                analyze_btn = gr.Button("🔍 Analyze Accent", variant="primary")

            with gr.Column():
                result_output = gr.JSON(
                    label="Analysis Results",
                    show_label=True
                )

        # Clickable example inputs.
        gr.Markdown("### Example URLs to try:")
        gr.Examples(
            examples=[
                ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", 4],
                ["https://www.youtube.com/shorts/VO6n9GTzSqU", 4],
            ],
            inputs=[url_input, samples_input],
            label="Click to load example"
        )

        # Wire the button to the analysis pipeline; result rendered as JSON.
        analyze_btn.click(
            fn=analyse_accent,
            inputs=[url_input, samples_input],
            outputs=result_output
        )

    return demo
243
+
244
if __name__ == "__main__":
    # CLI entry point: parse server options, build the UI, and serve it.
    parser = argparse.ArgumentParser(description="English Accent Detector")
    parser.add_argument("--share", action="store_true",
                        help="Enable public share link")
    parser.add_argument("--port", type=int, default=7860,
                        help="Port to run the server on")
    args = parser.parse_args()

    # Launch blocks (7860 is the HF Spaces / Gradio default port).
    demo = app()
    demo.launch(share=args.share, server_port=args.port)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=3.0
2
+ torch
3
+ torchaudio
4
+ speechbrain
5
+ yt-dlp