Spaces:
Build error
Build error
Commit
·
5488aaa
1
Parent(s):
99f88da
initial commit
Browse files- .gitignore +96 -0
- README.md +77 -6
- app.py +161 -0
- local.py +161 -0
- requirements.txt +9 -0
- test.py +8 -0
.gitignore
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
env/
|
12 |
+
venv/
|
13 |
+
ENV/
|
14 |
+
build/
|
15 |
+
develop-eggs/
|
16 |
+
dist/
|
17 |
+
downloads/
|
18 |
+
eggs/
|
19 |
+
.eggs/
|
20 |
+
lib/
|
21 |
+
lib64/
|
22 |
+
parts/
|
23 |
+
sdist/
|
24 |
+
var/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
|
29 |
+
# Virtual environments
|
30 |
+
.venv/
|
31 |
+
venv/
|
32 |
+
ENV/
|
33 |
+
|
34 |
+
# PyInstaller
|
35 |
+
*.manifest
|
36 |
+
*.spec
|
37 |
+
|
38 |
+
# Installer logs
|
39 |
+
pip-log.txt
|
40 |
+
pip-delete-this-directory.txt
|
41 |
+
|
42 |
+
# Unit test / coverage reports
|
43 |
+
htmlcov/
|
44 |
+
.tox/
|
45 |
+
.nox/
|
46 |
+
.coverage
|
47 |
+
.coverage.*
|
48 |
+
.cache
|
49 |
+
nosetests.xml
|
50 |
+
coverage.xml
|
51 |
+
*.cover
|
52 |
+
.hypothesis/
|
53 |
+
.pytest_cache/
|
54 |
+
|
55 |
+
# Translations
|
56 |
+
*.mo
|
57 |
+
*.pot
|
58 |
+
|
59 |
+
# Django stuff
|
60 |
+
*.log
|
61 |
+
|
62 |
+
# Flask stuff
|
63 |
+
instance/
|
64 |
+
.webassets-cache
|
65 |
+
|
66 |
+
# Scrapy stuff
|
67 |
+
.scrapy
|
68 |
+
|
69 |
+
# Sphinx documentation
|
70 |
+
docs/_build/
|
71 |
+
|
72 |
+
# PyBuilder
|
73 |
+
target/
|
74 |
+
|
75 |
+
# IPython
|
76 |
+
profile_default/
|
77 |
+
ipython_config.py
|
78 |
+
|
79 |
+
# mypy
|
80 |
+
.mypy_cache/
|
81 |
+
.dmypy.json
|
82 |
+
dmypy.json
|
83 |
+
|
84 |
+
# Pyre
|
85 |
+
.pyre/
|
86 |
+
|
87 |
+
# IDEs
|
88 |
+
.vscode/
|
89 |
+
.idea/
|
90 |
+
|
91 |
+
# Heavy files
|
92 |
+
*.h5
|
93 |
+
*.pt
|
94 |
+
*.pkl
|
95 |
+
*.ckpt
|
96 |
+
/model
|
README.md
CHANGED
@@ -1,12 +1,83 @@
|
|
1 |
---
|
2 |
-
title: Accent Classifier
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Accent Classifier + Transcriber
|
3 |
+
emoji: 🎙️
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: "4.20.0"
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
|
13 |
+
# Accent Classifier + Speech Transcriber
|
14 |
+
|
15 |
+
This Gradio app allows you to:
|
16 |
+
|
17 |
+
- Upload or link to audio/video files
|
18 |
+
- Automatically transcribe the speech (via OpenAI Whisper)
|
19 |
+
- Detect the speaker's accent (28-class Wav2Vec2 model)
|
20 |
+
- View a top-5 ranked list of likely accents with confidence scores
|
21 |
+
|
22 |
+
---
|
23 |
+
|
24 |
+
## How to Use
|
25 |
+
|
26 |
+
Option 1: Upload an audio file
|
27 |
+
- Supported formats: .mp3, .wav
|
28 |
+
|
29 |
+
Option 2: Upload a video file
|
30 |
+
- Supported format: .mp4 (audio will be extracted automatically)
|
31 |
+
|
32 |
+
Option 3: Paste a direct .mp4 video URL
|
33 |
+
- Must be a direct video file URL (not a webpage)
|
34 |
+
- Example: a file hosted on archive.org or a CDN
|
35 |
+
|
36 |
+
---
|
37 |
+
|
38 |
+
|
39 |
+
## Not Supported
|
40 |
+
|
41 |
+
- Loom, YouTube, Dropbox, or other webpage links (they don't serve real video files)
|
42 |
+
- Download the video manually and upload it if needed
|
43 |
+
|
44 |
+
---
|
45 |
+
|
46 |
+
## Models Used
|
47 |
+
|
48 |
+
Transcription:
|
49 |
+
- openai/whisper-tiny: https://huggingface.co/openai/whisper-tiny
|
50 |
+
|
51 |
+
Accent Classification:
|
52 |
+
- ylacombe/accent-classifier: https://huggingface.co/ylacombe/accent-classifier
|
53 |
+
|
54 |
+
---
|
55 |
+
|
56 |
+
## Dependencies
|
57 |
+
|
58 |
+
Handled automatically in Hugging Face Spaces.
|
59 |
+
For local testing:
|
60 |
+
|
61 |
+
pip install gradio transformers torch "moviepy==1.0.3" requests safetensors soundfile scipy
|
62 |
+
|
63 |
+
You must also install ffmpeg:
|
64 |
+
|
65 |
+
- macOS: brew install ffmpeg
|
66 |
+
- Ubuntu: sudo apt install ffmpeg
|
67 |
+
- Windows: Download from https://ffmpeg.org/
|
68 |
+
|
69 |
+
---
|
70 |
+
|
71 |
+
## How It Works
|
72 |
+
|
73 |
+
1. Audio is extracted (if input is a video)
|
74 |
+
2. Audio is converted to .wav and resampled to 16 kHz
|
75 |
+
3. Speech is transcribed using Whisper
|
76 |
+
4. Accent is classified using a Wav2Vec2 model
|
77 |
+
5. Output includes:
|
78 |
+
- Top accent prediction
|
79 |
+
- Confidence score
|
80 |
+
- Top-5 accent list
|
81 |
+
- Full transcription
|
82 |
+
|
83 |
+
---
|
app.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
import shutil
|
5 |
+
import requests
|
6 |
+
import subprocess
|
7 |
+
import soundfile as sf
|
8 |
+
from scipy.signal import resample
|
9 |
+
from moviepy.editor import VideoFileClip, AudioFileClip
|
10 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
|
11 |
+
|
12 |
+
# === Constants ===
|
13 |
+
# Fixed file names (relative to the working directory) for intermediate artifacts.
TEMP_VIDEO = "temp_video.mp4"
RAW_AUDIO = "raw_audio_input"
CONVERTED_AUDIO = "converted_audio.wav"
# Model identifier passed to from_pretrained for the accent classifier.
MODEL_REPO = "ylacombe/accent-classifier"

# Alternative kept for reference: load the classifier from a local "model" directory.
# MODEL_DIR = "model"
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)

# Everything below is loaded once at import time and shared by all requests.
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
model.eval()
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names the i-th logit.
LABELS = [model.config.id2label[idx] for idx in range(len(model.config.id2label))]
|
31 |
+
|
32 |
+
# === Helpers ===
|
33 |
+
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file to ``output_path`` by invoking ffmpeg.

    Args:
        input_path: Path of the file handed to ffmpeg as input.
        output_path: Destination path; an existing file is overwritten (``-y``).

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status or no output
            file exists afterwards.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Without this check a failed conversion would only surface later as a
    # confusing "file not found"/decode error in the caller.
    if result.returncode != 0 or not os.path.exists(output_path):
        log_lines = result.stderr.decode("utf-8", errors="replace").strip().splitlines()
        reason = log_lines[-1] if log_lines else "unknown error"
        raise RuntimeError(f"FFmpeg failed to convert audio: {reason}")

    return output_path
|
37 |
+
|
38 |
+
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video opened with ``VideoFileClip``.
        output_path: Destination path for the extracted audio.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the clip has no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always release the clip's readers, including on the error path,
        # so the video file is not kept open between requests.
        clip.close()
    return output_path
|
44 |
+
|
45 |
+
def download_video(url, filename=TEMP_VIDEO):
    """Download a video from ``url`` and remux it with ffmpeg into ``filename``.

    The response body is streamed to a temporary file, then copied
    (``-c copy``, no re-encoding) into ``filename`` with ``+faststart``.

    Args:
        url: Direct URL of a video file.
        filename: Destination path of the remuxed video.

    Returns:
        ``filename``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response Content-Type does not start with
            ``video/`` or ffmpeg fails to produce a non-empty output file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager guarantees the streamed connection is released.
        with requests.get(url, headers=headers, stream=True, timeout=15) as r:
            r.raise_for_status()

            if not r.headers.get("Content-Type", "").startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")

            with open(temp_download, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Remove the raw download on every path, not only on success.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
|
70 |
+
|
71 |
+
def classify_accent(audio_path):
    """Classify the accent of the speech in an audio file.

    The audio is read with soundfile, averaged across axis 1 when it has
    more than one dimension, resampled to 16000 Hz if needed, and passed
    through the module-level ``feature_extractor`` and ``model``.

    Args:
        audio_path: Path of an audio file readable by ``soundfile``.

    Returns:
        A tuple ``(top_label, top_conf, top5_text)``: the most probable
        label, its probability rounded to 4 decimals, and a newline-separated
        ``"label: score"`` listing of the most probable labels (at most five).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    waveform, sr = sf.read(audio_path)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    if len(waveform) == 0:
        # Fail early with a readable message instead of an opaque model error.
        raise ValueError("Audio file contains no samples.")

    if sr != 16000:
        num_samples = int(len(waveform) * 16000 / sr)
        waveform = resample(waveform, num_samples)
        sr = 16000

    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits[0]
    probs = torch.nn.functional.softmax(logits, dim=-1)

    top_idx = torch.argmax(probs).item()
    top_label = LABELS[top_idx]
    top_conf = round(probs[top_idx].item(), 4)

    # Cap k so a classifier with fewer than five labels does not make topk raise.
    ranked = torch.topk(probs, k=min(5, probs.numel()))
    top5_text = "\n".join(
        f"{LABELS[i]}: {round(p, 4)}"
        for i, p in zip(ranked.indices.tolist(), ranked.values.tolist())
    )

    return top_label, top_conf, top5_text
|
97 |
+
|
98 |
+
def transcribe_audio(audio_path):
    """Return the transcription of ``audio_path`` with surrounding whitespace removed.

    Runs the module-level ``whisper`` pipeline with ``return_timestamps=True``
    and reads the ``"text"`` entry of its result (empty string when absent).
    """
    output = whisper(audio_path, return_timestamps=True)
    text = output.get("text", "")
    return text.strip()
|
101 |
+
|
102 |
+
# === Main Handler ===
|
103 |
+
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the given media.

    Exactly one source is used, in priority order: ``audio_file``,
    ``video_file``, then ``video_url``.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct URL of a video file, or an empty/blank string.

    Returns:
        A 6-tuple ``(message, confidence, label, audio_path, top5,
        transcription)``. On failure or missing input, ``message`` describes
        the problem and the remaining five entries are ``None``.
    """
    extracted_audio = "extracted_audio.wav"
    succeeded = False

    try:
        audio_path = None

        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        elif video_url and video_url.strip():
            if "loom.com" in video_url:
                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
            # Strip pasted whitespace so it does not end up inside the request URL.
            downloaded = download_video(video_url.strip())
            extracted = extract_audio_from_video(downloaded, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        else:
            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        succeeded = True
        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription

    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None

    finally:
        # Intermediate files are never needed once the handler is done.
        leftovers = [TEMP_VIDEO, RAW_AUDIO, RAW_AUDIO + ".mp4", extracted_audio]
        # The converted audio is returned to Gradio as an output path and is
        # only read after this function has returned, so it must survive a
        # successful run; it is overwritten by the next conversion (ffmpeg -y).
        if not succeeded:
            leftovers.append(CONVERTED_AUDIO)
        for f in leftovers:
            if os.path.exists(f):
                os.remove(f)
|
139 |
+
|
140 |
+
# === Gradio Interface ===
|
141 |
+
# Input widgets, in the positional order expected by process_input.
_input_components = [
    gr.Audio(label="Upload MP3 or WAV", type="filepath"),
    gr.File(label="Upload MP4 Video", type="filepath"),
    gr.Textbox(label="Paste Direct .mp4 Video URL"),
]

# Output widgets, matching the 6-tuple returned by process_input.
_output_components = [
    gr.Text(label="Prediction"),
    gr.Number(label="Confidence Score"),
    gr.Text(label="Accent"),
    gr.Audio(label="Processed Audio", type="filepath"),
    gr.Text(label="Top 5 Predictions"),
    gr.Text(label="Transcription"),
]

interface = gr.Interface(
    fn=process_input,
    inputs=_input_components,
    outputs=_output_components,
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
)

# Start the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|
local.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
import shutil
|
5 |
+
import requests
|
6 |
+
import subprocess
|
7 |
+
import soundfile as sf
|
8 |
+
from scipy.signal import resample
|
9 |
+
from moviepy.editor import VideoFileClip, AudioFileClip
|
10 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
|
11 |
+
|
12 |
+
# === Constants ===
|
13 |
+
# Fixed file names (relative to the working directory) for intermediate artifacts.
TEMP_VIDEO = "temp_video.mp4"
RAW_AUDIO = "raw_audio_input"
CONVERTED_AUDIO = "converted_audio.wav"
# Model identifier of the accent classifier (unused while loading from MODEL_DIR).
MODEL_REPO = "ylacombe/accent-classifier"

# Directory passed to from_pretrained; loading is restricted to local files
# for the classifier (local_files_only=True).
MODEL_DIR = "model"

# Everything below is loaded once at import time and shared by all requests.
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
model.eval()
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)

# Alternative kept for reference: load the classifier by repository name instead.
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names the i-th logit.
LABELS = [model.config.id2label[idx] for idx in range(len(model.config.id2label))]
|
31 |
+
|
32 |
+
# === Helpers ===
|
33 |
+
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file to ``output_path`` by invoking ffmpeg.

    Args:
        input_path: Path of the file handed to ffmpeg as input.
        output_path: Destination path; an existing file is overwritten (``-y``).

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status or no output
            file exists afterwards.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Without this check a failed conversion would only surface later as a
    # confusing "file not found"/decode error in the caller.
    if result.returncode != 0 or not os.path.exists(output_path):
        log_lines = result.stderr.decode("utf-8", errors="replace").strip().splitlines()
        reason = log_lines[-1] if log_lines else "unknown error"
        raise RuntimeError(f"FFmpeg failed to convert audio: {reason}")

    return output_path
|
37 |
+
|
38 |
+
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video opened with ``VideoFileClip``.
        output_path: Destination path for the extracted audio.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the clip has no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always release the clip's readers, including on the error path,
        # so the video file is not kept open between requests.
        clip.close()
    return output_path
|
44 |
+
|
45 |
+
def download_video(url, filename=TEMP_VIDEO):
    """Download a video from ``url`` and remux it with ffmpeg into ``filename``.

    The response body is streamed to a temporary file, then copied
    (``-c copy``, no re-encoding) into ``filename`` with ``+faststart``.

    Args:
        url: Direct URL of a video file.
        filename: Destination path of the remuxed video.

    Returns:
        ``filename``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response Content-Type does not start with
            ``video/`` or ffmpeg fails to produce a non-empty output file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager guarantees the streamed connection is released.
        with requests.get(url, headers=headers, stream=True, timeout=15) as r:
            r.raise_for_status()

            if not r.headers.get("Content-Type", "").startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")

            with open(temp_download, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Remove the raw download on every path, not only on success.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
|
70 |
+
|
71 |
+
def classify_accent(audio_path):
    """Classify the accent of the speech in an audio file.

    The audio is read with soundfile, averaged across axis 1 when it has
    more than one dimension, resampled to 16000 Hz if needed, and passed
    through the module-level ``feature_extractor`` and ``model``.

    Args:
        audio_path: Path of an audio file readable by ``soundfile``.

    Returns:
        A tuple ``(top_label, top_conf, top5_text)``: the most probable
        label, its probability rounded to 4 decimals, and a newline-separated
        ``"label: score"`` listing of the most probable labels (at most five).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    waveform, sr = sf.read(audio_path)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    if len(waveform) == 0:
        # Fail early with a readable message instead of an opaque model error.
        raise ValueError("Audio file contains no samples.")

    if sr != 16000:
        num_samples = int(len(waveform) * 16000 / sr)
        waveform = resample(waveform, num_samples)
        sr = 16000

    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits[0]
    probs = torch.nn.functional.softmax(logits, dim=-1)

    top_idx = torch.argmax(probs).item()
    top_label = LABELS[top_idx]
    top_conf = round(probs[top_idx].item(), 4)

    # Cap k so a classifier with fewer than five labels does not make topk raise.
    ranked = torch.topk(probs, k=min(5, probs.numel()))
    top5_text = "\n".join(
        f"{LABELS[i]}: {round(p, 4)}"
        for i, p in zip(ranked.indices.tolist(), ranked.values.tolist())
    )

    return top_label, top_conf, top5_text
|
97 |
+
|
98 |
+
def transcribe_audio(audio_path):
    """Return the transcription of ``audio_path`` with surrounding whitespace removed.

    Runs the module-level ``whisper`` pipeline with ``return_timestamps=True``
    and reads the ``"text"`` entry of its result (empty string when absent).
    """
    output = whisper(audio_path, return_timestamps=True)
    text = output.get("text", "")
    return text.strip()
|
101 |
+
|
102 |
+
# === Main Handler ===
|
103 |
+
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the given media.

    Exactly one source is used, in priority order: ``audio_file``,
    ``video_file``, then ``video_url``.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct URL of a video file, or an empty/blank string.

    Returns:
        A 6-tuple ``(message, confidence, label, audio_path, top5,
        transcription)``. On failure or missing input, ``message`` describes
        the problem and the remaining five entries are ``None``.
    """
    extracted_audio = "extracted_audio.wav"
    succeeded = False

    try:
        audio_path = None

        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        elif video_url and video_url.strip():
            if "loom.com" in video_url:
                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
            # Strip pasted whitespace so it does not end up inside the request URL.
            downloaded = download_video(video_url.strip())
            extracted = extract_audio_from_video(downloaded, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        else:
            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        succeeded = True
        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription

    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None

    finally:
        # Intermediate files are never needed once the handler is done.
        leftovers = [TEMP_VIDEO, RAW_AUDIO, RAW_AUDIO + ".mp4", extracted_audio]
        # The converted audio is returned to Gradio as an output path and is
        # only read after this function has returned, so it must survive a
        # successful run; it is overwritten by the next conversion (ffmpeg -y).
        if not succeeded:
            leftovers.append(CONVERTED_AUDIO)
        for f in leftovers:
            if os.path.exists(f):
                os.remove(f)
|
139 |
+
|
140 |
+
# === Gradio Interface ===
|
141 |
+
# Input widgets, in the positional order expected by process_input.
_input_components = [
    gr.Audio(label="Upload MP3 or WAV", type="filepath"),
    gr.File(label="Upload MP4 Video", type="filepath"),
    gr.Textbox(label="Paste Direct .mp4 Video URL"),
]

# Output widgets, matching the 6-tuple returned by process_input.
_output_components = [
    gr.Text(label="Prediction"),
    gr.Number(label="Confidence Score"),
    gr.Text(label="Accent"),
    gr.Audio(label="Processed Audio", type="filepath"),
    gr.Text(label="Top 5 Predictions"),
    gr.Text(label="Transcription"),
]

interface = gr.Interface(
    fn=process_input,
    inputs=_input_components,
    outputs=_output_components,
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
)

# Start the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
torch
|
3 |
+
torchaudio
|
4 |
+
gradio
|
5 |
+
moviepy==1.0.3
|
6 |
+
requests
|
7 |
+
safetensors
|
8 |
+
soundfile
|
9 |
+
scipy
|
test.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
|
3 |
+
# Diagnostic script: fetch a sample URL and print what the server actually
# returns, to check whether a link serves a real video file.
url = "https://store3.gofile.io/download/web/7a1f0c47-93e5-45c1-90b3-e05cb8611501/sample-file.mp4"
# requests applies no timeout by default; set one so an unresponsive host
# cannot hang the script forever.
r = requests.get(url, allow_redirects=True, timeout=30)

print("Content-Type:", r.headers.get("Content-Type"))
print("File size (bytes):", len(r.content))
print("First 200 bytes:\n", r.content[:200])
|