Spaces:

usamaijaz2824
/

accent-classifier

Build error

App Files Files Community

usamaijaz-ai committed on May 10

Commit

5488aaa

1 Parent(s): 99f88da

initial commit

Browse files

Files changed (6) hide show

.gitignore +96 -0
README.md +77 -6
app.py +161 -0
local.py +161 -0
requirements.txt +9 -0
test.py +8 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,96 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+env/
+venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual environments
+.venv/
+venv/
+ENV/
+# PyInstaller
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff
+*.log
+# Flask stuff
+instance/
+.webassets-cache
+# Scrapy stuff
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# IPython
+profile_default/
+ipython_config.py
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre
+.pyre/
+# IDEs
+.vscode/
+.idea/
+# Heavy files
+*.h5
+*.pt
+*.pkl
+*.ckpt
+/model

README.md CHANGED Viewed

@@ -1,12 +1,83 @@
 ---
-title: Accent Classifier
-emoji: 😻
-colorFrom: pink
-colorTo: red
 sdk: gradio
-sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Accent Classifier + Transcriber
+emoji: 🎙️
+colorFrom: indigo
+colorTo: purple
 sdk: gradio
+sdk_version: "4.20.0"
 app_file: app.py
 pinned: false
 ---
+# Accent Classifier + Speech Transcriber
+This Gradio app allows you to:
+- Upload or link to audio/video files
+- Automatically transcribe the speech (via OpenAI Whisper)
+- Detect the speaker's accent (28-class Wav2Vec2 model)
+- View a top-5 ranked list of likely accents with confidence scores
+---
+## How to Use
+Option 1: Upload an audio file
+- Supported formats: .mp3, .wav
+Option 2: Upload a video file
+- Supported format: .mp4 (audio will be extracted automatically)
+Option 3: Paste a direct .mp4 video URL
+- Must be a direct video file URL (not a webpage)
+- Example: a file hosted on archive.org or a CDN
+---
+## Not Supported
+- Loom, YouTube, Dropbox, or other webpage links (these URLs return an HTML page, not the video file itself)
+- Download the video manually and upload it if needed
+---
+## Models Used
+Transcription:
+- openai/whisper-tiny: https://huggingface.co/openai/whisper-tiny
+Accent Classification:
+- ylacombe/accent-classifier: https://huggingface.co/ylacombe/accent-classifier
+---
+## Dependencies
+Handled automatically in Hugging Face Spaces.
+For local testing:
+pip install gradio transformers torch moviepy requests safetensors soundfile scipy
+You must also install ffmpeg:
+- macOS: brew install ffmpeg
+- Ubuntu: sudo apt install ffmpeg
+- Windows: Download from https://ffmpeg.org/
+---
+## How It Works
+1. Audio is extracted (if input is a video)
+2. Audio is converted to .wav and resampled to 16kHz
+3. Speech is transcribed using Whisper
+4. Accent is classified using a Wav2Vec2 model
+5. Output includes:
+   - Top accent prediction
+   - Confidence score
+   - Top-5 accent list
+   - Full transcription
+---

app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import os
+import gradio as gr
+import torch
+import shutil
+import requests
+import subprocess
+import soundfile as sf
+from scipy.signal import resample
+from moviepy.editor import VideoFileClip, AudioFileClip
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
+# === Constants ===
+# Fixed working-file names, relative to the current working directory.
+# They are shared by every request, so concurrent requests overwrite each other's files.
+TEMP_VIDEO = "temp_video.mp4"  # uploaded video copy / remuxed download target
+RAW_AUDIO = "raw_audio_input"  # uploaded audio copy (no extension; ffmpeg probes the format)
+CONVERTED_AUDIO = "converted_audio.wav"  # default output path of convert_to_wav
+MODEL_REPO = "ylacombe/accent-classifier"  # Hugging Face Hub id of the accent classifier
+# === Load local model (disabled here; local.py uses this variant) ===
+# MODEL_DIR = "model"
+# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
+# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)
+# === Load models ===
+# Loaded once at import time so every request reuses the same model instances.
+model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
+whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
+# Label names ordered by class index, so LABELS[i] names logit i.
+LABELS = [model.config.id2label[i] for i in range(len(model.config.id2label))]
+# Switch the classifier to evaluation mode for inference.
+model.eval()
+# === Helpers ===
+def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
+    command = ["ffmpeg", "-y", "-i", input_path, output_path]
+    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return output_path
+def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
+    clip = VideoFileClip(video_path)
+    if clip.audio is None:
+        raise ValueError("No audio stream found in video.")
+    clip.audio.write_audiofile(output_path)
+    return output_path
+def download_video(url, filename=TEMP_VIDEO):
+    temp_download = "raw_download.mp4"
+    headers = {"User-Agent": "Mozilla/5.0"}
+    r = requests.get(url, headers=headers, stream=True, timeout=15)
+    r.raise_for_status()
+    if not r.headers.get("Content-Type", "").startswith("video/"):
+        raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")
+    with open(temp_download, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=8192):
+            f.write(chunk)
+    ffmpeg_cmd = [
+        "ffmpeg", "-y", "-i", temp_download,
+        "-c", "copy", "-movflags", "+faststart", filename
+    ]
+    result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
+        raise RuntimeError("FFmpeg failed to process the video.")
+    os.remove(temp_download)
+    return filename
+def classify_accent(audio_path):
+    waveform, sr = sf.read(audio_path)
+    if len(waveform.shape) > 1:
+        waveform = waveform.mean(axis=1)
+    if sr != 16000:
+        num_samples = int(len(waveform) * 16000 / sr)
+        waveform = resample(waveform, num_samples)
+        sr = 16000
+    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits[0]
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+    top_idx = torch.argmax(probs).item()
+    top_label = LABELS[top_idx]
+    top_conf = round(probs[top_idx].item(), 4)
+    top5 = torch.topk(probs, k=5)
+    top5_labels = [LABELS[i] for i in top5.indices.tolist()]
+    top5_scores = [round(p, 4) for p in top5.values.tolist()]
+    top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
+    return top_label, top_conf, top5_text
+def transcribe_audio(audio_path):
+    result = whisper(audio_path, return_timestamps=True)
+    return result.get("text", "").strip()
+# === Main Handler ===
+def process_input(audio_file, video_file, video_url):
+    try:
+        audio_path = None
+        if audio_file:
+            shutil.copy(audio_file, RAW_AUDIO)
+            audio_path = convert_to_wav(RAW_AUDIO)
+        elif video_file:
+            shutil.copy(video_file, TEMP_VIDEO)
+            extracted = extract_audio_from_video(TEMP_VIDEO, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+        elif video_url and video_url.strip():
+            if "loom.com" in video_url:
+                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
+            downloaded = download_video(video_url)
+            extracted = extract_audio_from_video(downloaded, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+        else:
+            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None
+        label, confidence, top5 = classify_accent(audio_path)
+        transcription = transcribe_audio(audio_path)
+        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription
+    except Exception as e:
+        return f"Error: {str(e)}", None, None, None, None, None
+    finally:
+        for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4"]:
+            if os.path.exists(f):
+                os.remove(f)
+# === Gradio Interface ===
+# The three inputs map positionally onto process_input(audio_file, video_file, video_url);
+# the six outputs map positionally onto the 6-tuple it returns.
+interface = gr.Interface(
+    fn=process_input,
+    inputs=[
+        gr.Audio(label="Upload MP3 or WAV", type="filepath"),
+        gr.File(label="Upload MP4 Video", type="filepath"),
+        gr.Textbox(label="Paste Direct .mp4 Video URL")
+    ],
+    outputs=[
+        gr.Text(label="Prediction"),  # status / error message
+        gr.Number(label="Confidence Score"),
+        gr.Text(label="Accent"),
+        gr.Audio(label="Processed Audio", type="filepath"),
+        gr.Text(label="Top 5 Predictions"),
+        gr.Text(label="Transcription")
+    ],
+    title="Accent Classifier + Transcriber",
+    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech."
+)
+# Start the web UI only when this file is run as a script.
+if __name__ == "__main__":
+    interface.launch()

local.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import os
+import gradio as gr
+import torch
+import shutil
+import requests
+import subprocess
+import soundfile as sf
+from scipy.signal import resample
+from moviepy.editor import VideoFileClip, AudioFileClip
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
+# === Constants ===
+# Fixed working-file names, relative to the current working directory.
+# They are shared by every request, so concurrent requests overwrite each other's files.
+TEMP_VIDEO = "temp_video.mp4"  # uploaded video copy / remuxed download target
+RAW_AUDIO = "raw_audio_input"  # uploaded audio copy (no extension; ffmpeg probes the format)
+CONVERTED_AUDIO = "converted_audio.wav"  # default output path of convert_to_wav
+MODEL_REPO = "ylacombe/accent-classifier"  # only referenced by the disabled Hub-loading lines below
+# === Load local model ===
+# The classifier and its feature extractor are read from the ./model directory.
+MODEL_DIR = "model"
+model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)
+# === Load models from the Hub (disabled here; app.py uses this variant) ===
+# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
+# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
+whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
+# Label names ordered by class index, so LABELS[i] names logit i.
+LABELS = [model.config.id2label[i] for i in range(len(model.config.id2label))]
+# Switch the classifier to evaluation mode for inference.
+model.eval()
+# === Helpers ===
+def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
+    command = ["ffmpeg", "-y", "-i", input_path, output_path]
+    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return output_path
+def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
+    clip = VideoFileClip(video_path)
+    if clip.audio is None:
+        raise ValueError("No audio stream found in video.")
+    clip.audio.write_audiofile(output_path)
+    return output_path
+def download_video(url, filename=TEMP_VIDEO):
+    temp_download = "raw_download.mp4"
+    headers = {"User-Agent": "Mozilla/5.0"}
+    r = requests.get(url, headers=headers, stream=True, timeout=15)
+    r.raise_for_status()
+    if not r.headers.get("Content-Type", "").startswith("video/"):
+        raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")
+    with open(temp_download, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=8192):
+            f.write(chunk)
+    ffmpeg_cmd = [
+        "ffmpeg", "-y", "-i", temp_download,
+        "-c", "copy", "-movflags", "+faststart", filename
+    ]
+    result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
+        raise RuntimeError("FFmpeg failed to process the video.")
+    os.remove(temp_download)
+    return filename
+def classify_accent(audio_path):
+    waveform, sr = sf.read(audio_path)
+    if len(waveform.shape) > 1:
+        waveform = waveform.mean(axis=1)
+    if sr != 16000:
+        num_samples = int(len(waveform) * 16000 / sr)
+        waveform = resample(waveform, num_samples)
+        sr = 16000
+    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits[0]
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+    top_idx = torch.argmax(probs).item()
+    top_label = LABELS[top_idx]
+    top_conf = round(probs[top_idx].item(), 4)
+    top5 = torch.topk(probs, k=5)
+    top5_labels = [LABELS[i] for i in top5.indices.tolist()]
+    top5_scores = [round(p, 4) for p in top5.values.tolist()]
+    top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
+    return top_label, top_conf, top5_text
+def transcribe_audio(audio_path):
+    result = whisper(audio_path, return_timestamps=True)
+    return result.get("text", "").strip()
+# === Main Handler ===
+def process_input(audio_file, video_file, video_url):
+    try:
+        audio_path = None
+        if audio_file:
+            shutil.copy(audio_file, RAW_AUDIO)
+            audio_path = convert_to_wav(RAW_AUDIO)
+        elif video_file:
+            shutil.copy(video_file, TEMP_VIDEO)
+            extracted = extract_audio_from_video(TEMP_VIDEO, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+        elif video_url and video_url.strip():
+            if "loom.com" in video_url:
+                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
+            downloaded = download_video(video_url)
+            extracted = extract_audio_from_video(downloaded, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+        else:
+            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None
+        label, confidence, top5 = classify_accent(audio_path)
+        transcription = transcribe_audio(audio_path)
+        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription
+    except Exception as e:
+        return f"Error: {str(e)}", None, None, None, None, None
+    finally:
+        for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4"]:
+            if os.path.exists(f):
+                os.remove(f)
+# === Gradio Interface ===
+# The three inputs map positionally onto process_input(audio_file, video_file, video_url);
+# the six outputs map positionally onto the 6-tuple it returns.
+interface = gr.Interface(
+    fn=process_input,
+    inputs=[
+        gr.Audio(label="Upload MP3 or WAV", type="filepath"),
+        gr.File(label="Upload MP4 Video", type="filepath"),
+        gr.Textbox(label="Paste Direct .mp4 Video URL")
+    ],
+    outputs=[
+        gr.Text(label="Prediction"),  # status / error message
+        gr.Number(label="Confidence Score"),
+        gr.Text(label="Accent"),
+        gr.Audio(label="Processed Audio", type="filepath"),
+        gr.Text(label="Top 5 Predictions"),
+        gr.Text(label="Transcription")
+    ],
+    title="Accent Classifier + Transcriber",
+    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech."
+)
+# Start the web UI only when this file is run as a script.
+if __name__ == "__main__":
+    interface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+transformers
+torch
+torchaudio
+gradio
+moviepy==1.0.3
+requests
+safetensors
+soundfile
+scipy

test.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import requests
+url = "https://store3.gofile.io/download/web/7a1f0c47-93e5-45c1-90b3-e05cb8611501/sample-file.mp4"
+r = requests.get(url, allow_redirects=True)
+print("Content-Type:", r.headers.get("Content-Type"))
+print("File size (bytes):", len(r.content))
+print("First 200 bytes:\n", r.content[:200])