import gradio as gr
import torch
import tempfile
import os
import requests
from moviepy import VideoFileClip
from transformers import pipeline
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class
# Load Whisper (tiny) to transcribe speech and confirm it is in English
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device="cpu")
# Accent classifier: CommonAccent (XLSR) model from SpeechBrain
classifier = foreign_class(source="Jzuluaga/accent-id-commonaccent_xlsr-en-english", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")

def classify_accent(audio_tensor, sample_rate):
    # Classify the speaker's accent with the SpeechBrain CommonAccent model,
    # which expects 16 kHz mono audio
    if audio_tensor.shape[0] > 1:
        audio_tensor = audio_tensor.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio_tensor = resampler(audio_tensor)
    out_prob, score, index, text_lab = classifier.classify_batch(audio_tensor)
    accent = text_lab[0]
    confidence = round(float(score[0]) * 100, 1)  # score is assumed to be a posterior probability in [0, 1]
    return {
        "accent": accent,
        "confidence": confidence,
        "summary": f"The classifier identified the accent as {accent} with {confidence}% confidence."
    }

def download_video(url):
    # Stream the remote video into a temporary .mp4 file
    video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(video_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)
    return video_path

def extract_audio(video_path):
    # Extract the audio track into a temporary 16-bit PCM WAV file
    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
    clip.close()
    return audio_path

def transcribe(audio_path):
    # Whisper returns the transcript plus the detected language when return_language=True
    result = whisper_pipe(audio_path, return_language=True)
    print(result)
    return result['text'], result['chunks'][0]['language']

def analyze_accent(url_or_file):
    try:
        # gr.File may hand over a path string or an object exposing .name
        if not isinstance(url_or_file, str):
            url_or_file = url_or_file.name
        print("Input:", url_or_file)

        if url_or_file.startswith("http"):
            video_path = download_video(url_or_file)
        else:
            video_path = url_or_file
        print("Video path:", video_path)

        audio_path = extract_audio(video_path)
        print("Audio path:", audio_path)

        # Load audio with torchaudio
        waveform, sample_rate = torchaudio.load(audio_path)

        # Transcribe to verify the speech is in English
        text, language = transcribe(audio_path)
        if len(text.strip()) < 3:
            return "Could not understand speech. Please try another video."
        if language.lower() not in ("en", "english"):
            return "The video is not in English. Please provide an English video."

        # Accent classification
        result = classify_accent(waveform, sample_rate)

        output = f"**Language**: {language}\n\n"
        output += f"**Accent**: {result['accent']}\n\n"
        output += f"**Confidence**: {result['confidence']}%\n\n"
        output += f"**Explanation**: {result['summary']}\n\n"
        output += f"**Transcript** (first 200 chars): {text[:200]}..."

        # Clean up temp files
        if url_or_file.startswith("http"):
            os.remove(video_path)
        os.remove(audio_path)

        return output
    except Exception as e:
        return f"❌ Error: {str(e)}"

# gr.Interface(
#     fn=analyze_accent,
#     inputs=gr.Textbox(label="Public Video URL (e.g. MP4)", placeholder="https://..."),
#     outputs=gr.Markdown(label="Accent Analysis Result"),
#     title="English Accent Classifier",
#     description="Paste a video URL (MP4) to extract audio, transcribe speech, and classify the English accent (e.g., American, British, etc.).",
#     examples=[
#         ["https://example.com/sample.mp4"],  # example URL
#         [open("cleo-abram.mp4", "rb")]  # local file example
#     ],
#     live=True
# ).launch()

with gr.Blocks() as demo:
    gr.Markdown("# English Accent Classifier")

    with gr.Tab("From URL"):
        url_input = gr.Textbox(label="Video URL (MP4)")
        url_output = gr.Markdown()
        gr.Button("Analyze").click(fn=analyze_accent, inputs=url_input, outputs=url_output)

    with gr.Tab("From File"):
        file_input = gr.File(label="Upload MP4 Video", file_types=[".mp4"])
        file_output = gr.Markdown()
        gr.Button("Analyze").click(fn=analyze_accent, inputs=file_input, outputs=file_output)

        gr.Examples(
            examples=[
                [os.path.join(os.getcwd(), "examples", "cleo-abram.mp4")],
            ],
            inputs=file_input,
            outputs=file_output,
            fn=analyze_accent,
            label="Example MP4 Videos"
        )

demo.launch()