import gradio as gr
import torch
import tempfile
import os
import requests
from moviepy import VideoFileClip
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

# Load Whisper to confirm the speech is in English
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    device="cpu",
)

# Load the accent classifier
classifier = foreign_class(
    source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# Fallback model/processor in case transformers' whisper-tiny pipeline doesn't return a language
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

ACCENT_LABELS = {
    "us": "American Accent",
    "england": "British Accent",
    "australia": "Australian Accent",
    "indian": "Indian Accent",
    "canada": "Canadian Accent",
    "bermuda": "Bermudian Accent",
    "scotland": "Scottish Accent",
    "african": "African Accent",
    "ireland": "Irish Accent",
    "newzealand": "New Zealand Accent",
    "wales": "Welsh Accent",
    "malaysia": "Malaysian Accent",
    "philippines": "Philippine Accent",
    "singapore": "Singaporean Accent",
    "hongkong": "Hong Kong Accent",
    "southatlandtic": "South Atlantic Accent",  # misspelling matches the model's own label
}


def classify_accent(audio_tensor, sample_rate):
    # classify_batch expects a (batch, time) tensor, so downmix stereo to mono
    if audio_tensor.shape[0] > 1:
        audio_tensor = audio_tensor.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio_tensor = resampler(audio_tensor)
    out_prob, score, index, text_lab = classifier.classify_batch(audio_tensor)
    print(out_prob, score, index, text_lab)
    accent_label = text_lab[0]
    readable_accent = ACCENT_LABELS.get(accent_label, accent_label.title() + " Accent")
    confidence = round(score[0].item() * 100, 2)
    return {
        "accent": readable_accent,
        "confidence": confidence,
        "summary": f"The speaker is predicted to have a {readable_accent} with {confidence}% confidence.",
    }
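
# Quick usage sketch for classify_accent (illustrative; "sample.wav" is a
# hypothetical local file -- any mono or stereo WAV works):
#
#   waveform, sr = torchaudio.load("sample.wav")    # shape: (channels, time)
#   print(classify_accent(waveform, sr)["summary"])
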
def download_video(url):
    video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    response = requests.get(url, stream=True)
    with open(video_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)
    return video_path


def extract_audio(video_path):
    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
    return audio_path


def detect_language(audio_path):
    """Fallback language detection via Whisper's language token.

    Feeding the decoder only the start token (<|startoftranscript|>) and taking
    the argmax of the next prediction yields a language tag such as "<|en|>".
    """
    audio, sr = torchaudio.load(audio_path)
    if sr != 16000:  # Whisper's feature extractor expects 16 kHz audio
        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    inputs = processor(audio.mean(dim=0).numpy(), sampling_rate=16000, return_tensors="pt")
    decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
    with torch.no_grad():
        logits = model(inputs.input_features, decoder_input_ids=decoder_input_ids).logits
    lang_id = logits[0, -1].argmax().item()
    return processor.tokenizer.decode([lang_id]).strip("<|>")  # e.g. "<|en|>" -> "en"


def transcribe(audio_path):
    result = whisper_pipe(audio_path, return_language=True)
    print(result)
    chunks = result.get("chunks") or []
    lang = chunks[0].get("language") if chunks else None
    if lang is None:
        lang = detect_language(audio_path)
    return result["text"], lang


def analyze_accent(url_or_file):
    try:
        print("Video path 1:", url_or_file)
        if url_or_file.startswith("http"):
            video_path = download_video(url_or_file)
        else:
            video_path = url_or_file
        print("Video path:", video_path)

        audio_path = extract_audio(video_path)
        print("Audio path:", audio_path)

        # Load audio with torchaudio
        waveform, sample_rate = torchaudio.load(audio_path)

        # Transcription (to verify the video is in English)
        transcript = transcribe(audio_path)
        if len(transcript[0].strip()) < 3:
            return "Could not understand speech. Please try another video."
        print("Transcript:", transcript)

        if transcript[1].lower() not in ("en", "english"):
            return "The video is not in English. Please provide an English video."

        # Accent classification
        result = classify_accent(waveform, sample_rate)

        output = f"**Language**: {transcript[1]}\n\n"
        output += f"**Accent**: {result['accent']}\n\n"
        output += f"**Confidence**: {result['confidence']}%\n\n"
        output += f"**Explanation**: {result['summary']}\n\n"
        output += f"**Transcript** (first 200 chars): {transcript[0][:200]}..."

        # Clean up temp files (only remove the video if we downloaded it)
        if url_or_file.startswith("http"):
            os.remove(video_path)
        os.remove(audio_path)

        return output
    except Exception as e:
        return f"❌ Error: {str(e)}"
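
# Smoke test outside the UI (illustrative; the paths and URL below are
# hypothetical -- any local MP4 or direct-download MP4 URL should work):
#
#   print(analyze_accent("examples/american.mp4"))
#   print(analyze_accent("https://example.com/clip.mp4"))
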
""") with gr.Tab("From URL"): url_input = gr.Textbox(label="Video URL (MP4)") url_output = gr.Markdown("""### Output will be shown here!""", elem_classes="output-box") gr.Button("Analyze").click(fn=analyze_accent, inputs=url_input, outputs=url_output) gr.Examples( examples=[["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/american.mp4"], ["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/british.mp4"]], inputs=[url_input], outputs=[url_output], label="Example MP4 Video URLs", examples_per_page=5 ) with gr.Tab("From File"): file_input = gr.File(label="Upload MP4 Video", file_types=[".mp4"]) file_output = gr.Markdown("""### Output will be shown here!""", elem_classes="output-box") gr.Button("Analyze").click(fn=analyze_accent, inputs=file_input, outputs=file_output) gr.Examples( examples=[[os.getcwd() + "/examples/american.mp4"], [os.getcwd() + "/examples/british.mp4"]], inputs=[file_input], outputs=[file_output], label="Example MP4 Videos", examples_per_page=5 ) demo.css = """ .output-box { min-height: 70px; overflow-y: auto; padding: 10px; } """ demo.launch()