import os
import sys
import subprocess
import tempfile

import requests
from moviepy.editor import VideoFileClip

# The unrelated `whisper` package on PyPI lacks `load_model`; if that one is
# installed, upgrade to `openai-whisper` and retry the import.
try:
    import whisper
    if not hasattr(whisper, 'load_model'):
        raise ImportError
except ImportError:
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "--upgrade", "openai-whisper"],
        check=True,
    )
    import whisper

import torch
import librosa
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import gradio as gr

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'


def load_models():
    # Whisper (tiny) handles transcription and language detection.
    whisper_model = whisper.load_model('tiny', device=device)
    # Wav2Vec2 checkpoint used for accent classification.
    processor = Wav2Vec2Processor.from_pretrained(
        'jonatasgrosman/wav2vec2-large-english'
    )
    accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(
        'jonatasgrosman/wav2vec2-large-english'
    ).to(device)
    # Dynamic int8 quantization of the linear layers speeds up CPU inference.
    accent_model = torch.quantization.quantize_dynamic(
        accent_model, {torch.nn.Linear}, dtype=torch.qint8
    )
    return whisper_model, processor, accent_model


whisper_model, processor, accent_model = load_models()


def analyze(video_url: str):
    # Download the video to a temporary file in 1 MiB chunks.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_vid:
        response = requests.get(video_url, stream=True)
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                tmp_vid.write(chunk)
        video_path = tmp_vid.name

    # Extract the audio track as a WAV file.
    audio_path = video_path.replace('.mp4', '.wav')
    clip = VideoFileClip(video_path)
    if clip.audio is None:
        clip.close()
        raise gr.Error('The video has no audio track.')
    clip.audio.write_audiofile(audio_path, verbose=False, logger=None)
    clip.close()

    # Load the audio at 16 kHz, the sampling rate both models expect.
    speech, sr = librosa.load(audio_path, sr=16000)

    # Transcribe and detect the spoken language. The transcript is kept only
    # for language tagging; re-enable the Transcript output below to surface it.
    result = whisper_model.transcribe(speech)
    transcript = result.get('text', '')
    lang = result.get('language', 'unknown')
    if lang != 'en':
        transcript = f"[Non-English detected: {lang}]\n" + transcript

    # Classify the accent.
    inputs = processor(speech, sampling_rate=sr, return_tensors='pt', padding=True)
    input_values = inputs.input_values.to(device)
    attention_mask = inputs.attention_mask.to(device)
    with torch.no_grad():
        logits = accent_model(
            input_values=input_values, attention_mask=attention_mask
        ).logits
    probs = torch.softmax(logits, dim=-1).squeeze().cpu().tolist()

    accent_labels = [
        'American', 'Australian', 'British', 'Canadian', 'Indian',
        'Irish', 'New Zealander', 'South African', 'Welsh'
    ]
    # zip() truncates to the shorter sequence, so a mismatch between the label
    # list and the model's output dimension cannot raise an IndexError.
    accent_probs = [(label, p * 100) for label, p in zip(accent_labels, probs)]
    accent_probs.sort(key=lambda x: x[1], reverse=True)
    top_accent, top_conf = accent_probs[0]

    df = pd.DataFrame(accent_probs, columns=['Accent', 'Confidence (%)'])

    # Clean up temporary files.
    try:
        os.remove(video_path)
        os.remove(audio_path)
    except OSError:
        pass

    return top_accent, f"{top_conf:.2f}%", df


interface = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(label='Video URL', placeholder='Enter public MP4 URL'),
    outputs=[
        # gr.Textbox(label='Transcript'),
        gr.Textbox(label='Predicted Accent'),
        gr.Textbox(label='Accent Confidence'),
        gr.Dataframe(label='All Accent Probabilities'),
    ],
    title='English Accent Detector',
    description=(
        'Paste a direct MP4 URL to extract, transcribe, and classify English '
        'accents. Inference is slow because the Whisper and Wav2Vec2 models '
        'run on CPU, so please test with short videos.'
    ),
    examples=[
        ['http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4'],
    ],
    allow_flagging='never',
)

if __name__ == '__main__':
    interface.launch()
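# A minimal usage sketch for calling analyze() directly, bypassing the Gradio
# UI (assumes the sample URL above is reachable from your machine):
#
#     top_accent, confidence, table = analyze(
#         'http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4'
#     )
#     print(top_accent, confidence)
#     print(table)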