Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import tempfile | |
| import os | |
| import requests | |
| from moviepy import VideoFileClip | |
| from transformers import pipeline | |
| import torchaudio | |
| from speechbrain.pretrained.interfaces import foreign_class | |
| from transformers import WhisperForConditionalGeneration, WhisperProcessor | |
# Models are loaded once at import time and shared by every request.
# The same Whisper checkpoint backs both the ASR pipeline (transcription and
# language tag) and the raw model/processor pair used by detect_language().
WHISPER_MODEL_ID = "openai/whisper-tiny"

whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=WHISPER_MODEL_ID,
    device="cpu",
)

# Accent classifier, loaded through SpeechBrain's foreign_class helper with the
# custom interface named by pymodule_file / classname.
classifier = foreign_class(
    source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_ID)
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_ID)
# Maps the raw labels returned by the accent classifier to display names.
# classify_accent() falls back to a title-cased label for anything missing.
# NOTE(review): "southatlandtic" is presumably the spelling the classifier
# itself emits -- confirm against the model's label set before "fixing" it.
ACCENT_LABELS = dict(
    us="American Accent",
    england="British Accent",
    australia="Australian Accent",
    indian="Indian Accent",
    canada="Canadian Accent",
    bermuda="Bermudian Accent",
    scotland="Scottish Accent",
    african="African Accent",
    ireland="Irish Accent",
    newzealand="New Zealand Accent",
    wales="Welsh Accent",
    malaysia="Malaysian Accent",
    philippines="Philippine Accent",
    singapore="Singaporean Accent",
    hongkong="Hong Kong Accent",
    southatlandtic="South Atlantic Accent",
)
def classify_accent(audio_tensor, sample_rate):
    """Classify the accent of a waveform with the module-level ``classifier``.

    Args:
        audio_tensor: Waveform tensor as loaded by the caller.
        sample_rate: Sampling rate of ``audio_tensor`` in Hz.

    Returns:
        A dict with the keys ``"accent"`` (display name), ``"confidence"``
        (the first score scaled by 100 and rounded to two decimals) and
        ``"summary"`` (a one-sentence description of the prediction).
    """
    target_rate = 16000
    if sample_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        audio_tensor = to_target_rate(audio_tensor)

    out_prob, score, index, text_lab = classifier.classify_batch(audio_tensor)
    print(out_prob, score, index, text_lab)

    # Only the first entry of the batch output is reported.
    raw_label = text_lab[0]
    fallback_name = raw_label.title() + " Accent"
    readable_accent = ACCENT_LABELS.get(raw_label, fallback_name)
    confidence_pct = round(score[0].item() * 100, 2)

    summary = (
        f"The speaker is predicted to have a {readable_accent} "
        f"with {confidence_pct}% confidence."
    )
    return {
        "accent": readable_accent,
        "confidence": confidence_pct,
        "summary": summary,
    }
def download_video(url):
    """Download a video from ``url`` into a temporary ``.mp4`` file.

    Args:
        url: HTTP(S) address of the video.

    Returns:
        Path of the downloaded file. The caller is responsible for deleting it.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
    """
    # Reserve a unique path and close the handle right away; only the name is
    # needed, and an open handle would otherwise leak.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name

    try:
        # The timeout keeps a stalled server from hanging the request forever.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail on 4xx/5xx instead of saving an error page as "video".
            response.raise_for_status()
            with open(video_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
    except BaseException:
        # Do not leave a partial download behind on failure.
        try:
            os.remove(video_path)
        except OSError:
            pass
        raise
    return video_path
def extract_audio(video_path):
    """Extract the audio track of a video into a temporary 16-bit PCM WAV file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the written ``.wav`` file. The caller is responsible for
        deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Reserve a unique path and close the handle so it is not leaked.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name

    try:
        # The context manager closes the clip and releases its reader
        # resources even when writing fails.
        with VideoFileClip(video_path) as clip:
            if clip.audio is None:
                raise ValueError("The video does not contain an audio track.")
            clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
    except BaseException:
        # Do not leave an empty or partial WAV file behind on failure.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        raise
    return audio_path
def detect_language(audio_path):
    """Return the language code Whisper considers most likely for an audio file.

    The decoder is run for a single step after the start-of-transcript token;
    the language token with the highest logit at that step is the prediction.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        The language code without token markers, e.g. ``"en"``.
    """
    audio, sr = torchaudio.load(audio_path)

    # Down-mix to mono and resample: the processor is fed 16 kHz audio.
    mono = audio.mean(dim=0)
    if sr != 16000:
        mono = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(mono)

    input_features = processor(
        mono.numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features

    # The forward pass needs explicit decoder inputs; feed only the start token.
    decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
    with torch.no_grad():
        logits = model(
            input_features=input_features,
            decoder_input_ids=decoder_input_ids,
        ).logits
    next_token_logits = logits[0, -1]

    # NOTE(review): assumes the checkpoint's generation config provides
    # ``lang_to_id`` (token such as "<|en|>" -> id) -- confirm for the pinned
    # transformers / checkpoint version.
    lang_to_id = model.generation_config.lang_to_id
    best_token = max(
        lang_to_id,
        key=lambda token: next_token_logits[lang_to_id[token]].item(),
    )
    return best_token.strip("<|>")
def transcribe(audio_path):
    """Transcribe an audio file and report its language.

    Args:
        audio_path: Path of the audio file passed to the ASR pipeline.

    Returns:
        A ``(text, language)`` tuple. ``language`` comes from the first chunk
        of the pipeline result; when that is unavailable it is obtained from
        ``detect_language``.
    """
    result = whisper_pipe(audio_path, return_language=True)
    print(result)

    # The result may carry no chunks at all, so never index blindly.
    chunks = result.get("chunks") or []
    lang = chunks[0].get("language") if chunks else None
    if lang is None:
        lang = detect_language(audio_path)
    return result["text"], lang
def analyze_accent(url_or_file):
    """Run the full pipeline and return a Markdown report for the UI.

    Steps: obtain the video (download when given a URL), extract its audio,
    transcribe it, verify the language is English, then classify the accent.

    Args:
        url_or_file: An ``http``/``https`` URL, a local file path, or an
            object exposing the path as ``.name``.

    Returns:
        A Markdown string with language, accent, confidence, explanation and a
        transcript excerpt, or a plain message describing why the analysis
        could not be completed. Exceptions are reported as text, never raised.
    """
    video_path = None
    audio_path = None
    downloaded = False
    try:
        print("Video path 1:", url_or_file)

        # NOTE(review): some Gradio versions presumably pass an upload as a
        # file object with ``.name`` rather than a path string -- confirm.
        source = getattr(url_or_file, "name", url_or_file)
        if not isinstance(source, str) or not source.strip():
            return "Please provide a video URL or upload an MP4 file."
        source = source.strip()

        if source.startswith("http"):
            video_path = download_video(source)
            downloaded = True
        else:
            video_path = source
        print("Video path:", video_path)

        audio_path = extract_audio(video_path)
        print("Audio path:", audio_path)

        # Transcription (to verify English)
        text, language = transcribe(audio_path)
        if len(text.strip()) < 3:
            return "Could not understand speech. Please try another video."
        print("Transcript:", (text, language))

        # Reject non-English input before paying for accent classification.
        if str(language).lower() not in ("en", "english"):
            return "The video is not in English. Please provide an English video."

        # Accent classification
        waveform, sample_rate = torchaudio.load(audio_path)
        result = classify_accent(waveform, sample_rate)

        # Only mark the excerpt as truncated when it actually is.
        excerpt = text[:200] + ("..." if len(text) > 200 else "")
        return (
            f"**Language**: {language}\n\n"
            f"**Accent**: {result['accent']}\n\n"
            f"**Confidence**: {result['confidence']}%\n\n"
            f"**Explanation**: {result['summary']}\n\n"
            f"**Transcript** (first 200 chars): {excerpt}"
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"
    finally:
        # Clean up temp files on every exit path. The video is removed only
        # when this function downloaded it; a caller-supplied file is kept.
        temp_paths = [audio_path]
        if downloaded:
            temp_paths.append(video_path)
        for path in temp_paths:
            if path:
                try:
                    os.remove(path)
                except OSError:
                    pass
| # gr.Interface( | |
| # fn=analyze_accent, | |
| # inputs=gr.Textbox(label="Public Video URL (e.g. MP4)", placeholder="https://..."), | |
| # outputs=gr.Markdown(label="Accent Analysis Result"), | |
| # title="English Accent Classifier", | |
| # description="Paste a video URL (MP4) to extract audio, transcribe speech, and classify the English accent (e.g., American, British, etc.).", | |
| # examples=[ | |
| # ["https://example.com/sample.mp4"], # example URL | |
| # [open("cleo-abram.mp4", "rb")] # local file example | |
| # ], | |
| # live=True | |
| # ).launch() | |
# Styling for the result panes (the components tagged "output-box").
CUSTOM_CSS = """
.output-box {
    min-height: 100px;
    overflow-y: auto;
    padding: 10px;
}
"""

# Local example clips are resolved against the current working directory.
EXAMPLES_DIR = os.path.join(os.getcwd(), "examples")

# The stylesheet is passed to the constructor rather than assigned to
# ``demo.css`` after the layout has been built.
# NOTE(review): confirm ``css=`` is still accepted by the pinned Gradio version.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.Markdown("# English Accent Classifier")

    # Tab 1: analyze a video referenced by URL.
    with gr.Tab("From URL"):
        url_input = gr.Textbox(label="Video URL (MP4)")
        url_output = gr.Markdown(
            """### Output will be shown here!""", elem_classes="output-box"
        )
        gr.Button("Analyze").click(
            fn=analyze_accent, inputs=url_input, outputs=url_output
        )
        gr.Examples(
            examples=[
                ["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/american.mp4"],
                ["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/british.mp4"],
            ],
            inputs=[url_input],
            outputs=[url_output],
            label="Example MP4 Video URLs",
            examples_per_page=5,
        )

    # Tab 2: analyze an uploaded video file.
    with gr.Tab("From File"):
        file_input = gr.File(label="Upload MP4 Video", file_types=[".mp4"])
        file_output = gr.Markdown(
            """### Output will be shown here!""", elem_classes="output-box"
        )
        gr.Button("Analyze").click(
            fn=analyze_accent, inputs=file_input, outputs=file_output
        )
        gr.Examples(
            examples=[
                [os.path.join(EXAMPLES_DIR, "american.mp4")],
                [os.path.join(EXAMPLES_DIR, "british.mp4")],
            ],
            inputs=[file_input],
            outputs=[file_output],
            label="Example MP4 Videos",
            examples_per_page=5,
        )

demo.launch()