# Hugging Face Spaces page header captured with the file (Space status: Running).
import gradio as gr | |
import torch | |
import tempfile | |
import os | |
import requests | |
from moviepy import VideoFileClip | |
from transformers import pipeline | |
import torchaudio | |
from speechbrain.pretrained.interfaces import foreign_class | |
from transformers import WhisperForConditionalGeneration, WhisperProcessor | |
# Checkpoint shared by the ASR pipeline and the raw model/processor pair below.
_WHISPER_CHECKPOINT = "openai/whisper-tiny"

# Whisper ASR pipeline: transcribes the audio and reports the spoken language
# so that non-English input can be rejected.
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=_WHISPER_CHECKPOINT,
    device="cpu",
)

# Accent classifier, loaded through the custom interface file named below.
classifier = foreign_class(
    source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# Raw Whisper model and processor: fallback for when the pipeline result does
# not carry a language.
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
# Maps the raw label emitted by the accent classifier to the text shown to
# the user. Keys are matched verbatim against the classifier output, so their
# spelling (including "southatlandtic") must not be "corrected" here --
# presumably it mirrors the label set the model was trained on; verify
# against the model card before changing any key.
ACCENT_LABELS = {
    "us": "American Accent",
    "england": "British Accent",
    "australia": "Australian Accent",
    "indian": "Indian Accent",
    "canada": "Canadian Accent",
    "bermuda": "Bermudian Accent",
    "scotland": "Scottish Accent",
    "african": "African Accent",
    "ireland": "Irish Accent",
    "newzealand": "New Zealand Accent",
    "wales": "Welsh Accent",
    "malaysia": "Malaysian Accent",
    "philippines": "Philippine Accent",
    "singapore": "Singaporean Accent",
    "hongkong": "Hong Kong Accent",
    "southatlandtic": "South Atlantic Accent",
}
def classify_accent(audio_tensor, sample_rate):
    """Predict the speaker's accent from a waveform.

    Args:
        audio_tensor: Waveform tensor as produced by ``torchaudio.load``
            (assumed to be laid out as ``(channels, samples)``).
        sample_rate: Sampling rate of ``audio_tensor`` in Hz.

    Returns:
        A dict with three keys: ``"accent"`` (human-readable accent name),
        ``"confidence"`` (percentage rounded to two decimals) and
        ``"summary"`` (a one-sentence description of the prediction).
    """
    # classify_batch interprets the first dimension as the batch axis, so a
    # stereo (2, samples) tensor would be scored as two separate utterances
    # and only the first channel reported. Downmix to a single mono row.
    if audio_tensor.dim() > 1 and audio_tensor.size(0) > 1:
        audio_tensor = audio_tensor.mean(dim=0, keepdim=True)

    # The classifier is fed 16 kHz audio; resample anything else.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio_tensor = resampler(audio_tensor)

    out_prob, score, index, text_lab = classifier.classify_batch(audio_tensor)
    print(out_prob, score, index, text_lab)

    accent_label = text_lab[0]
    # Unknown labels fall back to a title-cased version of the raw label.
    readable_accent = ACCENT_LABELS.get(accent_label, accent_label.title() + " Accent")
    confidence = round(score[0].item() * 100, 2)
    return {
        "accent": readable_accent,
        "confidence": confidence,
        "summary": f"The speaker is predicted to have a {readable_accent} with {confidence}% confidence.",
    }
def download_video(url, timeout=30):
    """Download the video at ``url`` into a temporary ``.mp4`` file.

    Args:
        url: HTTP(S) address of the video.
        timeout: Seconds to wait for the server to connect/send data before
            giving up, so a stalled server cannot hang the request forever.

    Returns:
        Path of the downloaded file. The caller is responsible for deleting it.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (so an error page is never saved as video).
    """
    video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    video_path = video_file.name
    try:
        # Both the file handle and the HTTP connection are released on exit.
        with video_file, requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    video_file.write(chunk)
    except Exception:
        # Do not leave a partial download behind.
        if os.path.exists(video_path):
            os.remove(video_path)
        raise
    return video_path
def extract_audio(video_path):
    """Write the audio track of a video to a temporary 16-bit PCM ``.wav`` file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the extracted audio file. The caller is responsible for
        deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Only the unique name is needed; close the handle right away so the
    # writer below can open the path itself.
    audio_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    audio_path = audio_file.name
    audio_file.close()
    try:
        # The context manager closes the clip and its reader resources.
        with VideoFileClip(video_path) as clip:
            if clip.audio is None:
                raise ValueError("The video does not contain an audio track.")
            clip.audio.write_audiofile(audio_path, codec="pcm_s16le")
    except Exception:
        # Do not leave an empty or partial wav file behind.
        if os.path.exists(audio_path):
            os.remove(audio_path)
        raise
    return audio_path
def detect_language(audio_path):
    """Guess the spoken language of an audio file with the raw Whisper model.

    Fallback for when the ASR pipeline does not report a language. Whisper
    emits a language token directly after the start-of-transcript token, so a
    single decoder step restricted to the language tokens gives the guess.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        The language code of the most likely language token, e.g. ``"en"``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # Collapse channels to mono (assumes (channels, samples) layout).
    mono = waveform.mean(dim=0)

    # The feature extractor rejects audio that is not at its own rate.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        mono = resampler(mono)

    inputs = processor(mono.numpy(), sampling_rate=target_rate, return_tensors="pt")

    # Whisper is an encoder-decoder model: the decoder needs a start token,
    # and the logits for the next position score the language tokens.
    start_ids = torch.tensor([[model.config.decoder_start_token_id]])
    with torch.no_grad():
        next_token_logits = model(
            input_features=inputs.input_features,
            decoder_input_ids=start_ids,
        ).logits[0, -1]

    # NOTE(review): relies on the checkpoint's generation config providing
    # ``lang_to_id`` (tokens such as "<|en|>" mapped to vocabulary ids).
    lang_to_id = model.generation_config.lang_to_id
    lang_tokens = list(lang_to_id)
    lang_ids = torch.tensor([lang_to_id[token] for token in lang_tokens])
    best_token = lang_tokens[int(torch.argmax(next_token_logits[lang_ids]))]
    return best_token.strip("<|>")
def transcribe(audio_path):
    """Transcribe an audio file and determine its language.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(text, language)`` tuple. ``language`` is taken from the first
        chunk of the pipeline result; when the pipeline does not provide one,
        ``detect_language`` is used instead.
    """
    result = whisper_pipe(audio_path, return_language=True)
    print(result)

    # Tolerate a result without chunks (or with an empty chunk list) instead
    # of failing with KeyError/IndexError before the fallback can run.
    chunks = result.get("chunks") or [{}]
    lang = chunks[0].get("language")
    if lang is None:
        lang = detect_language(audio_path)
    return result["text"], lang
def _remove_quietly(path):
    """Delete ``path`` if it exists; a failed cleanup must not mask the result."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        pass


def analyze_accent(url_or_file):
    """Run the full pipeline on a video and format the result as Markdown.

    Steps: obtain the video (download it when given a URL), extract its
    audio, transcribe it, check the language is English, then classify the
    accent.

    Args:
        url_or_file: Video URL, local file path, or an uploaded-file object
            exposing its path as ``.name``.

    Returns:
        A Markdown string with language, accent, confidence, explanation and
        the start of the transcript, or a plain message describing why the
        video could not be analysed. Exceptions are reported in the returned
        string rather than raised.
    """
    downloaded_path = None
    audio_path = None
    try:
        print("Video path 1:", url_or_file)
        # An upload component may hand over a file wrapper instead of a path.
        source = getattr(url_or_file, "name", url_or_file)
        if not source:
            return "❌ Error: Please provide a video URL or upload a video file."

        if source.startswith("http"):
            downloaded_path = download_video(source)
            video_path = downloaded_path
        else:
            video_path = source
        print("Video path:", video_path)

        audio_path = extract_audio(video_path)
        print("Audio path:", audio_path)

        # Transcription (to verify English)
        transcript = transcribe(audio_path)
        text, language = transcript
        if len(text.strip()) < 3:
            return "Could not understand speech. Please try another video."
        print("Transcript:", transcript)

        # Reject non-English audio before spending time on classification.
        if (language or "").lower() not in ("en", "english"):
            return "The video is not in English. Please provide an English video."

        # Accent classification
        waveform, sample_rate = torchaudio.load(audio_path)
        result = classify_accent(waveform, sample_rate)

        output = f"**Language**: {language}\n\n"
        output += f"**Accent**: {result['accent']}\n\n"
        output += f"**Confidence**: {result['confidence']}%\n\n"
        output += f"**Explanation**: {result['summary']}\n\n"
        output += f"**Transcript** (first 200 chars): {text[:200]}..."
        return output
    except Exception as e:
        return f"❌ Error: {str(e)}"
    finally:
        # Clean up temp files on every exit path. Only a downloaded video is
        # removed; a file supplied by the caller is left untouched.
        _remove_quietly(downloaded_path)
        _remove_quietly(audio_path)
# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# English Accent Classifier!
### How it works?
- Takes video URL or video file
- Converts it into audio
- Uses `Whisper-tiny` to detect which language is being spoken
- If the detected language is English, it uses SpeechBrain's Accent ID classifier to show the speaker's accent along with a confidence score.
**Q: What if my transformers version doesn't expose `return_language` for `whisper-tiny`?**
A: Then it will approximate the language by counting which language's tokens it is using the most.
"""

# Placeholder shown in each result area before the first analysis.
_OUTPUT_PLACEHOLDER = "### Output will be shown here!"

# Sample clips offered in the two tabs: remote URLs and local files.
_EXAMPLE_URLS = [
    ["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/american.mp4"],
    ["https://huggingface.co/spaces/fahadqazi/accent-classifier/resolve/main/examples/british.mp4"],
]
_EXAMPLE_FILES = [
    [os.getcwd() + "/examples/american.mp4"],
    [os.getcwd() + "/examples/british.mp4"],
]

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Tab 1: analyse a video given by URL.
    with gr.Tab("From URL"):
        url_input = gr.Textbox(label="Video URL (MP4)")
        url_output = gr.Markdown(_OUTPUT_PLACEHOLDER, elem_classes="output-box")
        url_button = gr.Button("Analyze")
        url_button.click(fn=analyze_accent, inputs=url_input, outputs=url_output)
        gr.Examples(
            examples=_EXAMPLE_URLS,
            inputs=[url_input],
            outputs=[url_output],
            label="Example MP4 Video URLs",
            examples_per_page=5,
        )

    # Tab 2: analyse an uploaded video file.
    with gr.Tab("From File"):
        file_input = gr.File(label="Upload MP4 Video", file_types=[".mp4"])
        file_output = gr.Markdown(_OUTPUT_PLACEHOLDER, elem_classes="output-box")
        file_button = gr.Button("Analyze")
        file_button.click(fn=analyze_accent, inputs=file_input, outputs=file_output)
        gr.Examples(
            examples=_EXAMPLE_FILES,
            inputs=[file_input],
            outputs=[file_output],
            label="Example MP4 Videos",
            examples_per_page=5,
        )

# Styling for the result areas tagged with the "output-box" class.
demo.css = """
.output-box {
min-height: 70px;
overflow-y: auto;
padding: 10px;
}
"""

demo.launch()