import torch
import librosa
from speechbrain.inference.classifiers import EncoderClassifier
from pydub import AudioSegment
import gradio as gr

# Load the pretrained accent-ID model only once, at startup.
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)


def classify_accent(video):
    # 'video' will already be a path to the uploaded file.
    # Extract the audio track and save it as a WAV file.
    audio = AudioSegment.from_file(video, format="mp4")
    audio.export("output.wav", format="wav")

    # Resample to 16 kHz mono, which is what the ECAPA model expects,
    # and add a batch dimension for classify_batch.
    waveform, _ = librosa.load("output.wav", sr=16000, mono=True)
    waveform_tensor = torch.tensor(waveform).unsqueeze(0)

    # classify_batch returns (out_prob, score, index, text_lab).
    prediction = classifier.classify_batch(waveform_tensor)
    _, score, _, text_lab = prediction

    return f"Accent: {text_lab[0]} (Confidence: {score.item():.2f})"


iface = gr.Interface(fn=classify_accent, inputs=gr.Video(), outputs="text")

if __name__ == "__main__":
    iface.launch()
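
# Quick local check, bypassing the UI -- a sketch assuming the script above is
# saved as app.py and a short test clip exists locally (both app.py and
# sample.mp4 are placeholder names), with ffmpeg available for pydub:
#
#   >>> from app import classify_accent
#   >>> classify_accent("sample.mp4")
#   'Accent: <label> (Confidence: 0.xx)'
#
# Running `python app.py` instead starts the Gradio UI (served at
# http://127.0.0.1:7860 by default), where the same handler is called for
# each uploaded video.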