|
|
|
|
|
import torch
|
|
import librosa
|
|
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
|
|
|
|
# Hugging Face Hub identifier of the checkpoint used for accent classification.
MODEL_ID = "ylacombe/accent-classifier"

# Both objects are created once at import time and shared by every call to
# predict_accent(); they are loaded from the same checkpoint so the
# preprocessing matches the model.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
model = AutoModelForAudioClassification.from_pretrained(MODEL_ID)

# Class IDs that are translated to a readable accent name. Any predicted ID
# that is not listed here is reported by predict_accent() as "Unknown".
label_map = {4: "england", 14: "us"}
|
|
|
|
def predict_accent(audio_path: str) -> str:
    """Classify the accent of the speech in an audio file.

    The file is loaded with librosa at 16 kHz, passed through the
    module-level ``feature_extractor`` and ``model``, and the class ID with
    the highest logit is translated through ``label_map``.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The ``label_map`` entry for the predicted class ID, or the string
        ``"Unknown (ID: <id>)"`` when that ID has no entry in ``label_map``.

    Raises:
        ValueError: If the file decodes to zero audio samples.
    """
    audio, sr = librosa.load(audio_path, sr=16000)

    # Fail early with a clear message instead of letting an empty waveform
    # reach the feature extractor / model.
    if audio.size == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    # Reuse the rate reported by librosa so the value is stated only once.
    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

    # Inference only: gradients are not needed for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits

    # .item() requires a single-element tensor, i.e. one clip per call.
    predicted_id = torch.argmax(logits, dim=-1).item()

    return label_map.get(predicted_id, f"Unknown (ID: {predicted_id})")
|
|
|