Spaces:
Sleeping
Sleeping
File size: 1,961 Bytes
5a8c370 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import os, requests, shutil
from pydub import AudioSegment
import whisper
from speechbrain.pretrained.interfaces import foreign_class
class AccentAnalyzerTool:
    """Download a media file, classify the speaker's English accent, and transcribe it.

    Two models are loaded once at construction time: a Whisper ``"medium"``
    model used for transcription and a SpeechBrain ``foreign_class`` classifier
    (``Jzuluaga/accent-id-commonaccent_xlsr-en-english``) used for accent
    identification. The transcript of the most recent successful analysis is
    kept on ``last_transcript``.
    """

    # Seconds passed to ``requests`` as the timeout so that an unresponsive
    # server cannot block ``analyze`` forever.
    DOWNLOAD_TIMEOUT = 60
    # Bytes per chunk when streaming the download to disk.
    DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    def __init__(self):
        """Load the transcription and accent models and clear the cached transcript."""
        self.whisper_model = whisper.load_model("medium")
        self.accent_model = foreign_class(
            source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
        )
        # Transcript text from the latest successful ``analyze`` call.
        self.last_transcript = None

    def log(self, msg):
        """Print *msg* to stdout prefixed with the tool name."""
        print(f"[AccentAnalyzerTool] {msg}")

    def analyze(self, url: str) -> str:
        """Analyze the media at *url* and return a Markdown summary.

        The file is downloaded into a private temporary directory, converted
        to WAV, classified for accent, and transcribed. On success the
        transcript is also stored on ``self.last_transcript``.

        Args:
            url: HTTP(S) location of the video/audio file to analyze.

        Returns:
            A Markdown string with the accent label, the confidence as a
            percentage, and the transcript. If any step fails, a string of
            the form ``"Error analyzing accent: <reason>"`` is returned
            instead; no exception propagates to the caller.
        """
        # Imported here so the block stays self-contained without touching
        # the file-level import list.
        import tempfile

        tmp_dir = None
        try:
            self.log("Downloading video...")
            # A unique directory per call: a fixed "tmp" path would let
            # concurrent calls overwrite each other's files, and removing it
            # could delete an unrelated directory of the same name.
            tmp_dir = tempfile.mkdtemp(prefix="accent_analyzer_")
            video_path = os.path.join(tmp_dir, "video.mp4")

            # Stream to disk instead of buffering the whole file in memory,
            # and fail early on HTTP errors so an error page is never fed to
            # the audio decoder.
            with requests.get(
                url, stream=True, timeout=self.DOWNLOAD_TIMEOUT
            ) as response:
                response.raise_for_status()
                with open(video_path, "wb") as f:
                    for chunk in response.iter_content(
                        chunk_size=self.DOWNLOAD_CHUNK_SIZE
                    ):
                        f.write(chunk)

            self.log("Extracting audio...")
            audio_path = os.path.join(tmp_dir, "audio.wav")
            # pydub's export() hands back the output file handle; close it so
            # the handle is not leaked and the file can be removed afterwards.
            exported = AudioSegment.from_file(video_path).export(
                audio_path, format="wav"
            )
            exported.close()

            self.log("Classifying accent...")
            # classify_file yields four values; only the score and the label
            # list are used here.
            _, score, _, label = self.accent_model.classify_file(audio_path)
            # "us" is rendered as the acronym "US"; other labels are capitalized.
            accent = label[0].upper() if label[0] == 'us' else label[0].capitalize()
            # NOTE(review): assumes score converts to a float in [0, 1] - confirm
            # against the classifier's interface.
            confidence = round(float(score) * 100, 2)

            self.log("Transcribing...")
            transcript = self.whisper_model.transcribe(audio_path)["text"]
            self.last_transcript = transcript

            summary = (
                f"The speaker has a **{accent} English accent** "
                f"with **{confidence}% confidence**.\n\n"
                f"**Transcript of the audio:**\n\n *{transcript.strip(' ')}*"
            )
            return summary
        except Exception as e:
            # Top-level boundary: callers expect a message string, not a raise.
            return f"Error analyzing accent: {str(e)}"
        finally:
            # Best-effort cleanup on success and failure alike.
            if tmp_dir is not None:
                shutil.rmtree(tmp_dir, ignore_errors=True)
|