Spaces:
Sleeping
Sleeping
Commit
·
c8c0038
1
Parent(s):
09a8733
updated code to add transcription
Browse files
app.py
CHANGED
|
@@ -7,106 +7,78 @@ import subprocess
|
|
| 7 |
import soundfile as sf
|
| 8 |
from scipy.signal import resample
|
| 9 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
| 10 |
-
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
|
| 11 |
|
| 12 |
# === Constants ===
|
| 13 |
TEMP_VIDEO = "temp_video.mp4"
|
| 14 |
RAW_AUDIO = "raw_audio_input"
|
| 15 |
CONVERTED_AUDIO = "converted_audio.wav"
|
| 16 |
-
|
| 17 |
|
| 18 |
# === load local model
|
| 19 |
# MODEL_DIR = "model"
|
| 20 |
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
|
| 21 |
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO
|
| 26 |
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
|
|
|
|
| 27 |
|
|
|
|
| 28 |
model.eval()
|
| 29 |
|
| 30 |
-
# ===
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
# === Download video from URL ===
|
| 35 |
def download_video(url, filename=TEMP_VIDEO):
|
| 36 |
-
import mimetypes
|
| 37 |
-
|
| 38 |
temp_download = "raw_download.mp4"
|
| 39 |
-
headers = {
|
| 40 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
try:
|
| 44 |
-
r = requests.get(url, headers=headers, stream=True, timeout=15)
|
| 45 |
-
r.raise_for_status()
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
raise RuntimeError(f"URL does not point to a video file. Content-Type: {content_type}")
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
f.write(chunk)
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
|
|
|
| 57 |
|
| 58 |
-
# Attempt to fix the file with ffmpeg
|
| 59 |
-
repaired_file = filename
|
| 60 |
ffmpeg_cmd = [
|
| 61 |
"ffmpeg", "-y", "-i", temp_download,
|
| 62 |
-
"-c", "copy", "-movflags", "+faststart",
|
| 63 |
]
|
| 64 |
result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 65 |
|
| 66 |
-
if result.returncode != 0 or not os.path.exists(
|
| 67 |
-
|
| 68 |
-
raise RuntimeError("FFmpeg failed to process the video. File may not be a valid MP4.")
|
| 69 |
|
| 70 |
os.remove(temp_download)
|
| 71 |
-
return
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# === Extract audio from video ===
|
| 77 |
-
def extract_audio_from_video(video_path, output_path=RAW_AUDIO + ".mp4"):
|
| 78 |
-
clip = VideoFileClip(video_path)
|
| 79 |
-
if clip.audio is None:
|
| 80 |
-
raise ValueError("No audio stream found in video.")
|
| 81 |
-
clip.audio.write_audiofile(output_path)
|
| 82 |
-
return output_path
|
| 83 |
-
|
| 84 |
-
# === Convert any input audio to WAV using ffmpeg ===
|
| 85 |
-
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
|
| 86 |
-
command = ["ffmpeg", "-y", "-i", input_path, output_path]
|
| 87 |
-
subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 88 |
-
return output_path
|
| 89 |
-
|
| 90 |
-
# === Run accent classification ===
|
| 91 |
def classify_accent(audio_path):
|
| 92 |
waveform, sr = sf.read(audio_path)
|
| 93 |
-
|
| 94 |
if len(waveform.shape) > 1:
|
| 95 |
-
waveform = waveform.mean(axis=1)
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
num_samples = int(len(waveform) * target_sr / sr)
|
| 100 |
waveform = resample(waveform, num_samples)
|
| 101 |
-
sr =
|
| 102 |
-
|
| 103 |
-
inputs = feature_extractor(
|
| 104 |
-
waveform,
|
| 105 |
-
sampling_rate=sr,
|
| 106 |
-
return_tensors="pt",
|
| 107 |
-
padding=True
|
| 108 |
-
)
|
| 109 |
|
|
|
|
| 110 |
with torch.no_grad():
|
| 111 |
outputs = model(**inputs)
|
| 112 |
logits = outputs.logits[0]
|
|
@@ -121,52 +93,68 @@ def classify_accent(audio_path):
|
|
| 121 |
top5_scores = [round(p, 4) for p in top5.values.tolist()]
|
| 122 |
top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
|
| 123 |
|
| 124 |
-
return
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
# === Main
|
| 128 |
-
def process_input(
|
| 129 |
try:
|
| 130 |
audio_path = None
|
| 131 |
|
| 132 |
-
if
|
| 133 |
-
shutil.copy(
|
| 134 |
audio_path = convert_to_wav(RAW_AUDIO)
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
elif video_url and video_url.strip():
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
| 139 |
audio_path = convert_to_wav(extracted)
|
| 140 |
|
|
|
|
| 141 |
else:
|
| 142 |
-
return "Please provide a video
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
-
return
|
| 145 |
|
| 146 |
except Exception as e:
|
| 147 |
-
return f"Error: {str(e)}", None, None, None, None
|
| 148 |
|
| 149 |
finally:
|
| 150 |
for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4"]:
|
| 151 |
if os.path.exists(f):
|
| 152 |
os.remove(f)
|
| 153 |
|
| 154 |
-
# === Gradio
|
| 155 |
interface = gr.Interface(
|
| 156 |
fn=process_input,
|
| 157 |
inputs=[
|
| 158 |
-
gr.
|
| 159 |
-
gr.
|
|
|
|
| 160 |
],
|
| 161 |
outputs=[
|
| 162 |
gr.Text(label="Prediction"),
|
| 163 |
gr.Number(label="Confidence Score"),
|
| 164 |
gr.Text(label="Accent"),
|
| 165 |
gr.Audio(label="Processed Audio", type="filepath"),
|
| 166 |
-
gr.Text(label="Top 5 Predictions")
|
|
|
|
| 167 |
],
|
| 168 |
-
title="Accent Classifier",
|
| 169 |
-
description="Upload an audio file
|
| 170 |
)
|
| 171 |
|
| 172 |
if __name__ == "__main__":
|
|
|
|
| 7 |
import soundfile as sf
|
| 8 |
from scipy.signal import resample
|
| 9 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
| 10 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
|
| 11 |
|
| 12 |
# === Constants ===
# Fixed working-file names used by the helpers and the request handler below.
TEMP_VIDEO = "temp_video.mp4"            # working copy of an uploaded/downloaded video
RAW_AUDIO = "raw_audio_input"            # working copy of an uploaded audio file
CONVERTED_AUDIO = "converted_audio.wav"  # default output of convert_to_wav
MODEL_REPO = "ylacombe/accent-classifier"

# Alternative: load the classifier from a local "model" directory instead.
# MODEL_DIR = "model"
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)

# === Load models ===
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
model.eval()  # inference only
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)

# Speech-to-text pipeline used by transcribe_audio.
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names logit i.
LABELS = [model.config.id2label[idx] for idx in range(len(model.config.id2label))]
|
| 31 |
|
| 32 |
+
# === Helpers ===
|
| 33 |
+
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file with ffmpeg and return the path of the result.

    The output format is whatever ffmpeg derives from ``output_path``; the
    default target is ``CONVERTED_AUDIO``. An existing output file is
    overwritten (``-y``).

    Args:
        input_path: Path of the file to convert.
        output_path: Destination path for the converted audio.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status or leaves no
            output file behind.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Fail here instead of handing callers a path to a missing or stale file.
    if result.returncode != 0 or not os.path.exists(output_path):
        stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
        reason = stderr_text.splitlines()[-1] if stderr_text else "no error output"
        raise RuntimeError(f"FFmpeg failed to convert audio: {reason}")

    return output_path
|
| 37 |
|
| 38 |
+
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video to read.
        output_path: Destination path for the extracted audio.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the video has no audio stream.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always close the clip so its reader resources are released,
        # including when the video has no audio or writing fails.
        clip.close()
    return output_path
|
| 44 |
|
|
|
|
| 45 |
def download_video(url, filename=TEMP_VIDEO):
    """Download a video over HTTP and remux it with ffmpeg into ``filename``.

    The response body is streamed to a scratch file, then copied stream-for-
    stream (``-c copy``) into ``filename`` with ``+faststart``.

    Args:
        url: Direct URL of the video.
        filename: Destination path for the remuxed video.

    Returns:
        ``filename``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        RuntimeError: If the response is not served as ``video/*`` or ffmpeg
            fails to produce a non-empty output file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, headers=headers, stream=True, timeout=15) as r:
            r.raise_for_status()

            content_type = r.headers.get("Content-Type")
            # Media types are case-insensitive, so normalise before comparing.
            if not (content_type or "").lower().startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {content_type}")

            with open(temp_download, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Remove the scratch download whether or not remuxing succeeded.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
def classify_accent(audio_path):
|
| 72 |
waveform, sr = sf.read(audio_path)
|
|
|
|
| 73 |
if len(waveform.shape) > 1:
|
| 74 |
+
waveform = waveform.mean(axis=1)
|
| 75 |
|
| 76 |
+
if sr != 16000:
|
| 77 |
+
num_samples = int(len(waveform) * 16000 / sr)
|
|
|
|
| 78 |
waveform = resample(waveform, num_samples)
|
| 79 |
+
sr = 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
|
| 82 |
with torch.no_grad():
|
| 83 |
outputs = model(**inputs)
|
| 84 |
logits = outputs.logits[0]
|
|
|
|
| 93 |
top5_scores = [round(p, 4) for p in top5.values.tolist()]
|
| 94 |
top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
|
| 95 |
|
| 96 |
+
return top_label, top_conf, top5_text
|
| 97 |
|
| 98 |
+
def transcribe_audio(audio_path):
    """Transcribe ``audio_path`` with the module-level ``whisper`` pipeline.

    Returns the pipeline's ``"text"`` field with surrounding whitespace
    stripped, or an empty string when that field is absent.
    """
    asr_output = whisper(audio_path, return_timestamps=True)
    transcript = asr_output.get("text", "")
    return transcript.strip()
|
| 101 |
|
| 102 |
+
# === Main Handler ===
|
| 103 |
+
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the speech of one input.

    The first usable input wins, in this order: uploaded audio, uploaded
    video, direct video URL.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct URL of a video, or an empty/None value.

    Returns:
        A 6-tuple ``(message, confidence, label, audio_path, top5, transcription)``.
        On a validation failure or any exception, ``message`` describes the
        problem and the other five items are ``None``.
    """
    extracted_wav = "extracted_audio.wav"
    no_result = (None,) * 5
    # The converted WAV is returned to the caller, so it must outlive this call
    # on success; ``finally`` runs before the caller ever sees the return value.
    keep_converted = False

    try:
        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_wav)
            audio_path = convert_to_wav(extracted)

        elif video_url and video_url.strip():
            url = video_url.strip()
            if "loom.com" in url.lower():
                return ("Loom links are not supported. Please upload the file or use a direct .mp4 URL.", *no_result)
            downloaded = download_video(url)
            extracted = extract_audio_from_video(downloaded, output_path=extracted_wav)
            audio_path = convert_to_wav(extracted)

        else:
            return ("Please provide an audio file, a video file, or a direct video URL.", *no_result)

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        keep_converted = True
        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription

    except Exception as e:
        # Top-level UI boundary: report the failure in the first output slot.
        return (f"Error: {str(e)}", *no_result)

    finally:
        leftovers = [TEMP_VIDEO, RAW_AUDIO, RAW_AUDIO + ".mp4", extracted_wav, "raw_download.mp4"]
        if not keep_converted:
            leftovers.append(CONVERTED_AUDIO)
        for f in leftovers:
            if os.path.exists(f):
                os.remove(f)
|
| 139 |
|
| 140 |
+
# === Gradio Interface ===
|
| 141 |
# Three optional inputs feed process_input; its six return values map
# one-to-one, in order, onto the six output components.
interface = gr.Interface(
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
    fn=process_input,
    inputs=[
        gr.Audio(label="Upload MP3 or WAV", type="filepath"),
        gr.File(label="Upload MP4 Video", type="filepath"),
        gr.Textbox(label="Paste Direct .mp4 Video URL"),
    ],
    outputs=[
        gr.Text(label="Prediction"),
        gr.Number(label="Confidence Score"),
        gr.Text(label="Accent"),
        gr.Audio(label="Processed Audio", type="filepath"),
        gr.Text(label="Top 5 Predictions"),
        gr.Text(label="Transcription"),
    ],
)
|
| 159 |
|
| 160 |
if __name__ == "__main__":
|
local.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import torch
|
| 4 |
+
import shutil
|
| 5 |
+
import requests
|
| 6 |
+
import subprocess
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
from scipy.signal import resample
|
| 9 |
+
from moviepy.editor import VideoFileClip, AudioFileClip
|
| 10 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
|
| 11 |
+
|
| 12 |
+
# === Constants ===
# Fixed working-file names used by the helpers and the request handler below.
TEMP_VIDEO = "temp_video.mp4"            # working copy of an uploaded/downloaded video
RAW_AUDIO = "raw_audio_input"            # working copy of an uploaded audio file
CONVERTED_AUDIO = "converted_audio.wav"  # default output of convert_to_wav
MODEL_REPO = "ylacombe/accent-classifier"

# === Load the accent classifier from the local "model" directory ===
MODEL_DIR = "model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
model.eval()  # inference only
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)

# Alternative: load the classifier from MODEL_REPO instead.
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)

# Speech-to-text pipeline used by transcribe_audio.
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names logit i.
LABELS = [model.config.id2label[idx] for idx in range(len(model.config.id2label))]
|
| 31 |
+
|
| 32 |
+
# === Helpers ===
|
| 33 |
+
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file with ffmpeg and return the path of the result.

    The output format is whatever ffmpeg derives from ``output_path``; the
    default target is ``CONVERTED_AUDIO``. An existing output file is
    overwritten (``-y``).

    Args:
        input_path: Path of the file to convert.
        output_path: Destination path for the converted audio.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status or leaves no
            output file behind.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Fail here instead of handing callers a path to a missing or stale file.
    if result.returncode != 0 or not os.path.exists(output_path):
        stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
        reason = stderr_text.splitlines()[-1] if stderr_text else "no error output"
        raise RuntimeError(f"FFmpeg failed to convert audio: {reason}")

    return output_path
|
| 37 |
+
|
| 38 |
+
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video to read.
        output_path: Destination path for the extracted audio.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the video has no audio stream.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always close the clip so its reader resources are released,
        # including when the video has no audio or writing fails.
        clip.close()
    return output_path
|
| 44 |
+
|
| 45 |
+
def download_video(url, filename=TEMP_VIDEO):
    """Download a video over HTTP and remux it with ffmpeg into ``filename``.

    The response body is streamed to a scratch file, then copied stream-for-
    stream (``-c copy``) into ``filename`` with ``+faststart``.

    Args:
        url: Direct URL of the video.
        filename: Destination path for the remuxed video.

    Returns:
        ``filename``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        RuntimeError: If the response is not served as ``video/*`` or ffmpeg
            fails to produce a non-empty output file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, headers=headers, stream=True, timeout=15) as r:
            r.raise_for_status()

            content_type = r.headers.get("Content-Type")
            # Media types are case-insensitive, so normalise before comparing.
            if not (content_type or "").lower().startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {content_type}")

            with open(temp_download, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Remove the scratch download whether or not remuxing succeeded.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
|
| 70 |
+
|
| 71 |
+
def classify_accent(audio_path):
    """Classify the accent of the speech in an audio file.

    The audio is down-mixed to mono by averaging channels, resampled to
    16 kHz when needed, and passed through the module-level
    ``feature_extractor`` and ``model``.

    Args:
        audio_path: Path of an audio file readable by ``soundfile``.

    Returns:
        A tuple ``(top_label, top_conf, top5_text)``: the most likely label,
        its softmax probability rounded to 4 decimals, and a newline-separated
        ``"label: score"`` listing of the highest-scoring labels (up to five).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    target_sr = 16000

    waveform, sr = sf.read(audio_path)
    if len(waveform.shape) > 1:
        # Multi-channel audio: average the channels into one.
        waveform = waveform.mean(axis=1)

    if len(waveform) == 0:
        raise ValueError("Audio file contains no samples.")

    if sr != target_sr:
        num_samples = int(len(waveform) * target_sr / sr)
        waveform = resample(waveform, num_samples)
        sr = target_sr

    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[0]
        probs = torch.nn.functional.softmax(logits, dim=-1)

    top_idx = torch.argmax(probs).item()
    top_label = LABELS[top_idx]
    top_conf = round(probs[top_idx].item(), 4)

    # topk raises if k exceeds the number of classes, so cap it.
    top5 = torch.topk(probs, k=min(5, probs.numel()))
    top5_labels = [LABELS[i] for i in top5.indices.tolist()]
    top5_scores = [round(p, 4) for p in top5.values.tolist()]
    top5_text = "\n".join(f"{label}: {score}" for label, score in zip(top5_labels, top5_scores))

    return top_label, top_conf, top5_text
|
| 97 |
+
|
| 98 |
+
def transcribe_audio(audio_path):
    """Transcribe ``audio_path`` with the module-level ``whisper`` pipeline.

    Returns the pipeline's ``"text"`` field with surrounding whitespace
    stripped, or an empty string when that field is absent.
    """
    asr_output = whisper(audio_path, return_timestamps=True)
    transcript = asr_output.get("text", "")
    return transcript.strip()
|
| 101 |
+
|
| 102 |
+
# === Main Handler ===
|
| 103 |
+
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the speech of one input.

    The first usable input wins, in this order: uploaded audio, uploaded
    video, direct video URL.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct URL of a video, or an empty/None value.

    Returns:
        A 6-tuple ``(message, confidence, label, audio_path, top5, transcription)``.
        On a validation failure or any exception, ``message`` describes the
        problem and the other five items are ``None``.
    """
    extracted_wav = "extracted_audio.wav"
    no_result = (None,) * 5
    # The converted WAV is returned to the caller, so it must outlive this call
    # on success; ``finally`` runs before the caller ever sees the return value.
    keep_converted = False

    try:
        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_wav)
            audio_path = convert_to_wav(extracted)

        elif video_url and video_url.strip():
            url = video_url.strip()
            if "loom.com" in url.lower():
                return ("Loom links are not supported. Please upload the file or use a direct .mp4 URL.", *no_result)
            downloaded = download_video(url)
            extracted = extract_audio_from_video(downloaded, output_path=extracted_wav)
            audio_path = convert_to_wav(extracted)

        else:
            return ("Please provide an audio file, a video file, or a direct video URL.", *no_result)

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        keep_converted = True
        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription

    except Exception as e:
        # Top-level UI boundary: report the failure in the first output slot.
        return (f"Error: {str(e)}", *no_result)

    finally:
        leftovers = [TEMP_VIDEO, RAW_AUDIO, RAW_AUDIO + ".mp4", extracted_wav, "raw_download.mp4"]
        if not keep_converted:
            leftovers.append(CONVERTED_AUDIO)
        for f in leftovers:
            if os.path.exists(f):
                os.remove(f)
|
| 139 |
+
|
| 140 |
+
# === Gradio Interface ===
|
| 141 |
+
# Three optional inputs feed process_input; its six return values map
# one-to-one, in order, onto the six output components.
interface = gr.Interface(
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
    fn=process_input,
    inputs=[
        gr.Audio(label="Upload MP3 or WAV", type="filepath"),
        gr.File(label="Upload MP4 Video", type="filepath"),
        gr.Textbox(label="Paste Direct .mp4 Video URL"),
    ],
    outputs=[
        gr.Text(label="Prediction"),
        gr.Number(label="Confidence Score"),
        gr.Text(label="Accent"),
        gr.Audio(label="Processed Audio", type="filepath"),
        gr.Text(label="Top 5 Predictions"),
        gr.Text(label="Transcription"),
    ],
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()
|