usamaijaz-ai committed on
Commit
2233480
·
1 Parent(s): 3a2bf29
Files changed (2) hide show
  1. app.py +1 -5
  2. local.py +161 -0
app.py CHANGED
@@ -126,12 +126,8 @@ def process_input(audio_file, video_file, video_url):
126
 
127
  label, confidence, top5 = classify_accent(audio_path)
128
  transcription = transcribe_audio(audio_path)
129
- with open(audio_path, "rb") as f:
130
- audio_bytes = f.read()
131
 
132
- return f"Top prediction: {label}", confidence, label, (audio_bytes, "converted_audio.wav"), top5, transcription
133
-
134
- # return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription
135
 
136
  except Exception as e:
137
  return f"Error: {str(e)}", None, None, None, None, None
 
126
 
127
  label, confidence, top5 = classify_accent(audio_path)
128
  transcription = transcribe_audio(audio_path)
 
 
129
 
130
+ return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription
 
 
131
 
132
  except Exception as e:
133
  return f"Error: {str(e)}", None, None, None, None, None
local.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import torch
4
+ import shutil
5
+ import requests
6
+ import subprocess
7
+ import soundfile as sf
8
+ from scipy.signal import resample
9
+ from moviepy.editor import VideoFileClip, AudioFileClip
10
+ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
11
+
12
# === Constants ===
# Scratch files; the relative names resolve against the current working directory.
TEMP_VIDEO = "temp_video.mp4"
RAW_AUDIO = "raw_audio_input"
CONVERTED_AUDIO = "converted_audio.wav"
# Hub id of the accent classifier. Not used by the loading code below, which
# reads from MODEL_DIR; pass MODEL_REPO to from_pretrained to load from the Hub.
MODEL_REPO = "ylacombe/accent-classifier"

# === Load models ===
# Both the classifier and its feature extractor are read from the local
# directory. local_files_only=True is set on both so that a missing file
# fails immediately instead of one of the two attempting a network lookup.
MODEL_DIR = "model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR, local_files_only=True)

# Speech-recognition pipeline used by transcribe_audio.
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names the i-th logit.
LABELS = [model.config.id2label[i] for i in range(len(model.config.id2label))]
# Switch the classifier to inference mode.
model.eval()
31
+
32
+ # === Helpers ===
33
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file to ``output_path`` by invoking ``ffmpeg``.

    Args:
        input_path: Path of the file handed to ffmpeg as input.
        output_path: Destination path; an existing file is overwritten
            because ffmpeg is run with ``-y``.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Without this check a failed conversion would hand callers a path that
    # either does not exist or still holds the output of an earlier run.
    if result.returncode != 0:
        stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
        reason = stderr_text.splitlines()[-1] if stderr_text else "no error output"
        raise RuntimeError(f"FFmpeg failed to convert the audio: {reason}")

    return output_path
37
+
38
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video to open with ``VideoFileClip``.
        output_path: Destination for the extracted audio.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the clip has no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Close the clip on success and on failure so the resources it
        # opened for the video file are not left dangling between requests.
        clip.close()
    return output_path
44
+
45
def download_video(url, filename=TEMP_VIDEO):
    """Download a video over HTTP and remux it with ffmpeg into ``filename``.

    The response body is streamed to a scratch file, which ffmpeg then copies
    (``-c copy``, no re-encoding) into ``filename`` with ``+faststart``.

    Args:
        url: Direct URL of the video.
        filename: Path of the remuxed output file.

    Returns:
        ``filename``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response is not served with a ``video/*``
            Content-Type, or if ffmpeg fails or produces an empty file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager releases the streamed connection on every
        # path, including the early raises below.
        with requests.get(url, headers=headers, stream=True, timeout=15) as response:
            response.raise_for_status()

            if not response.headers.get("Content-Type", "").startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {response.headers.get('Content-Type')}")

            with open(temp_download, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Remove the scratch download even when a step above raised, so a
        # failed request does not leave a partial file behind.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
70
+
71
def classify_accent(audio_path):
    """Classify the accent of the speech in an audio file.

    The audio is read with ``soundfile``, averaged down to one channel,
    resampled to 16 kHz if needed, and passed through the module-level
    ``feature_extractor`` and ``model``.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        A tuple ``(top_label, top_conf, top5_text)``: the most probable
        label, its probability rounded to 4 decimals, and a newline-separated
        ``"label: score"`` listing of the highest-scoring classes (up to 5).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    target_sr = 16000

    waveform, sr = sf.read(audio_path)
    if waveform.ndim > 1:
        # Multi-channel audio: average the channels into a single one.
        waveform = waveform.mean(axis=1)

    if len(waveform) == 0:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError("Audio contains no samples.")

    if sr != target_sr:
        num_samples = int(len(waveform) * target_sr / sr)
        waveform = resample(waveform, num_samples)
        sr = target_sr

    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits[0]
        probs = torch.nn.functional.softmax(logits, dim=-1)

    top_idx = torch.argmax(probs).item()
    top_label = LABELS[top_idx]
    top_conf = round(probs[top_idx].item(), 4)

    # topk raises when k exceeds the number of classes, so clamp it for
    # models that expose fewer than five labels.
    k = min(5, probs.shape[-1])
    best = torch.topk(probs, k=k)
    best_labels = [LABELS[i] for i in best.indices.tolist()]
    best_scores = [round(p, 4) for p in best.values.tolist()]
    top5_text = "\n".join(f"{label}: {score}" for label, score in zip(best_labels, best_scores))

    return top_label, top_conf, top5_text
97
+
98
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level ``whisper`` pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` entry of the pipeline output with surrounding
        whitespace removed, or an empty string when that entry is absent.
    """
    output = whisper(audio_path, return_timestamps=True)
    text = output.get("text", "")
    return text.strip()
101
+
102
+ # === Main Handler ===
103
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the given input.

    Exactly one source is used, checked in this order: uploaded audio file,
    uploaded video file, direct video URL.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct video URL, or an empty/blank/None value.

    Returns:
        A 6-tuple matching the interface outputs: prediction text, confidence,
        accent label, path of the processed audio, top-5 text, transcription.
        On a validation problem or any exception the first element carries
        the message and the remaining five are ``None``.
    """
    import tempfile  # only needed by this handler

    extracted_audio = "extracted_audio.wav"
    url = video_url.strip() if video_url else ""

    try:
        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        elif url:
            if "loom.com" in url:
                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
            downloaded = download_video(url)
            extracted = extract_audio_from_video(downloaded, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        else:
            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        # The cleanup in ``finally`` runs before the caller receives the
        # return value and deletes CONVERTED_AUDIO, so returning that path
        # would hand Gradio a file that no longer exists. Return a private
        # copy instead; it is intentionally left on disk for the UI to read.
        fd, playback_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        shutil.copy(audio_path, playback_path)

        return f"Top prediction: {label}", confidence, label, playback_path, top5, transcription

    except Exception as e:
        # Top-level UI boundary: report the failure as text in the first output.
        return f"Error: {str(e)}", None, None, None, None, None

    finally:
        # Remove every scratch file this request may have created, including
        # the intermediate audio extracted from a video.
        for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4", extracted_audio]:
            if os.path.exists(f):
                os.remove(f)
139
+
140
+ # === Gradio Interface ===
141
# Input widgets, in the positional order process_input receives them.
_inputs = [
    gr.Audio(label="Upload MP3 or WAV", type="filepath"),
    gr.File(label="Upload MP4 Video", type="filepath"),
    gr.Textbox(label="Paste Direct .mp4 Video URL"),
]

# Output widgets, in the order of the tuple process_input returns.
_outputs = [
    gr.Text(label="Prediction"),
    gr.Number(label="Confidence Score"),
    gr.Text(label="Accent"),
    gr.Audio(label="Processed Audio", type="filepath"),
    gr.Text(label="Top 5 Predictions"),
    gr.Text(label="Transcription"),
]

interface = gr.Interface(
    fn=process_input,
    inputs=_inputs,
    outputs=_outputs,
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()