Update app.py
app.py
CHANGED
@@ -279,87 +279,42 @@ def transcribe_video_with_speakers(video_path):
 
     return transcript_with_speakers, detected_language
 
-def segment_audio_from_video(video_path, frame_duration_ms=30):
-    """
-    Extracts audio from a video file and detects speech segments using WebRTC VAD.
-
-
-    Args:
-        video_path (str): The path to the input video file.
-        frame_duration_ms (int): The duration of a frame in milliseconds for VAD (10, 20, or 30).
-                                 Lower values are more precise but computationally intensive.
-
-    Returns:
-        tuple: A tuple containing:
-            - audio_path (str): Path to the extracted temporary audio file.
-            - speech_segments (list): A list of dictionaries, where each dictionary
-              represents a speech segment with 'start' and 'end' timestamps in seconds.
-            - error_message (str, optional): An error message if processing fails.
-    """
-    audio_path = "temp_extracted_audio.wav"
-    speech_segments = []
-    error_message = None
+def segment_audio_from_video(video_path):
+    # Extract audio from video
+    video = VideoFileClip(video_path)
+    audio_path = "audio.wav"
+    video.audio.write_audiofile(audio_path)
+    logger.info(f"Audio extracted from video: {audio_path}")
 
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
+    print(f"Saved non-speech (background) audio to local")
+
+    # Set up device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logger.info(f"Using device: {device}")
+
     try:
-        # 1. Extract audio from video
-
-
-
-        #
-
-
-        logger.info(f"Audio extracted to: {audio_path}")
-
-        # 2. Load audio for VAD
-        audio = AudioSegment.from_wav(audio_path)
-        sample_rate = audio.frame_rate
-        audio_data = np.array(audio.get_array_of_samples())
-
-        # WebRTC VAD operates on 16-bit mono audio at 8kHz, 16kHz, or 32kHz.
-        # We already saved at 16kHz, so we can proceed.
-        if sample_rate not in [8000, 16000, 32000]:
-            error_message = f"Unsupported sample rate for VAD: {sample_rate} Hz. Must be 8kHz, 16kHz, or 32kHz."
-            logger.error(error_message)
-            return audio_path, [], error_message
-
-        vad = webrtcvad.Vad(3) # Aggressiveness mode (0-3, 3 is most aggressive)
-        frames = []
-        offset = 0
-        while offset + frame_duration_ms <= len(audio):
-            frame_start = offset
-            frame_end = offset + frame_duration_ms
-            frame = audio[frame_start:frame_end]
-            frames.append(frame)
-            offset += frame_duration_ms
-
-        logger.info(f"Running WebRTC VAD on {len(frames)} frames...")
-
-        current_segment_start = None
-        for i, frame in enumerate(frames):
-            is_speech = vad.is_speech(frame.raw_data, sample_rate)
-
-            frame_start_time = (i * frame_duration_ms) / 1000.0
-            frame_end_time = ((i + 1) * frame_duration_ms) / 1000.0
+        # Load a medium model with float32 for broader compatibility
+        model = whisperx.load_model("large-v3", device=device, compute_type="float32")
+        logger.info("WhisperX model loaded")
+
+        # Transcribe
+        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress = True)
+        logger.info("Audio transcription completed")
 
-            if is_speech:
-                if current_segment_start is None:
-                    current_segment_start = frame_start_time
-            else:
-                if current_segment_start is not None:
-                    speech_segments.append({"start": current_segment_start, "end": frame_end_time})
-                    current_segment_start = None
-
-        # Add the last segment if it ended with speech
-        if current_segment_start is not None:
-            speech_segments.append({"start": current_segment_start, "end": len(audio) / 1000.0})
+    except Exception as e:
+        logger.error(f"❌ WhisperX pipeline failed: {e}")
 
-
+    # Extract timestamps, text, and speaker IDs
+    transcript_with_speakers = [
+        {
+            "start": segment["start"],
+            "end": segment["end"]
+        }
+        for segment in result["segments"]
+    ]
 
-    except Exception as e:
-        error_message = f"An error occurred during audio segmentation: {e}"
-        logger.error(error_message)
-
-    return audio_path, speech_segments, error_message
+    return audio_path, transcript_with_speakers
 
 def transcribe_segments_with_scribe(full_audio_path, segments):
     """
@@ -1373,9 +1328,7 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 1: Segment audio from the uploaded video/audio file
     logger.info("Segmenting audio...")
-    temp_audio_for_vad, speech_segments, seg_error = segment_audio_from_video(file.name)
-    if seg_error:
-        raise Exception(f"Audio segmentation failed: {seg_error}")
+    temp_audio_for_vad, speech_segments = segment_audio_from_video(file.name)
     if not speech_segments:
         raise Exception("No speech segments detected in the audio.")
     logger.info(f"Audio segmentation completed. Found {len(speech_segments)} segments.")