qqwjq1981 committed
Commit b0eabd6 · verified · 1 Parent(s): 4a5a1f2

Update app.py

Files changed (1)
  1. app.py +36 -79
app.py CHANGED
@@ -288,53 +288,41 @@ def segment_audio_from_video(video_path):
 
     segment_result, speech_audio_path = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")
-
-    # Set up device
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
-
+
     try:
-        # Load a medium model with float32 for broader compatibility
         model = whisperx.load_model("large-v3", device=device, compute_type="float32")
         logger.info("WhisperX model loaded")
-
-        # Transcribe
-        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress = True)
+        result = model.transcribe(speech_audio_path, chunk_size=4, print_progress=True)
         logger.info("Audio transcription completed")
-
     except Exception as e:
         logger.error(f"❌ WhisperX pipeline failed: {e}")
+        return audio_path, []
 
-    # Extract timestamps, text, and speaker IDs
+    # Return segment boundaries (only timestamps, not text)
     transcript_with_speakers = [
         {
             "start": segment["start"],
             "end": segment["end"]
         }
         for segment in result["segments"]
+        if segment["end"] > segment["start"]
     ]
 
     return audio_path, transcript_with_speakers
 
-def transcribe_segments_with_scribe(full_audio_path, segments):
-    """
-    Transcribes pre-defined audio segments using the ElevenLabs Scribe API.
-    Diarization is explicitly turned off as per requirements.
-
-    Args:
-        full_audio_path (str): The path to the full extracted audio file.
-        segments (list): A list of dictionaries, where each dictionary
-                         represents a segment with 'start' and 'end' timestamps in seconds.
-
-    Returns:
-        tuple: A tuple containing:
-            - transcribed_segments (list): A list of dictionaries, where each dictionary
-              represents a transcribed segment with 'start', 'end', and 'text'.
-            - detected_language (str): The language detected by the API (e.g., "en", "es").
-            - error_message (str, optional): An error message if transcription fails.
-    """
+def clean_transcribed_text(text: str) -> str:
+    """Remove repetitive symbols and artifacts from text."""
+    # Remove only-punctuation or repeated tokens
+    cleaned = re.sub(r"[_,.~`^•·。!?!?,,\.\/\\\-–—=+]+", " ", text)
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    return cleaned
+
+def transcribe_segments_with_scribe(full_audio_path, segments):
     transcribed_segments = []
-    detected_language = "unknown" # Default
+    detected_language = "unknown"
     error_message = None
 
     if not os.path.exists(full_audio_path):
@@ -342,94 +342,63 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
 
     try:
         audio_clip = AudioFileClip(full_audio_path)
-
-        headers = {
-            "xi-api-key": ELEVENLABS_API_KEY,
-        }
-        data = {
-            "model_id": "scribe_v1",
-        }
-        # Explicitly set diarize to false, as it's not needed.
-        params = {
-            "diarize": "false",
-        }
+        headers = {"xi-api-key": ELEVENLABS_API_KEY}
+        data = {"model_id": "scribe_v1"}
+        params = {"diarize": "false"}
 
         logger.info(f"Starting transcription of {len(segments)} segments with ElevenLabs Scribe...")
 
         for i, segment in enumerate(segments):
-            segment_start = segment["start"]
-            segment_end = segment["end"]
-
-            # Ensure segment duration is positive
-            if segment_end <= segment_start:
-                logger.warning(f"Skipping segment {i} due to invalid duration: {segment_start:.2f}s -> {segment_end:.2f}s")
+            start, end = segment["start"], segment["end"]
+            if end <= start:
+                logger.warning(f"Skipping invalid segment {i}: {start:.2f}s → {end:.2f}s")
                 continue
 
             temp_segment_audio_path = f"temp_segment_{i}.wav"
             try:
-                # Subclip the audio and save it temporarily
-                sub_clip = audio_clip.subclip(segment_start, segment_end)
-                # Save as 16-bit PCM WAV for Scribe API compatibility
+                sub_clip = audio_clip.subclip(start, end)
                 sub_clip.write_audiofile(temp_segment_audio_path, codec='pcm_s16le')
-
-                logger.info(f"Transcribing segment {i+1}/{len(segments)}: {segment_start:.2f}s - {segment_end:.2f}s")
 
                 with open(temp_segment_audio_path, "rb") as audio_file:
-                    files = {
-                        "file": (os.path.basename(temp_segment_audio_path), audio_file, "audio/wav")
-                    }
+                    files = {"file": (os.path.basename(temp_segment_audio_path), audio_file, "audio/wav")}
                     response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data, params=params)
                     response.raise_for_status()
                    scribe_result = response.json()
 
-                segment_text = ""
-                if "text" in scribe_result:
-                    segment_text = scribe_result["text"].strip()
-                elif "words" in scribe_result and scribe_result["words"]:
-                    # Fallback if 'text' field is not directly available, reconstruct from words
-                    segment_text = " ".join([w.get("text", "") for w in scribe_result["words"] if w.get("type") == "word"]).strip()
-
-                if segment_text:
+                raw_text = scribe_result.get("text") or " ".join(
+                    [w.get("text", "") for w in scribe_result.get("words", []) if w.get("type") == "word"]
+                )
+
+                cleaned_text = clean_transcribed_text(raw_text)
+                if cleaned_text:
                     transcribed_segments.append({
-                        "start": segment_start,
-                        "end": segment_end,
-                        "text": segment_text,
+                        "start": start,
+                        "end": end,
+                        "text": cleaned_text,
                         "speaker": "SPEAKER_00"
                     })
                 else:
-                    logger.warning(f"No transcription text found for segment {i+1}.")
+                    logger.info(f"Segment {i+1} discarded: cleaned text is empty.")
 
-                # Update detected language from the first successful transcription
                 if "language_code" in scribe_result and detected_language == "unknown":
                     detected_language = scribe_result["language_code"]
 
-            except requests.exceptions.HTTPError as http_err:
-                error_message = f"HTTP error for segment {i+1}: {http_err} - {response.text}"
-                logger.error(error_message)
-                # Continue to next segment even if one fails
-            except requests.exceptions.RequestException as req_err:
-                error_message = f"Request error for segment {i+1}: {req_err}"
-                logger.error(error_message)
-                # Continue to next segment
             except Exception as e:
-                error_message = f"Error processing segment {i+1}: {e}"
-                logger.error(error_message)
-                # Continue to next segment
+                logger.error(f"Error processing segment {i+1}: {e}")
             finally:
                 if os.path.exists(temp_segment_audio_path):
                     os.remove(temp_segment_audio_path)
-
+
         logger.info("All segments processed by ElevenLabs Scribe.")
 
     except Exception as e:
-        error_message = f"An error occurred during overall transcription process: {e}"
+        error_message = f"An error occurred: {e}"
         logger.error(error_message)
     finally:
-        if 'audio_clip' in locals() and audio_clip is not None:
+        if 'audio_clip' in locals():
             audio_clip.close()
 
     return transcribed_segments, detected_language, error_message
-
 
 # Function to get the appropriate translation model based on target language
 def get_translation_model(source_language, target_language):
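
For reference, a minimal standalone sketch of the text-extraction and cleaning path introduced above: it copies the new clean_transcribed_text helper and the scribe_result.get("text") / "words" fallback from the diff, and runs them on made-up Scribe-style payloads (the sample dicts below are illustrative, not real ElevenLabs responses).

import re

# Copy of the helper added in this commit (see hunk above).
def clean_transcribed_text(text: str) -> str:
    """Remove repetitive symbols and artifacts from text."""
    cleaned = re.sub(r"[_,.~`^•·。!?!?,,\.\/\\\-–—=+]+", " ", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned

# Hypothetical Scribe-like responses, shaped like the fields the new code reads.
with_text = {"text": "Hello, world!!", "language_code": "en"}
words_only = {"words": [{"type": "word", "text": "Hello"},
                        {"type": "spacing", "text": " "},
                        {"type": "word", "text": "world"}]}
noise_only = {"text": "。。!!??"}

for scribe_result in (with_text, words_only, noise_only):
    # Same fallback expression as in the updated transcribe_segments_with_scribe.
    raw_text = scribe_result.get("text") or " ".join(
        [w.get("text", "") for w in scribe_result.get("words", []) if w.get("type") == "word"]
    )
    cleaned = clean_transcribed_text(raw_text)
    print(repr(raw_text), "->", repr(cleaned) if cleaned else "discarded (empty after cleaning)")

The third payload collapses to an empty string after cleaning, which is exactly the case the new "Segment {i+1} discarded" branch handles.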
 
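If the rest of app.py wires these two functions together the way their signatures suggest, a call site might look roughly like the sketch below. This is an assumption for illustration only: the import, the placeholder video path, and the print-based reporting are not part of the commit.

# Hypothetical wiring; assumes the two functions changed above are importable from app.py.
from app import segment_audio_from_video, transcribe_segments_with_scribe

audio_path, segments = segment_audio_from_video("input_video.mp4")  # placeholder path

if not segments:
    # With this commit an empty list also signals that the WhisperX step failed,
    # since segment_audio_from_video now returns (audio_path, []) instead of raising.
    print("No usable speech segments; skipping Scribe transcription.")
else:
    transcribed, language, error = transcribe_segments_with_scribe(audio_path, segments)
    if error:
        print(f"Scribe reported an error: {error}")
    for seg in transcribed:
        print(f'[{seg["start"]:.2f}s-{seg["end"]:.2f}s] ({language}) {seg["text"]}')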