qqwjq1981 committed
Commit fea34c6 · verified · 1 Parent(s): 9870a55

Update app.py

Files changed (1)
  1. app.py +99 -0
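
Orientation for the diff below: the commit adds two helpers to app.py: process_scribe_output, which flattens a Scribe transcription response into line-level entries with start/end times and a majority speaker, and transcribe_with_scribe, which posts a full audio file to the ElevenLabs Scribe endpoint and returns the raw JSON. A minimal sketch of how they might be chained follows; the sample response dict, the import path, and the audio filename are illustrative assumptions, not part of this commit.

# Illustrative only: hand-made stand-in data, not real Scribe output.
from app import process_scribe_output, transcribe_with_scribe  # assumes app.py is importable as a module

sample_scribe_response = {
    "words": [
        {"text": "Hello",     "start": 0.00, "end": 0.40, "type": "word",        "speaker_id": "speaker_0"},
        {"text": " ",         "start": 0.40, "end": 0.45, "type": "spacing",     "speaker_id": "speaker_0"},
        {"text": "world.",    "start": 0.45, "end": 0.90, "type": "word",        "speaker_id": "speaker_0"},
        {"text": "(panting)", "start": 0.90, "end": 1.20, "type": "audio_event", "speaker_id": None},
    ]
}

# Audio-event entries are dropped and the remaining words are grouped into one line:
lines = process_scribe_output(sample_scribe_response, max_line_length=50)
# -> [{'original': 'Hello world.', 'start': 0.0, 'end': 0.9, 'speaker': 'speaker_0'}]

# Against real audio (needs ELEVENLABS_API_KEY and network access):
# scribe_result = transcribe_with_scribe("full_audio.wav")
# lines = process_scribe_output(scribe_result)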
app.py CHANGED
@@ -313,6 +313,7 @@ def segment_audio_from_video(video_path):
     ]

     return audio_path, transcript_with_speakers
+
 def clean_transcribed_text(text: str) -> str:
     """
     Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
 
@@ -390,6 +391,104 @@ def transcribe_segments_with_scribe(full_audio_path, segments):

     return transcribed_segments, detected_language, error_message

+from collections import Counter
+
+def process_scribe_output(scribe_response, max_line_length=50):
+    """
+    Processes the Scribe API response to clean the text and generate line-level timestamps.
+
+    Args:
+        scribe_response (dict): The raw response dictionary from the Scribe API.
+        max_line_length (int): The maximum number of characters desired per line before
+            a new line is created. This is an approximate guide.
+
+    Returns:
+        list: A list of dictionaries, where each dictionary represents a line
+            and contains 'original', 'start', 'end', and 'speaker'.
+    """
+    cleaned_words = []
+    for word_info in scribe_response['words']:
+        text = word_info['text']
+        start = word_info['start']
+        end = word_info['end']
+        word_type = word_info['type']
+        speaker_id = word_info.get('speaker_id', None)
+
+        if word_type == 'audio_event':
+            continue  # Remove audio event tags like [background sound]
+        elif word_type == 'spacing':
+            if cleaned_words and cleaned_words[-1]['text'].endswith(' '):
+                continue  # Avoid emitting consecutive spaces
+            text = ' '
+
+        cleaned_words.append({
+            'text': text,
+            'start': start,
+            'end': end,
+            'speaker_id': speaker_id
+        })
+
+    lines = []
+    current_line_words = []
+    current_line_start_time = None
+
+    for i, word_info in enumerate(cleaned_words):
+        if not current_line_words:
+            current_line_start_time = word_info['start']
+
+        current_line_words.append(word_info)
+
+        current_line_text = "".join([w['text'] for w in current_line_words]).strip()
+
+        line_should_end = (  # break on length cap, last word, or sentence-ending punctuation
+            len(current_line_text) >= max_line_length or
+            i == len(cleaned_words) - 1 or
+            word_info['text'].endswith(('。', '?', '!'))
+        )
+
+        if line_should_end:
+            line_text = current_line_text
+            line_end_time = word_info['end']
+
+            # Majority speaker_id in this line
+            speaker_ids = [w['speaker_id'] for w in current_line_words if w['speaker_id'] is not None]
+            speaker_id = Counter(speaker_ids).most_common(1)[0][0] if speaker_ids else None
+
+            lines.append({
+                'original': line_text,
+                'start': current_line_start_time,
+                'end': line_end_time,
+                'speaker': speaker_id
+            })
+
+            current_line_words = []
+            current_line_start_time = None
+
+    return lines
+
+def transcribe_with_scribe(full_audio_path):
+    transcribed_segments = []
+    detected_language = "unknown"
+    error_message = None
+
+    if not os.path.exists(full_audio_path):
+        return [], detected_language, f"Full audio file not found at {full_audio_path}"
+
+    headers = {"xi-api-key": ELEVENLABS_API_KEY}
+    data = {
+        "model_id": "scribe_v1",
+        "diarize": "true"
+    }
+
+    logger.info(f"Starting transcription for full audio: {full_audio_path}")
+
+    with open(full_audio_path, "rb") as audio_file:
+        files = {"file": (os.path.basename(full_audio_path), audio_file, "audio/wav")}
+        response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data)
+        response.raise_for_status()
+        scribe_result = response.json()
+        return scribe_result
+
 # Function to get the appropriate translation model based on target language
 def get_translation_model(source_language, target_language):
     """