Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -313,6 +313,7 @@ def segment_audio_from_video(video_path):
|
|
313 |
]
|
314 |
|
315 |
return audio_path, transcript_with_speakers
|
|
|
316 |
def clean_transcribed_text(text: str) -> str:
|
317 |
"""
|
318 |
Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
|
@@ -390,6 +391,104 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
|
|
390 |
|
391 |
return transcribed_segments, detected_language, error_message
|
392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
# Function to get the appropriate translation model based on target language
|
394 |
def get_translation_model(source_language, target_language):
|
395 |
"""
|
|
|
313 |
]
|
314 |
|
315 |
return audio_path, transcript_with_speakers
|
316 |
+
|
317 |
def clean_transcribed_text(text: str) -> str:
|
318 |
"""
|
319 |
Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
|
|
|
391 |
|
392 |
return transcribed_segments, detected_language, error_message
|
393 |
|
394 |
+
from collections import Counter
|
395 |
+
|
396 |
+
def process_scribe_output(scribe_response, max_line_length=50):
    """
    Clean the Scribe API word stream and group it into subtitle-style lines.

    Audio-event tags (e.g. [背景音]) are dropped, consecutive spacing tokens
    are collapsed to a single space, and words are accumulated into lines that
    end when the line reaches ``max_line_length`` characters, the word ends
    with sentence-final punctuation (。 ? !), or the input is exhausted.

    Args:
        scribe_response (dict): Raw Scribe API response. Expected to contain a
            'words' list whose items have 'text', 'start', 'end', 'type' and
            optionally 'speaker_id' keys; a missing 'words' key yields [].
        max_line_length (int): Approximate maximum number of characters per
            line before a new line is started.

    Returns:
        list[dict]: One dict per line with keys 'original' (line text),
        'start' / 'end' (timestamps taken from the first/last word of the
        line), and 'speaker' (majority 'speaker_id' among the line's words,
        or None when no word carried one).
    """
    # Pass 1: drop audio-event tags and collapse runs of spacing tokens.
    cleaned_words = []
    for word_info in scribe_response.get('words', []):
        text = word_info['text']
        word_type = word_info['type']

        if word_type == 'audio_event':
            continue  # Remove audio event tags like [背景音]
        if word_type == 'spacing':
            # Skip a spacing token if the previous kept token already ends
            # with a space, so runs of spacing collapse to one space.
            if cleaned_words and cleaned_words[-1]['text'].endswith(' '):
                continue
            text = ' '

        cleaned_words.append({
            'text': text,
            'start': word_info['start'],
            'end': word_info['end'],
            'speaker_id': word_info.get('speaker_id', None),
        })

    # Pass 2: accumulate cleaned words into display lines.
    lines = []
    current_line_words = []
    current_line_start_time = None

    for i, word_info in enumerate(cleaned_words):
        if not current_line_words:
            current_line_start_time = word_info['start']

        current_line_words.append(word_info)
        current_line_text = "".join(w['text'] for w in current_line_words).strip()

        line_should_end = (
            len(current_line_text) >= max_line_length or
            i == len(cleaned_words) - 1 or
            word_info['text'].endswith(('。', '?', '!'))
        )

        if line_should_end:
            # Attribute the line to the speaker of the majority of its words;
            # spacing tokens carry speaker_id None and are ignored here.
            speaker_ids = [w['speaker_id'] for w in current_line_words
                           if w['speaker_id'] is not None]
            speaker = Counter(speaker_ids).most_common(1)[0][0] if speaker_ids else None

            lines.append({
                'original': current_line_text,
                'start': current_line_start_time,
                'end': word_info['end'],
                'speaker': speaker,
            })

            current_line_words = []
            current_line_start_time = None

    return lines
|
468 |
+
|
469 |
+
def transcribe_with_scribe(full_audio_path):
    """
    Send a complete audio file to the ElevenLabs Scribe API for transcription
    with speaker diarization enabled.

    Args:
        full_audio_path (str): Path to the audio file; uploaded with an
            "audio/wav" content type.

    Returns:
        dict: The parsed JSON response from the Scribe API on success.
        tuple: ([], "unknown", error_message) when the file does not exist.
            NOTE(review): the success and failure paths return different
            shapes; callers must handle both. Preserved as-is so existing
            callers keep working — confirm before unifying.

    Raises:
        requests.HTTPError: If the Scribe API responds with an error status
            (via raise_for_status).
    """
    detected_language = "unknown"

    # Guard clause: fail fast with a descriptive message instead of letting
    # open() raise FileNotFoundError.
    if not os.path.exists(full_audio_path):
        return [], detected_language, f"Full audio file not found at {full_audio_path}"

    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "model_id": "scribe_v1",
        "diarize": "true",  # request per-word speaker_id in the response
    }

    logger.info("Starting transcription for full audio: %s", full_audio_path)

    with open(full_audio_path, "rb") as audio_file:
        files = {"file": (os.path.basename(full_audio_path), audio_file, "audio/wav")}
        response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data)
        response.raise_for_status()
        return response.json()
|
491 |
+
|
492 |
# Function to get the appropriate translation model based on target language
|
493 |
def get_translation_model(source_language, target_language):
|
494 |
"""
|