Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -209,7 +209,7 @@ def transcribe_video_with_speakers(video_path):
|
|
| 209 |
"start": segment["start"],
|
| 210 |
"end": segment["end"],
|
| 211 |
"text": segment["text"],
|
| 212 |
-
"speaker": segment
|
| 213 |
}
|
| 214 |
for segment in result["segments"]
|
| 215 |
]
|
|
@@ -541,6 +541,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
|
|
| 541 |
cap.release()
|
| 542 |
|
| 543 |
ocr_results = []
|
|
|
|
| 544 |
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
|
| 545 |
futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
|
| 546 |
|
|
@@ -550,8 +551,9 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
|
|
| 550 |
if result["text"]:
|
| 551 |
ocr_results.append(result)
|
| 552 |
except Exception as e:
|
| 553 |
-
|
| 554 |
|
|
|
|
| 555 |
return ocr_results
|
| 556 |
|
| 557 |
|
|
@@ -574,6 +576,12 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
|
|
| 574 |
current = {"start": time, "end": time, "text": text}
|
| 575 |
if current:
|
| 576 |
collapsed.append(current)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
return collapsed
|
| 578 |
|
| 579 |
def post_edit_transcribed_segments(transcription_json, video_path,
|
|
@@ -673,7 +681,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 673 |
desired_duration = entry["end"] - entry["start"]
|
| 674 |
desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
|
| 675 |
|
| 676 |
-
speaker = entry.get("speaker", "
|
| 677 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 678 |
|
| 679 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in tts_model.synthesizer.tts_model.language_manager.name_to_id.keys():
|
|
|
|
| 209 |
"start": segment["start"],
|
| 210 |
"end": segment["end"],
|
| 211 |
"text": segment["text"],
|
| 212 |
+
"speaker": segment.get("speaker", "SPEAKER_00")
|
| 213 |
}
|
| 214 |
for segment in result["segments"]
|
| 215 |
]
|
|
|
|
| 541 |
cap.release()
|
| 542 |
|
| 543 |
ocr_results = []
|
| 544 |
+
ocr_failures = 0 # Count OCR failures
|
| 545 |
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
|
| 546 |
futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
|
| 547 |
|
|
|
|
| 551 |
if result["text"]:
|
| 552 |
ocr_results.append(result)
|
| 553 |
except Exception as e:
|
| 554 |
+
ocr_failures += 1
|
| 555 |
|
| 556 |
+
logger.info(f"✅ OCR extraction completed: {len(ocr_results)} frames successful, {ocr_failures} frames failed.")
|
| 557 |
return ocr_results
|
| 558 |
|
| 559 |
|
|
|
|
| 576 |
current = {"start": time, "end": time, "text": text}
|
| 577 |
if current:
|
| 578 |
collapsed.append(current)
|
| 579 |
+
|
| 580 |
+
# Log collapsed OCR summary
|
| 581 |
+
logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
|
| 582 |
+
for idx, seg in enumerate(collapsed):
|
| 583 |
+
logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
|
| 584 |
+
|
| 585 |
return collapsed
|
| 586 |
|
| 587 |
def post_edit_transcribed_segments(transcription_json, video_path,
|
|
|
|
| 681 |
desired_duration = entry["end"] - entry["start"]
|
| 682 |
desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
|
| 683 |
|
| 684 |
+
speaker = entry.get("speaker", "SPEAKER_00")
|
| 685 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 686 |
|
| 687 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in tts_model.synthesizer.tts_model.language_manager.name_to_id.keys():
|