Update app.py
app.py
CHANGED
@@ -518,14 +518,34 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
 
     return original_segments
 
+WHISPERX_TO_PADDLEOCR_LANG = {
+    "zh": "ch",        # Chinese
+    "en": "en",        # English
+    "fr": "fr",        # French
+    "de": "german",    # German
+    "ja": "japan",     # Japanese
+    "ko": "korean",    # Korean
+    "ru": "russian",   # Russian
+    "it": "italian",   # Italian
+    "es": "spanish",   # Spanish
+    # Add more mappings as needed
+}
+
 ocr_model = None
 ocr_lock = threading.Lock()
 
-def init_ocr_model():
+def init_ocr_model(source_lang):
+    """
+    Initializes the PaddleOCR model using the mapped language.
+    """
     global ocr_model
     with ocr_lock:
-        if ocr_model is None:
-            ocr_model = PaddleOCR(...)
+        if ocr_model is not None:
+            return  # already initialized
+
+        paddle_lang = WHISPERX_TO_PADDLEOCR_LANG.get(source_lang, "en")
+        logger.info(f"🔤 Initializing OCR model for source language: {source_lang} → PaddleOCR lang: {paddle_lang}")
+        ocr_model = PaddleOCR(use_angle_cls=True, lang=paddle_lang)
 
 def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
     """
@@ -579,10 +599,10 @@ def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_st
     fallback_y = height - int(height * 0.2)
     return frame[fallback_y:, :], (fallback_y, height)
 
-def ocr_frame_worker(args, min_confidence=0.7):
+def ocr_frame_worker(args, source_language, min_confidence=0.7):
     frame_idx, frame_time, frame = args
 
-    init_ocr_model()  # Load model in thread-safe way
+    init_ocr_model(source_language)  # Load model in thread-safe way
 
     if frame is None or frame.size == 0 or not isinstance(frame, np.ndarray):
         return {"time": frame_time, "text": ""}
@@ -607,7 +627,7 @@ def frame_is_in_audio_segments(frame_time, audio_segments, tolerance=0.2):
         return True
     return False
 
-def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=0.2, num_workers=4):
+def extract_ocr_subtitles_parallel(video_path, transcription_json, source_language, interval_sec=0.2, num_workers=4):
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
@@ -626,7 +646,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
     ocr_results = []
     ocr_failures = 0  # Count OCR failures
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-        futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
+        futures = [executor.submit(ocr_frame_worker, frame, source_language) for frame in frames]
 
         for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
             try:
@@ -653,6 +673,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
             sim = fuzz.ratio(current["text"], text)
             if sim >= text_similarity_threshold:
                 current["end"] = time
+                current["text"] = text
                 logger.debug(f"MERGED: Current end extended to {time:.2f}s for text: '{current['text'][:50]}...' (Similarity: {sim})")
             else:
                 logger.debug(f"NOT MERGING (Similarity: {sim} < Threshold: {text_similarity_threshold}):")
@@ -660,8 +681,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
                 logger.debug(f"  New segment: {time:.2f}s: '{text[:50]}...'")
                 collapsed.append(current)
                 current = {"start": time, "end": time, "text": text}
-
-    collapsed.append(current)
+    collapsed.append(current)
 
     logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
     for idx, seg in enumerate(collapsed):
@@ -800,7 +820,8 @@ def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
             curr["end"] = round(curr["start"] + 0.3, 3)
 
     return merged_ocr_json
-def post_edit_transcribed_segments(transcription_json, video_path,
+
+def post_edit_transcribed_segments(transcription_json, video_path, source_language,
                                    interval_sec=0.5,
                                    text_similarity_threshold=80,
                                    time_tolerance=1.0,
@@ -812,8 +833,9 @@ def post_edit_transcribed_segments(transcription_json, video_path,
 
     # Step 1: Extract OCR subtitles (only near audio segments)
     ocr_json = extract_ocr_subtitles_parallel(
-        video_path,
+        video_path,
         transcription_json,
+        source_language,
         interval_sec=interval_sec,
         num_workers=num_workers
    )
@@ -1132,7 +1154,7 @@ def upload_and_manage(file, target_language, process_mode):
    transcription_json, source_language = transcribe_video_with_speakers(file.name)
    logger.info(f"Transcription completed. Detected source language: {source_language}")
 
-    transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)
+    transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)
    # Step 2: Translate the transcription
    logger.info(f"Translating transcription from {source_language} to {target_language}...")
    translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)