Update app.py
app.py
CHANGED
@@ -518,14 +518,34 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
 
     return original_segments
 
+WHISPERX_TO_PADDLEOCR_LANG = {
+    "zh": "ch",        # Chinese
+    "en": "en",        # English
+    "fr": "fr",        # French
+    "de": "german",    # German
+    "ja": "japan",     # Japanese
+    "ko": "korean",    # Korean
+    "ru": "russian",   # Russian
+    "it": "italian",   # Italian
+    "es": "spanish",   # Spanish
+    # Add more mappings as needed
+}
+
 ocr_model = None
 ocr_lock = threading.Lock()
 
-def init_ocr_model():
+def init_ocr_model(source_lang):
+    """
+    Initializes the PaddleOCR model using the mapped language.
+    """
     global ocr_model
     with ocr_lock:
-        if ocr_model is None:
-            ocr_model = PaddleOCR(...)
+        if ocr_model is not None:
+            return  # already initialized
+
+        paddle_lang = WHISPERX_TO_PADDLEOCR_LANG.get(source_lang, "en")
+        logger.info(f"🔤 Initializing OCR model for source language: {source_lang} → PaddleOCR lang: {paddle_lang}")
+        ocr_model = PaddleOCR(use_angle_cls=True, lang=paddle_lang)
 
 def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
     """
@@ -579,10 +599,10 @@ def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_st
     fallback_y = height - int(height * 0.2)
     return frame[fallback_y:, :], (fallback_y, height)
 
-def ocr_frame_worker(args, min_confidence=0.7):
+def ocr_frame_worker(args, source_language, min_confidence=0.7):
     frame_idx, frame_time, frame = args
 
-    init_ocr_model()  # Load model in thread-safe way
+    init_ocr_model(source_language)  # Load model in thread-safe way
 
     if frame is None or frame.size == 0 or not isinstance(frame, np.ndarray):
         return {"time": frame_time, "text": ""}
@@ -607,7 +627,7 @@ def frame_is_in_audio_segments(frame_time, audio_segments, tolerance=0.2):
         return True
     return False
 
-def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=0.2, num_workers=4):
+def extract_ocr_subtitles_parallel(video_path, transcription_json, source_language, interval_sec=0.2, num_workers=4):
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
@@ -626,7 +646,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
     ocr_results = []
     ocr_failures = 0  # Count OCR failures
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-        futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
+        futures = [executor.submit(ocr_frame_worker, frame, source_language) for frame in frames]
 
         for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
             try:
@@ -653,6 +673,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
             sim = fuzz.ratio(current["text"], text)
             if sim >= text_similarity_threshold:
                 current["end"] = time
+                current["text"] = text
                 logger.debug(f"MERGED: Current end extended to {time:.2f}s for text: '{current['text'][:50]}...' (Similarity: {sim})")
             else:
                 logger.debug(f"NOT MERGING (Similarity: {sim} < Threshold: {text_similarity_threshold}):")
@@ -660,8 +681,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
                 logger.debug(f"  New segment: {time:.2f}s: '{text[:50]}...'")
                 collapsed.append(current)
                 current = {"start": time, "end": time, "text": text}
-
-    collapsed.append(current)
+    collapsed.append(current)
 
     logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
     for idx, seg in enumerate(collapsed):
@@ -800,7 +820,8 @@ def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
             curr["end"] = round(curr["start"] + 0.3, 3)
 
     return merged_ocr_json
-def post_edit_transcribed_segments(transcription_json, video_path,
+
+def post_edit_transcribed_segments(transcription_json, video_path, source_language,
                                    interval_sec=0.5,
                                    text_similarity_threshold=80,
                                    time_tolerance=1.0,
@@ -812,8 +833,9 @@ def post_edit_transcribed_segments(transcription_json, video_path,
 
     # Step 1: Extract OCR subtitles (only near audio segments)
     ocr_json = extract_ocr_subtitles_parallel(
-        video_path,
+        video_path,
         transcription_json,
+        source_language,
         interval_sec=interval_sec,
         num_workers=num_workers
    )
@@ -1132,7 +1154,7 @@ def upload_and_manage(file, target_language, process_mode):
    transcription_json, source_language = transcribe_video_with_speakers(file.name)
    logger.info(f"Transcription completed. Detected source language: {source_language}")
 
-    transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)
+    transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)
    # Step 2: Translate the transcription
    logger.info(f"Translating transcription from {source_language} to {target_language}...")
    translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)