qqwjq1981 committed
Commit e418f7b (verified)
Parent: a17cccf

Update app.py

Files changed (1)
  1. app.py  +34 -12
app.py CHANGED
@@ -518,14 +518,34 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
 
     return original_segments
 
+WHISPERX_TO_PADDLEOCR_LANG = {
+    "zh": "ch",      # Chinese
+    "en": "en",      # English
+    "fr": "fr",      # French
+    "de": "german",  # German
+    "ja": "japan",   # Japanese
+    "ko": "korean",  # Korean
+    "ru": "russian", # Russian
+    "it": "italian", # Italian
+    "es": "spanish", # Spanish
+    # Add more mappings as needed
+}
+
 ocr_model = None
 ocr_lock = threading.Lock()
 
-def init_ocr_model():
+def init_ocr_model(source_lang):
+    """
+    Initializes the PaddleOCR model using the mapped language.
+    """
     global ocr_model
     with ocr_lock:
-        if ocr_model is None:
-            ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
+        if ocr_model is not None:
+            return  # already initialized
+
+        paddle_lang = WHISPERX_TO_PADDLEOCR_LANG.get(source_lang, "en")
+        logger.info(f"🔤 Initializing OCR model for source language: {source_lang} → PaddleOCR lang: {paddle_lang}")
+        ocr_model = PaddleOCR(use_angle_cls=True, lang=paddle_lang)
 
 def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
     """
@@ -579,10 +599,10 @@ def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_st
     fallback_y = height - int(height * 0.2)
     return frame[fallback_y:, :], (fallback_y, height)
 
-def ocr_frame_worker(args, min_confidence=0.7):
+def ocr_frame_worker(args, source_language, min_confidence=0.7):
     frame_idx, frame_time, frame = args
 
-    init_ocr_model()  # Load model in thread-safe way
+    init_ocr_model(source_language)  # Load model in thread-safe way
 
     if frame is None or frame.size == 0 or not isinstance(frame, np.ndarray):
         return {"time": frame_time, "text": ""}
@@ -607,7 +627,7 @@ def frame_is_in_audio_segments(frame_time, audio_segments, tolerance=0.2):
             return True
     return False
 
-def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=0.5, num_workers=4):
+def extract_ocr_subtitles_parallel(video_path, transcription_json, source_language, interval_sec=0.2, num_workers=4):
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
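
The sampler body is not shown in this diff, but a typical way to turn interval_sec into a frame step, and what halving the default from 0.5 s to 0.2 s implies, is sketched below; the file path and the fps fallback are assumptions.

    import cv2

    cap = cv2.VideoCapture("input.mp4")         # hypothetical path
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0     # guard against 0 fps from broken metadata
    interval_sec = 0.2
    frame_step = max(1, int(round(fps * interval_sec)))  # e.g. 25 fps * 0.2 s -> every 5th frame
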
@@ -626,7 +646,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
     ocr_results = []
     ocr_failures = 0  # Count OCR failures
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-        futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
+        futures = [executor.submit(ocr_frame_worker, frame, source_language) for frame in frames]
 
         for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
             try:
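
The extra positional argument in executor.submit() above is forwarded to the worker, so every OCR thread sees the same detected language. A self-contained sketch of that pattern, with a stub worker and fake frame tuples standing in for the app's data:

    import concurrent.futures

    def ocr_frame_worker(args, source_language, min_confidence=0.7):
        frame_idx, frame_time, frame = args  # same tuple layout as in app.py
        return {"time": frame_time, "text": f"({source_language}) ..."}

    frames = [(i, i * 0.2, None) for i in range(5)]  # (frame_idx, frame_time, frame) stubs

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(ocr_frame_worker, frame, "zh") for frame in frames]
        results = [f.result() for f in concurrent.futures.as_completed(futures)]
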
@@ -653,6 +673,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
             sim = fuzz.ratio(current["text"], text)
             if sim >= text_similarity_threshold:
                 current["end"] = time
+                current["text"] = text
                 logger.debug(f"MERGED: Current end extended to {time:.2f}s for text: '{current['text'][:50]}...' (Similarity: {sim})")
             else:
                 logger.debug(f"NOT MERGING (Similarity: {sim} < Threshold: {text_similarity_threshold}):")
@@ -660,8 +681,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
                 logger.debug(f" New segment: {time:.2f}s: '{text[:50]}...'")
                 collapsed.append(current)
                 current = {"start": time, "end": time, "text": text}
-    if current:
-        collapsed.append(current)
+
 
     logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
     for idx, seg in enumerate(collapsed):
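
A self-contained sketch of the merge rule in collapse_ocr_subtitles, including the new behaviour of keeping the latest OCR reading for a merged segment. It assumes fuzz.ratio comes from rapidfuzz and uses made-up readings; it is not a copy of the app's function.

    from rapidfuzz import fuzz

    readings = [(10.0, "Hello world"), (10.2, "Hello worId"), (10.4, "Next line")]
    threshold = 90

    collapsed, current = [], None
    for time, text in readings:
        if current and fuzz.ratio(current["text"], text) >= threshold:
            current["end"] = time
            current["text"] = text  # keep the most recent reading for the merged segment
        else:
            if current:
                collapsed.append(current)
            current = {"start": time, "end": time, "text": text}
    if current:
        collapsed.append(current)
    # collapsed -> two segments: "Hello worId" (10.0-10.2) and "Next line" (10.4-10.4)
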
@@ -800,7 +820,8 @@ def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
             curr["end"] = round(curr["start"] + 0.3, 3)
 
     return merged_ocr_json
-def post_edit_transcribed_segments(transcription_json, video_path,
+
+def post_edit_transcribed_segments(transcription_json, video_path, source_language,
                                    interval_sec=0.5,
                                    text_similarity_threshold=80,
                                    time_tolerance=1.0,
@@ -812,8 +833,9 @@ def post_edit_transcribed_segments(transcription_json, video_path,
 
     # Step 1: Extract OCR subtitles (only near audio segments)
     ocr_json = extract_ocr_subtitles_parallel(
-        video_path,
+        video_path,
         transcription_json,
+        source_language,
         interval_sec=interval_sec,
         num_workers=num_workers
     )
@@ -1132,7 +1154,7 @@ def upload_and_manage(file, target_language, process_mode):
         transcription_json, source_language = transcribe_video_with_speakers(file.name)
         logger.info(f"Transcription completed. Detected source language: {source_language}")
 
-        transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)
+        transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)
         # Step 2: Translate the transcription
         logger.info(f"Translating transcription from {source_language} to {target_language}...")
         translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)
 