studio_V1_4_OCR_SOTA

Sleeping

App Files Files Community

qqwjq1981 committed on Apr 30

Commit

c6f940f

verified ·

1 Parent(s): 548c12a

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -1

app.py CHANGED Viewed

@@ -494,6 +494,58 @@ def init_ocr_model():
         if ocr_model is None:
             ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
 def ocr_frame_worker(args):
     frame_idx, frame_time, frame = args
@@ -509,7 +561,8 @@ def ocr_frame_worker(args):
         frame = frame.astype(np.uint8)
     try:
-        result = ocr_model.ocr(frame, cls=True)
         texts = [line[1][0] for line in result[0]] if result[0] else []
         combined_text = " ".join(texts).strip()
         return {"time": frame_time, "text": combined_text}

         if ocr_model is None:
             ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5,
                              *, fallback_height_ratio=0.2):
    """
    Pick the horizontal strip near the bottom of a frame where OCR is most confident.

    The bottom ``region_height_ratio`` of the frame is split into ``num_strips``
    equal-height horizontal strips (the last strip absorbs any leftover rows so the
    bottom edge of the frame is always scanned). OCR runs once per strip and the
    strip with the highest mean line confidence wins.

    Parameters:
    - frame: video frame as an np.ndarray shaped (H, W) or (H, W, C); only the row
      axis is used here, so the channel order does not matter to this function
    - ocr_model: object exposing ``ocr(image, cls=True)``. The return value is read
      as ``result[0]`` -> list of lines, with ``line[1][1]`` the confidence of each
      line (presumably the PaddleOCR layout -- confirm against the installed version)
    - region_height_ratio: portion of image height to scan (from bottom up);
      values above 1 are clamped to the full frame
    - num_strips: how many horizontal strips to evaluate
    - min_conf: minimum average confidence to consider a region valid
    - fallback_height_ratio: portion of image height (from the bottom) returned
      when no strip qualifies

    Returns:
    - crop_region: view into ``frame`` for the winning strip (or the fallback region)
    - region_box: (y_start, y_end) row bounds of that region in the original frame

    OCR errors on a strip are logged and that strip is skipped; they never propagate.
    """
    # Local import: the file's top-level import block is not visible from this block.
    import logging

    logger = logging.getLogger(__name__)

    # shape[0] works for both grayscale (H, W) and colour (H, W, C) frames.
    height = frame.shape[0]

    def _fallback():
        # Bottom `fallback_height_ratio` of the frame, full width.
        y = height - int(height * fallback_height_ratio)
        return frame[y:, :], (y, height)

    region_height = min(height, int(height * region_height_ratio))
    # Nothing sensible to scan: no strips requested, or strips would be zero rows high.
    if num_strips < 1 or region_height < num_strips:
        return _fallback()

    strip_height = region_height // num_strips
    base_y_start = height - region_height

    best_score = -1.0
    best_crop = None
    best_bounds = None

    for i in range(num_strips):
        y_start = base_y_start + i * strip_height
        # The last strip runs to the frame bottom so integer-division leftovers
        # (where subtitles often sit) are not silently dropped.
        y_end = height if i == num_strips - 1 else y_start + strip_height
        strip = frame[y_start:y_end, :]
        try:
            result = ocr_model.ocr(strip, cls=True)
            lines = result[0] if result else None
            if not lines:
                continue
            avg_score = sum(line[1][1] for line in lines) / len(lines)
        except Exception:
            # Best-effort scan: one bad strip must not abort the frame, but leave a trace.
            logger.warning(
                "OCR failed on strip %d (rows %d-%d); skipping", i, y_start, y_end, exc_info=True
            )
            continue
        if avg_score > best_score:
            best_score = avg_score
            best_crop = strip
            best_bounds = (y_start, y_end)

    if best_crop is not None and best_score >= min_conf:
        return best_crop, best_bounds
    return _fallback()
 def ocr_frame_worker(args):
     frame_idx, frame_time, frame = args
         frame = frame.astype(np.uint8)
     try:
+        subtitle_crop, _ = find_best_subtitle_region(frame, ocr_model)
+        result = ocr_model.ocr(subtitle_crop, cls=True)
         texts = [line[1][0] for line in result[0]] if result[0] else []
         combined_text = " ".join(texts).strip()
         return {"time": frame_time, "text": combined_text}