Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -494,6 +494,58 @@ def init_ocr_model():
|
|
| 494 |
if ocr_model is None:
|
| 495 |
ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
|
| 496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
def ocr_frame_worker(args):
|
| 498 |
frame_idx, frame_time, frame = args
|
| 499 |
|
|
@@ -509,7 +561,8 @@ def ocr_frame_worker(args):
|
|
| 509 |
frame = frame.astype(np.uint8)
|
| 510 |
|
| 511 |
try:
|
| 512 |
-
|
|
|
|
| 513 |
texts = [line[1][0] for line in result[0]] if result[0] else []
|
| 514 |
combined_text = " ".join(texts).strip()
|
| 515 |
return {"time": frame_time, "text": combined_text}
|
|
|
|
| 494 |
if ocr_model is None:
|
| 495 |
ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
|
| 496 |
|
| 497 |
+
def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
    """
    Pick the horizontal strip near the bottom of a frame where OCR is most confident.

    The bottom `region_height_ratio` of the frame is divided into `num_strips`
    horizontal strips. OCR is run on each strip and the strip with the highest
    mean recognition confidence is returned, provided that mean reaches
    `min_conf`. Otherwise the bottom 20% of the frame is returned as a fallback.

    Parameters:
    - frame: video frame as an np.ndarray, either (H, W, C) or grayscale (H, W)
    - ocr_model: object exposing `ocr(image, cls=True)`; each entry of
      `result[0]` is indexed as `line[1][1]` to read its confidence
    - region_height_ratio: portion of image height to scan (from bottom up);
      the resulting band is clamped to the frame
    - num_strips: how many horizontal strips to evaluate; values below 1
      skip the scan and go straight to the fallback
    - min_conf: minimum average confidence to consider a region valid

    Returns:
    - crop_region: the cropped image region (a slice of `frame`)
    - region_box: (y_start, y_end) of that region in the original frame
    """
    import logging  # local import so this block does not rely on a file-level import

    # shape[0] is the row count for both colour and grayscale frames.
    height = frame.shape[0]

    # Clamp the scan band so an out-of-range ratio cannot yield negative offsets.
    region_height = min(height, max(0, int(height * region_height_ratio)))
    base_y_start = height - region_height
    strip_height = region_height // num_strips if num_strips > 0 else 0

    best_score = -1
    best_crop = None
    best_bounds = None

    # A zero strip height would only feed empty images to OCR, so skip the scan.
    if strip_height > 0:
        last_index = num_strips - 1
        for i in range(num_strips):
            y_start = base_y_start + i * strip_height
            # The last strip absorbs the rows left over by the integer division,
            # so the very bottom of the frame is never excluded from the scan.
            y_end = height if i == last_index else y_start + strip_height
            strip = frame[y_start:y_end]

            try:
                result = ocr_model.ocr(strip, cls=True)
                if not result or not result[0]:
                    continue

                confidences = [line[1][1] for line in result[0]]
                avg_score = sum(confidences) / len(confidences)
            except Exception:
                # Best effort: one bad strip must not abort the search, but
                # leave a trace instead of failing silently.
                logging.getLogger(__name__).debug(
                    "OCR failed on strip %d (rows %d-%d)", i, y_start, y_end, exc_info=True
                )
                continue

            if avg_score > best_score:
                best_score = avg_score
                best_crop = strip
                best_bounds = (y_start, y_end)

    if best_crop is not None and best_score >= min_conf:
        return best_crop, best_bounds

    # Fallback: bottom 20% of the frame.
    fallback_y = height - int(height * 0.2)
    return frame[fallback_y:], (fallback_y, height)
|
| 548 |
+
|
| 549 |
def ocr_frame_worker(args):
|
| 550 |
frame_idx, frame_time, frame = args
|
| 551 |
|
|
|
|
| 561 |
frame = frame.astype(np.uint8)
|
| 562 |
|
| 563 |
try:
|
| 564 |
+
subtitle_crop, _ = find_best_subtitle_region(frame, ocr_model)
|
| 565 |
+
result = ocr_model.ocr(subtitle_crop, cls=True)
|
| 566 |
texts = [line[1][0] for line in result[0]] if result[0] else []
|
| 567 |
combined_text = " ".join(texts).strip()
|
| 568 |
return {"time": frame_time, "text": combined_text}
|