Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +190 -47
mdr_pdf_parser.py
CHANGED
|
@@ -1,3 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# --- External Library Imports ---
|
| 2 |
import os
|
| 3 |
import re
|
|
@@ -12,7 +52,7 @@ import requests # For downloading models
|
|
| 12 |
from pathlib import Path
|
| 13 |
from enum import auto, Enum
|
| 14 |
from dataclasses import dataclass
|
| 15 |
-
from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any
|
| 16 |
from collections import defaultdict
|
| 17 |
from math import pi, ceil, sin, cos, sqrt, atan2
|
| 18 |
from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
|
|
@@ -562,10 +602,50 @@ class _MDR_KeepKeys:
|
|
| 562 |
|
| 563 |
def __call__(self, data):
    """Return the values of ``data`` for each key in ``self.keep_keys``, in that order."""
    return [
        data[key]
        for key in self.keep_keys
    ]
|
| 564 |
|
| 565 |
-
def mdr_ocr_transform(
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
|
| 570 |
def mdr_ocr_create_operators(op_param_list, global_config=None):
|
| 571 |
ops = []
|
|
@@ -850,42 +930,73 @@ class _MDR_TextClassifier(_MDR_PredictBase):
|
|
| 850 |
|
| 851 |
class _MDR_BaseRecLabelDecode:
|
| 852 |
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
|
| 890 |
class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
|
| 891 |
def __init__(self, char_path=None, use_space=False, **kwargs):
    """Initialise via the parent constructor.

    Only ``char_path`` and ``use_space`` are forwarded; any additional
    keyword arguments are accepted and ignored.
    """
    super().__init__(char_path, use_space)
|
|
@@ -1270,12 +1381,44 @@ _MDR_OCR_MODELS = {"det": ("ppocrv4","det","det.onnx"), "cls": ("ppocrv4","cls",
|
|
| 1270 |
_MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
|
| 1271 |
|
| 1272 |
@dataclass
|
| 1273 |
-
class _MDR_ONNXParams:
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1279 |
|
| 1280 |
class MDROcrEngine:
|
| 1281 |
"""Handles OCR detection and recognition using ONNX models."""
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
# /=====================================================================\ #
|
| 3 |
+
# | MagicDataReadiness - MAGIC PDF Parser | #
|
| 4 |
+
# |---------------------------------------------------------------------| #
|
| 5 |
+
# | Description: | #
|
| 6 |
+
# | Extracts structured content (text, tables, figures, formulas) | #
|
| 7 |
+
# | from PDF documents using layout analysis and OCR. | #
|
| 8 |
+
# | Combines logic from various internal components. | #
|
| 9 |
+
# |---------------------------------------------------------------------| #
|
| 10 |
+
# | Dependencies: | #
|
| 11 |
+
# | - Python 3.11+ | #
|
| 12 |
+
# | - External Libraries (See imports below and installation notes) | #
|
| 13 |
+
# | - Pre-trained CV Models (Downloaded automatically to model dir) | #
|
| 14 |
+
# |---------------------------------------------------------------------| #
|
| 15 |
+
# | Usage: | #
|
| 16 |
+
# | See the __main__ block at the end of the script for an example. | #
|
| 17 |
+
# \=====================================================================/ #
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# -*- coding: utf-8 -*-
|
| 22 |
+
# /=====================================================================\ #
|
| 23 |
+
# | MagicDataReadiness - MAGIC PDF Parser | #
|
| 24 |
+
# |---------------------------------------------------------------------| #
|
| 25 |
+
# | Description: | #
|
| 26 |
+
# | Extracts structured content (text, tables, figures, formulas) | #
|
| 27 |
+
# | from PDF documents using layout analysis and OCR. | #
|
| 28 |
+
# | Combines logic from various internal components. | #
|
| 29 |
+
# |---------------------------------------------------------------------| #
|
| 30 |
+
# | Dependencies: | #
|
| 31 |
+
# | - Python 3.11+ | #
|
| 32 |
+
# | - External Libraries (See imports below and installation notes) | #
|
| 33 |
+
# | - Pre-trained CV Models (Downloaded automatically to model dir) | #
|
| 34 |
+
# |---------------------------------------------------------------------| #
|
| 35 |
+
# | Usage: | #
|
| 36 |
+
# | See the __main__ block at the end of the script for an example. | #
|
| 37 |
+
# \=====================================================================/ #
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
# --- External Library Imports ---
|
| 42 |
import os
|
| 43 |
import re
|
|
|
|
| 52 |
from pathlib import Path
|
| 53 |
from enum import auto, Enum
|
| 54 |
from dataclasses import dataclass, field
|
| 55 |
+
from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any, Optional
|
| 56 |
from collections import defaultdict
|
| 57 |
from math import pi, ceil, sin, cos, sqrt, atan2
|
| 58 |
from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
|
|
|
|
| 602 |
|
| 603 |
def __call__(self, data):
    """Return the values of ``data`` for each key in ``self.keep_keys``, in that order."""
    return [
        data[key]
        for key in self.keep_keys
    ]
|
| 604 |
|
| 605 |
+
def mdr_ocr_transform(
    data: Any,
    ops: Optional[List[Callable[[Any], Optional[Any]]]] = None
) -> Optional[Any]:
    """
    Run ``data`` through a pipeline of callables, one after another.

    Each callable receives the output of the previous one. The first
    callable that returns None aborts the pipeline and None is returned
    without invoking the remaining callables.

    Args:
        data: Value handed to the first operation.
        ops: Operations to apply in order. With None or an empty list
            no operation runs and ``data`` is returned as given.

    Returns:
        The output of the last operation, or None as soon as any
        operation returns None.
    """
    # A missing pipeline is treated exactly like an empty one.
    pipeline = ops if ops is not None else []

    result = data
    for step in pipeline:
        result = step(result)
        if result is None:
            # A None result means "stop here"; later steps are skipped.
            return None
    return result
|
| 649 |
|
| 650 |
def mdr_ocr_create_operators(op_param_list, global_config=None):
|
| 651 |
ops = []
|
|
|
|
| 930 |
|
| 931 |
class _MDR_BaseRecLabelDecode:
    """Maps index sequences to text using a character table.

    The table comes from a dictionary file (one entry per line) or, when no
    file is given or the file is missing, from digits plus lowercase ASCII
    letters. Subclasses can adjust the table through ``add_special_char``
    and drop indices during decoding through ``get_ignored_tokens``.
    """

    def __init__(self, char_path=None, use_space=False):
        """Build the character table.

        Args:
            char_path: Path of a UTF-8 dictionary file, one entry per line.
                None selects the built-in digits + lowercase table.
            use_space: Append a space entry. Only applied when ``char_path``
                is given (whether or not the file can be opened).
        """
        fallback_charset = "0123456789abcdefghijklmnopqrstuvwxyz"

        self.beg = "sos"
        self.end = "eos"
        self.rev = False
        self.chars = []

        if char_path is not None:
            try:
                with open(char_path, "rb") as handle:
                    self.chars = [raw.decode("utf-8").strip("\n\r") for raw in handle]
                if use_space:
                    self.chars.append(" ")
                # Any entry in the U+0600..U+06FF range switches on reversal
                # of the decoded text.
                if any("\u0600" <= entry <= "\u06FF" for entry in self.chars):
                    self.rev = True
            except FileNotFoundError:
                print(f"Warn: Dict not found {char_path}")
                self.chars = list(fallback_charset)
                if use_space:
                    self.chars.append(" ")
        else:
            self.chars = list(fallback_charset)

        # Subclasses get a copy, so self.chars stays the raw table.
        table = self.add_special_char(list(self.chars))
        self.dict = {symbol: position for position, symbol in enumerate(table)}
        self.character = table

    def add_special_char(self, chars):
        """Hook for subclasses to extend the table; the base returns it unchanged."""
        return chars

    def get_ignored_tokens(self):
        """Indices to drop while decoding; the base class ignores none."""
        return []

    def _reverse(self, pred):
        """Reverse ``pred`` while keeping runs of ``[a-zA-Z0-9 :*./%+-]`` intact.

        Characters outside that set are reversed one by one; consecutive
        characters inside it are moved as a single unit.
        """
        pieces = []
        run = ""
        for ch in pred:
            if re.search("[a-zA-Z0-9 :*./%+-]", ch):
                run += ch
                continue
            # A non-matching character closes the current run (if any).
            if run != "":
                pieces.append(run)
            pieces.append(ch)
            run = ""
        if run != "":
            pieces.append(run)
        pieces.reverse()
        return "".join(pieces)

    def decode(self, idxs, probs=None, remove_dup=False):
        """Turn a batch of index sequences into ``(text, confidence)`` tuples.

        Args:
            idxs: Batch of index rows. Each row is compared elementwise and
                indexed with a boolean mask, so rows are expected to be
                NumPy arrays.
            probs: Optional per-position scores, indexed like ``idxs``. When
                omitted every kept character counts as confidence 1.
            remove_dup: Drop a position whose index equals the previous one.

        Returns:
            One ``(text, mean_confidence)`` tuple per row; a row with no
            confidences left yields a confidence of 0.0.
        """
        skip_tokens = self.get_ignored_tokens()
        decoded = []
        for row in range(len(idxs)):
            seq = idxs[row]
            keep = np.ones(len(seq), dtype=bool)
            if remove_dup:
                keep[1:] = seq[1:] != seq[:-1]
            for token in skip_tokens:
                keep &= seq != token

            # Indices outside the table are silently skipped.
            symbols = [
                self.character[tid]
                for tid in seq[keep]
                if 0 <= tid < len(self.character)
            ]

            if probs is None:
                scores = [1] * len(symbols)
            else:
                scores = probs[row][keep]
            if len(scores) == 0:
                scores = [0]

            text = "".join(symbols)
            if self.rev:
                text = self._reverse(text)
            decoded.append((text, float(np.mean(scores))))
        return decoded
|
| 1000 |
|
| 1001 |
class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
|
| 1002 |
def __init__(self, char_path=None, use_space=False, **kwargs):
    """Initialise via the parent constructor.

    Only ``char_path`` and ``use_space`` are forwarded; any additional
    keyword arguments are accepted and ignored.
    """
    super().__init__(char_path, use_space)
|
|
|
|
| 1381 |
_MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
|
| 1382 |
|
| 1383 |
@dataclass
class _MDR_ONNXParams:
    """Bundle of OCR settings: model locations plus per-stage options.

    The first five fields have no default and must be supplied by the
    caller, positionally or by keyword. Every other field falls back to the
    default listed below.

    NOTE(review): the code that reads these fields is not visible from this
    class; the group comments below follow the field names only. Confirm
    the exact meaning against the predictors that consume them.
    """

    # Attributes without default values
    use_gpu: bool
    det_model_dir: str
    cls_model_dir: str
    rec_model_dir: str
    rec_char_dict_path: str

    # Attributes with default values (Group 1)
    use_angle_cls: bool = True
    rec_image_shape: str = "3,48,320"
    cls_image_shape: str = "3,48,192"
    cls_batch_num: int = 6
    cls_thresh: float = 0.9
    # A bare list default makes @dataclass raise ValueError ("mutable default
    # ... is not allowed") while the class is being created, so the list is
    # built per instance through default_factory instead.
    label_list: list = field(default_factory=lambda: ['0', '180'])

    # Attributes with default values (Group 2 - Detection)
    det_algorithm: str = "DB"
    det_limit_side_len: int = 960
    det_limit_type: str = 'max'
    det_db_thresh: float = 0.3
    det_db_box_thresh: float = 0.6
    det_db_unclip_ratio: float = 1.5
    use_dilation: bool = False
    det_db_score_mode: str = 'fast'
    det_box_type: str = 'quad'

    # Attributes with default values (Group 3 - Recognition & General)
    rec_batch_num: int = 6
    drop_score: float = 0.5
    rec_algorithm: str = "SVTR_LCNet"
    use_space_char: bool = True

    # Attributes with default values (Group 4 - Output & Logging)
    save_crop_res: bool = False
    crop_res_save_dir: str = "./output/mdr_crop_res"
    show_log: bool = False
    use_onnx: bool = True
|
| 1422 |
|
| 1423 |
class MDROcrEngine:
|
| 1424 |
"""Handles OCR detection and recognition using ONNX models."""
|