KevinHuSh
committed on
Commit
·
e34cb81
1
Parent(s):
657bc8a
refine pdf parser, add time zone to userinfo (#112)
Browse files- api/db/db_models.py +1 -0
- deepdoc/parser/pdf_parser.py +15 -10
- deepdoc/vision/layout_recognizer.py +2 -2
- deepdoc/vision/ocr.py +0 -1
- deepdoc/vision/recognizer.py +6 -6
- rag/app/naive.py +2 -2
- rag/nlp/search.py +2 -2
api/db/db_models.py
CHANGED
|
@@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin):
|
|
| 354 |
avatar = TextField(null=True, help_text="avatar base64 string")
|
| 355 |
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
|
| 356 |
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
|
|
|
|
| 357 |
last_login_time = DateTimeField(null=True)
|
| 358 |
is_authenticated = CharField(max_length=1, null=False, default="1")
|
| 359 |
is_active = CharField(max_length=1, null=False, default="1")
|
|
|
|
| 354 |
avatar = TextField(null=True, help_text="avatar base64 string")
|
| 355 |
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
|
| 356 |
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
|
| 357 |
+
timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
|
| 358 |
last_login_time = DateTimeField(null=True)
|
| 359 |
is_authenticated = CharField(max_length=1, null=False, default="1")
|
| 360 |
is_active = CharField(max_length=1, null=False, default="1")
|
deepdoc/parser/pdf_parser.py
CHANGED
|
@@ -313,9 +313,19 @@ class HuParser:
|
|
| 313 |
while i < len(bxs) - 1:
|
| 314 |
b = bxs[i]
|
| 315 |
b_ = bxs[i + 1]
|
| 316 |
-
if b.get("layoutno", "0") != b_.get("layoutno", "1"):
|
| 317 |
i += 1
|
| 318 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
dis_thr = 1
|
| 321 |
dis = b["x1"] - b_["x0"]
|
|
@@ -642,9 +652,9 @@ class HuParser:
|
|
| 642 |
|
| 643 |
tk, tv = nearest(tables)
|
| 644 |
fk, fv = nearest(figures)
|
| 645 |
-
if min(tv, fv) > 2000:
|
| 646 |
-
i += 1
|
| 647 |
-
continue
|
| 648 |
if tv < fv:
|
| 649 |
tables[tk].insert(0, c)
|
| 650 |
logging.debug(
|
|
@@ -711,12 +721,7 @@ class HuParser:
|
|
| 711 |
|
| 712 |
# crop figure out and add caption
|
| 713 |
for k, bxs in figures.items():
|
| 714 |
-
txt = "\n".join(
|
| 715 |
-
[b["text"] for b in bxs
|
| 716 |
-
if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
|
| 717 |
-
and len(b["text"].strip()) >= 4
|
| 718 |
-
]
|
| 719 |
-
)
|
| 720 |
if not txt:
|
| 721 |
continue
|
| 722 |
|
|
|
|
| 313 |
while i < len(bxs) - 1:
|
| 314 |
b = bxs[i]
|
| 315 |
b_ = bxs[i + 1]
|
| 316 |
+
if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
|
| 317 |
i += 1
|
| 318 |
continue
|
| 319 |
+
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
|
| 320 |
+
# merge
|
| 321 |
+
bxs[i]["x1"] = b_["x1"]
|
| 322 |
+
bxs[i]["top"] = (b["top"] + b_["top"]) / 2
|
| 323 |
+
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
|
| 324 |
+
bxs[i]["text"] += b_["text"]
|
| 325 |
+
bxs.pop(i + 1)
|
| 326 |
+
continue
|
| 327 |
+
i += 1
|
| 328 |
+
continue
|
| 329 |
|
| 330 |
dis_thr = 1
|
| 331 |
dis = b["x1"] - b_["x0"]
|
|
|
|
| 652 |
|
| 653 |
tk, tv = nearest(tables)
|
| 654 |
fk, fv = nearest(figures)
|
| 655 |
+
#if min(tv, fv) > 2000:
|
| 656 |
+
# i += 1
|
| 657 |
+
# continue
|
| 658 |
if tv < fv:
|
| 659 |
tables[tk].insert(0, c)
|
| 660 |
logging.debug(
|
|
|
|
| 721 |
|
| 722 |
# crop figure out and add caption
|
| 723 |
for k, bxs in figures.items():
|
| 724 |
+
txt = "\n".join([b["text"] for b in bxs])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
if not txt:
|
| 726 |
continue
|
| 727 |
|
deepdoc/vision/layout_recognizer.py
CHANGED
|
@@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer):
|
|
| 96 |
continue
|
| 97 |
|
| 98 |
bxs[i]["layoutno"] = f"{ty}-{ii}"
|
| 99 |
-
bxs[i]["layout_type"] = lts_[ii]["type"]
|
| 100 |
i += 1
|
| 101 |
|
| 102 |
for lt in ["footer", "header", "reference", "figure caption",
|
|
@@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer):
|
|
| 105 |
|
| 106 |
# add box to figure layouts which has not text box
|
| 107 |
for i, lt in enumerate(
|
| 108 |
-
[lt for lt in lts if lt["type"]
|
| 109 |
if lt.get("visited"):
|
| 110 |
continue
|
| 111 |
lt = deepcopy(lt)
|
|
|
|
| 96 |
continue
|
| 97 |
|
| 98 |
bxs[i]["layoutno"] = f"{ty}-{ii}"
|
| 99 |
+
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
|
| 100 |
i += 1
|
| 101 |
|
| 102 |
for lt in ["footer", "header", "reference", "figure caption",
|
|
|
|
| 105 |
|
| 106 |
# add box to figure layouts which has not text box
|
| 107 |
for i, lt in enumerate(
|
| 108 |
+
[lt for lt in lts if lt["type"] in ["figure","equation"]]):
|
| 109 |
if lt.get("visited"):
|
| 110 |
continue
|
| 111 |
lt = deepcopy(lt)
|
deepdoc/vision/ocr.py
CHANGED
|
@@ -21,7 +21,6 @@ from .operators import *
|
|
| 21 |
import numpy as np
|
| 22 |
import onnxruntime as ort
|
| 23 |
|
| 24 |
-
from api.utils.file_utils import get_project_base_directory
|
| 25 |
from .postprocess import build_post_process
|
| 26 |
from rag.settings import cron_logger
|
| 27 |
|
|
|
|
| 21 |
import numpy as np
|
| 22 |
import onnxruntime as ort
|
| 23 |
|
|
|
|
| 24 |
from .postprocess import build_post_process
|
| 25 |
from rag.settings import cron_logger
|
| 26 |
|
deepdoc/vision/recognizer.py
CHANGED
|
@@ -276,18 +276,18 @@ class Recognizer(object):
|
|
| 276 |
def find_overlapped_with_threashold(box, boxes, thr=0.3):
|
| 277 |
if not boxes:
|
| 278 |
return
|
| 279 |
-
|
| 280 |
s, e = 0, len(boxes)
|
| 281 |
for i in range(s, e):
|
| 282 |
ov = Recognizer.overlapped_area(box, boxes[i])
|
| 283 |
_ov = Recognizer.overlapped_area(boxes[i], box)
|
| 284 |
-
if (ov, _ov) < (
|
| 285 |
continue
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
|
| 290 |
-
return
|
| 291 |
|
| 292 |
def preprocess(self, image_list):
|
| 293 |
inputs = []
|
|
|
|
| 276 |
def find_overlapped_with_threashold(box, boxes, thr=0.3):
|
| 277 |
if not boxes:
|
| 278 |
return
|
| 279 |
+
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
|
| 280 |
s, e = 0, len(boxes)
|
| 281 |
for i in range(s, e):
|
| 282 |
ov = Recognizer.overlapped_area(box, boxes[i])
|
| 283 |
_ov = Recognizer.overlapped_area(boxes[i], box)
|
| 284 |
+
if (ov, _ov) < (max_overlapped, _max_overlapped):
|
| 285 |
continue
|
| 286 |
+
max_overlapped_i = i
|
| 287 |
+
max_overlapped = ov
|
| 288 |
+
_max_overlapped = _ov
|
| 289 |
|
| 290 |
+
return max_overlapped_i
|
| 291 |
|
| 292 |
def preprocess(self, image_list):
|
| 293 |
inputs = []
|
rag/app/naive.py
CHANGED
|
@@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|
| 101 |
d = copy.deepcopy(doc)
|
| 102 |
if pdf_parser:
|
| 103 |
d["image"], poss = pdf_parser.crop(ck, need_position=True)
|
| 104 |
-
add_positions(d, poss)
|
| 105 |
ck = pdf_parser.remove_tag(ck)
|
| 106 |
tokenize(d, ck, eng)
|
| 107 |
res.append(d)
|
|
@@ -112,7 +112,7 @@ if __name__ == "__main__":
|
|
| 112 |
import sys
|
| 113 |
|
| 114 |
|
| 115 |
-
def dummy(
|
| 116 |
pass
|
| 117 |
|
| 118 |
|
|
|
|
| 101 |
d = copy.deepcopy(doc)
|
| 102 |
if pdf_parser:
|
| 103 |
d["image"], poss = pdf_parser.crop(ck, need_position=True)
|
| 104 |
+
add_positions(d, poss, from_page)
|
| 105 |
ck = pdf_parser.remove_tag(ck)
|
| 106 |
tokenize(d, ck, eng)
|
| 107 |
res.append(d)
|
|
|
|
| 112 |
import sys
|
| 113 |
|
| 114 |
|
| 115 |
+
def dummy(prog=None, msg=""):
|
| 116 |
pass
|
| 117 |
|
| 118 |
|
rag/nlp/search.py
CHANGED
|
@@ -82,8 +82,8 @@ class Dealer:
|
|
| 82 |
)
|
| 83 |
else:
|
| 84 |
s = s.sort(
|
| 85 |
-
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
|
| 86 |
-
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
|
| 87 |
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
| 88 |
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
| 89 |
)
|
|
|
|
| 82 |
)
|
| 83 |
else:
|
| 84 |
s = s.sort(
|
| 85 |
+
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
| 86 |
+
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
| 87 |
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
| 88 |
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
| 89 |
)
|