Commit 2b9f3af
Parent(s): fa6b4a3
feat: initial commit

Files changed:
- .gitignore       +2   -0
- README.md        +54  -1
- app.py           +130 -4
- meta_prompt.py   +26  -0
- requirements.txt +6   -0
- utils.py         +373 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+.env
+*.pyc
README.md CHANGED
@@ -9,5 +9,58 @@ app_file: app.py
 pinned: false
 license: unknown
 ---
-
-
+## Typhoon OCR
+
+Typhoon OCR is a simple Gradio web app for extracting structured markdown from PDFs or images using an OpenAI-compatible vision-language model. It supports document layout analysis and table extraction, returning results in markdown or HTML.
+
+### Features
+- Upload a PDF or image (single page)
+- Extracts and reconstructs document content as markdown
+- Supports different prompt modes for layout or structure
+- Uses a local or remote OpenAI-compatible API (e.g., vllm)
+
+### Install
+```bash
+pip install -r requirements.txt
+# edit .env
+# pip install vllm # optional for hosting a local server
+```
+
+### Mac specific
+```bash
+brew install poppler
+# The following binaries are required and provided by poppler:
+# - pdfinfo
+# - pdftoppm
+```
+### Linux specific
+```bash
+sudo apt-get update
+sudo apt-get install poppler-utils
+# The following binaries are required and provided by poppler-utils:
+# - pdfinfo
+# - pdftoppm
+```
+
+
+### Start vllm
+```bash
+vllm serve scb10x/typhoon-ocr-7b --served-model-name typhoon-ocr --dtype bfloat16 --port 8101
+```
+
+### Run Gradio demo
+```bash
+python app.py
+```
+
+### Dependencies
+- openai
+- python-dotenv
+- ftfy
+- pypdf
+- gradio
+- vllm (for hosting an inference server)
+- pillow
+
+### License
+This project is licensed under the Apache 2.0 License. See individual datasets and checkpoints for their respective licenses.
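The Install step above says only `# edit .env`; app.py reads three variables from that file via python-dotenv. A minimal sketch with placeholder values, assuming the local vllm command shown in the README (a server started without `--api-key` typically accepts any key string):

```bash
# .env — placeholder values, adjust to your deployment
TYPHOON_BASE_URL=http://localhost:8101/v1   # vllm's OpenAI-compatible endpoint for the command above
TYPHOON_API_KEY=dummy-key                   # any string if the server enforces no auth
TYPHOON_OCR_MODEL=typhoon-ocr               # matches --served-model-name
```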
app.py CHANGED
@@ -1,7 +1,133 @@
+import base64
+from io import BytesIO
+import json
+import os
+from meta_prompt import get_prompt
+from openai import OpenAI
+from dotenv import load_dotenv
+from utils import render_pdf_to_base64png, image_to_pdf, get_anchor_text
 import gradio as gr
+from PIL import Image
 
-def greet(name):
-    return "Hello " + name + "!!"
+load_dotenv()
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
+
+theme = gr.themes.Soft(
+    primary_hue=gr.themes.Color(
+        c50="#f7f7fd",
+        c100="#dfdef8",
+        c200="#c4c1f2",
+        c300="#a29eea",
+        c400="#8f8ae6",
+        c500="#756fe0",
+        c600="#635cc1",
+        c700="#4f4a9b",
+        c800="#433f83",
+        c900="#302d5e",
+        c950="#302d5e",
+    ),
+    secondary_hue="rose",
+    neutral_hue="stone",
+)
+
+def process_pdf(pdf_or_image_file, task_type):
+    if pdf_or_image_file is None:
+        return None, "No file uploaded"
+
+    orig_filename = pdf_or_image_file.name
+    ext = os.path.splitext(orig_filename)[1].lower()
+    filename = orig_filename  # default to the original file if it is already a PDF
+
+    # If the file is not a PDF, assume it's an image and convert it to PDF.
+    if ext not in [".pdf"]:
+        filename = image_to_pdf(orig_filename)
+        if filename is None:
+            return None, "Error converting image to PDF"
+
+    # Render the first page to base64 PNG and then load it into a PIL image.
+    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1800)
+    image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
+
+    # Extract anchor text from the PDF (first page).
+    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=8000)
+
+    # Retrieve the prompt template and fill it with the anchor text.
+    prompt_template_fn = get_prompt(task_type)
+    PROMPT = prompt_template_fn(anchor_text)
+
+    # Create a messages structure including the text prompt and the image URL.
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": PROMPT},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+        ],
+    }]
+    # Send the messages to the OpenAI-compatible API.
+    response = openai.chat.completions.create(
+        model=os.getenv("TYPHOON_OCR_MODEL"),
+        messages=messages,
+        max_tokens=16384,
+        extra_body={
+            "repetition_penalty": 1.2,
+            "temperature": 0.1,
+            "top_p": 0.6,
+        },
+    )
+    text_output = response.choices[0].message.content
+
+    # Try to parse the output, assuming it is JSON with a 'natural_text' key.
+    try:
+        json_data = json.loads(text_output)
+        markdown_out = json_data.get("natural_text", "").replace("<figure>", "").replace("</figure>", "")
+    except Exception as e:
+        markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
+
+    return image_pil, markdown_out
+
+
+# Build the Gradio UI.
+with gr.Blocks(theme=theme) as demo:
+    title = gr.HTML("""
+    <h1>Typhoon OCR</h1>
+    <ul>
+        <li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
+    </ul>
+    <br />
+    <details>
+    <summary><strong>Disclaimer</strong></summary>
+    The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI, and the information it produces is not intended to malign any religion, ethnic group, club, organization, company, or individual. The AI operates based on its programming and training data, and its responses should not be interpreted as the explicit intent or opinion of the organizations.
+    </details>
+    <br />
+    <details>
+    <summary><strong>Terms of use</strong></summary>
+    By using this service, users are required to agree to the following terms: the service is a research preview intended for non-commercial use only. It provides only limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision-language models are prone to hallucinations to a greater extent than text-only LLMs.
+    </details>
+    <br />
+    <details>
+    <summary><strong>License</strong></summary>
+    This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache License 2.0.
+    </details>
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # file_types accepts PDF as well as common image formats.
+            pdf_input = gr.File(label="📄 Upload PDF or image (only the first page will be processed)", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
+            task_dropdown = gr.Dropdown(["default", "structure"], label="🎯 Select Task", value="default")
+            run_button = gr.Button("🚀 Run")
+            image_output = gr.Image(label="📸 Preview Image (Page 1)", type="pil")
+        with gr.Column(scale=2):
+            markdown_output = gr.Markdown(label="Markdown Result", show_label=True)
+
+
+    # Connect the UI inputs to the processing function.
+    run_button.click(
+        fn=process_pdf,
+        inputs=[pdf_input, task_dropdown],
+        outputs=[image_output, markdown_output]
+    )
+
+# Launch the Gradio demo (share=False keeps it local).
+demo.launch(share=False)
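The Gradio layer is a thin wrapper around one call pattern: render page 1, build anchor text, fill the prompt template, call the chat-completions endpoint, and parse `natural_text` out of the JSON reply. A minimal sketch of the same flow without the UI, assuming the `.env` above is populated, the vllm server is running, and `sample.pdf` is a hypothetical local file:

```python
import json
import os

from dotenv import load_dotenv
from openai import OpenAI

from meta_prompt import get_prompt
from utils import get_anchor_text, render_pdf_to_base64png

load_dotenv()
client = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))

pdf_path = "sample.pdf"  # hypothetical input file

# Same preprocessing as process_pdf: page image plus layout-aware anchor text.
image_b64 = render_pdf_to_base64png(pdf_path, 1, target_longest_image_dim=1800)
anchor = get_anchor_text(pdf_path, 1, pdf_engine="pdfreport", target_length=8000)
prompt = get_prompt("default")(anchor)

response = client.chat.completions.create(
    model=os.getenv("TYPHOON_OCR_MODEL"),
    messages=[{"role": "user", "content": [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
    ]}],
    max_tokens=16384,
)
# Assumes the model complied with the prompt and returned valid JSON.
print(json.loads(response.choices[0].message.content)["natural_text"])
```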
meta_prompt.py ADDED
@@ -0,0 +1,26 @@
+from typing import Callable
+
+PROMPTS_SYS = {
+    "default": lambda base_text: (
+        f"Below is an image of a document page along with its dimensions. "
+        f"Simply return the markdown representation of this document, presenting tables in markdown format as they naturally appear.\n"
+        f"If the document contains images, use a placeholder like dummy.png for each image.\n"
+        f"Your final output must be in JSON format with a single key `natural_text` containing the response.\n"
+        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+    ),
+    "structure": lambda base_text: (
+        f"Below is an image of a document page, along with its dimensions and possibly some raw textual content previously extracted from it. "
+        f"Note that the text extraction may be incomplete or partially missing. Carefully consider both the layout and any available text to reconstruct the document accurately.\n"
+        f"Your task is to return the markdown representation of this document, presenting tables in HTML format as they naturally appear.\n"
+        f"If the document contains images or figures, analyze them and include the tag <figure>IMAGE_ANALYSIS</figure> in the appropriate location.\n"
+        f"Your final output must be in JSON format with a single key `natural_text` containing the response.\n"
+        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+    ),
+}
+
+def get_prompt(prompt_name: str) -> Callable[[str], str]:
+    """
+    Fetches the prompt template registered under prompt_name.
+
+    :param prompt_name: The identifier for the desired prompt.
+    :return: A callable that takes the anchor text and returns the filled prompt;
+             unknown names fall back to a constant error-message template.
+    """
+    return PROMPTS_SYS.get(prompt_name, lambda x: "Invalid PROMPT_NAME provided.")
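To make the template contract concrete: `get_prompt` returns a callable, and the anchor text is spliced between the `RAW_TEXT_START`/`RAW_TEXT_END` markers. A short usage sketch (the sample anchor string mimics the `[XxY]text` report format produced by utils.py):

```python
from meta_prompt import get_prompt

# "default" requests markdown tables; "structure" requests HTML tables plus <figure> tags.
template = get_prompt("default")
print(template("Page dimensions: 612.0x792.0\n[72x708]Hello world"))

# Unknown names fall back to a constant-message template instead of raising.
print(get_prompt("no-such-task")("ignored"))  # -> "Invalid PROMPT_NAME provided."
```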
requirements.txt ADDED
@@ -0,0 +1,6 @@
+openai
+python-dotenv
+ftfy
+pypdf
+gradio
+pillow
utils.py ADDED
@@ -0,0 +1,373 @@
+"""
+This code is copied from https://github.com/allenai/olmocr
+Under the Apache 2.0 license.
+All credit goes to the original authors.
+"""
+from dataclasses import dataclass
+import re
+import tempfile
+from PIL import Image
+import subprocess
+import base64
+from typing import List, Literal
+import random
+import ftfy
+from pypdf.generic import RectangleObject
+from pypdf import PdfReader
+
+@dataclass(frozen=True)
+class Element:
+    pass
+
+
+@dataclass(frozen=True)
+class BoundingBox:
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+
+    @staticmethod
+    def from_rectangle(rect: RectangleObject) -> "BoundingBox":
+        return BoundingBox(rect[0], rect[1], rect[2], rect[3])
+
+
+@dataclass(frozen=True)
+class TextElement(Element):
+    text: str
+    x: float
+    y: float
+
+
+@dataclass(frozen=True)
+class ImageElement(Element):
+    name: str
+    bbox: BoundingBox
+
+
+@dataclass(frozen=True)
+class PageReport:
+    mediabox: BoundingBox
+    text_elements: List[TextElement]
+    image_elements: List[ImageElement]
+
+def image_to_pdf(image_path):
+    try:
+        # Open the image file.
+        img = Image.open(image_path)
+        # Create a temporary file to store the PDF.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+            filename = tmp.name
+        # Convert image to RGB if necessary and save as PDF.
+        if img.mode != "RGB":
+            img = img.convert("RGB")
+        img.save(filename, "PDF")
+        return filename
+    except Exception:
+        return None
+
+def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
+    """
+    Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.
+
+    :param local_pdf_path: Path to the PDF file
+    :param page_num: The page number for which to extract MediaBox dimensions
+    :return: The MediaBox width and height as a tuple of floats; raises ValueError if not found
+    """
+    # Construct the pdfinfo command to extract info for the specific page
+    command = ["pdfinfo", "-f", str(page_num), "-l", str(page_num), "-box", "-enc", "UTF-8", local_pdf_path]
+
+    # Run the command using subprocess
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+
+    # Check if there is any error in executing the command
+    if result.returncode != 0:
+        raise ValueError(f"Error running pdfinfo: {result.stderr}")
+
+    # Parse the output to find MediaBox
+    output = result.stdout
+
+    for line in output.splitlines():
+        if "MediaBox" in line:
+            media_box_str: List[str] = line.split(":")[1].strip().split()
+            media_box: List[float] = [float(x) for x in media_box_str]
+            return abs(media_box[0] - media_box[2]), abs(media_box[3] - media_box[1])
+
+    raise ValueError("MediaBox not found in the PDF info.")
+
+def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int = 2048) -> str:
+    longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))
+
+    # Convert PDF page to PNG using pdftoppm
+    pdftoppm_result = subprocess.run(
+        [
+            "pdftoppm",
+            "-png",
+            "-f",
+            str(page_num),
+            "-l",
+            str(page_num),
+            "-r",
+            str(target_longest_image_dim * 72 / longest_dim),  # PDF pages are measured at 72 points per inch
+            local_pdf_path,
+        ],
+        timeout=120,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
+    return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
+
+
+def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
+    result = ""
+    result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
+
+    if max_length < 20:
+        return result
+
+    images = _merge_image_elements(report.image_elements)
+
+    # Process image elements
+    image_strings = []
+    for element in images:
+        image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]\n"
+        # Use element's unique identifier (e.g., id or position) for comparison
+        image_strings.append((element, image_str))
+
+    # Process text elements
+    text_strings = []
+    for element in report.text_elements:  # type: ignore
+        if len(element.text.strip()) == 0:  # type: ignore
+            continue
+
+        element_text = _cleanup_element_text(element.text)  # type: ignore
+        text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n"  # type: ignore
+        text_strings.append((element, text_str))
+
+    # Combine all elements with their positions for sorting
+    all_elements: list[tuple[str, ImageElement, str, tuple[float, float]]] = []
+    for elem, s in image_strings:
+        position = (elem.bbox.x0, elem.bbox.y0)
+        all_elements.append(("image", elem, s, position))
+    for elem, s in text_strings:
+        position = (elem.x, elem.y)  # type: ignore
+        all_elements.append(("text", elem, s, position))
+
+    # Calculate total length
+    total_length = len(result) + sum(len(s) for _, _, s, _ in all_elements)
+
+    if total_length <= max_length:
+        # Include all elements
+        for _, _, s, _ in all_elements:
+            result += s
+        return result
+
+    # Identify elements with min/max coordinates
+    edge_elements = set()
+
+    if images:
+        min_x0_image = min(images, key=lambda e: e.bbox.x0)
+        max_x1_image = max(images, key=lambda e: e.bbox.x1)
+        min_y0_image = min(images, key=lambda e: e.bbox.y0)
+        max_y1_image = max(images, key=lambda e: e.bbox.y1)
+        edge_elements.update([min_x0_image, max_x1_image, min_y0_image, max_y1_image])
+
+    if report.text_elements:
+        text_elements = [e for e in report.text_elements if len(e.text.strip()) > 0]
+        if text_elements:
+            min_x_text = min(text_elements, key=lambda e: e.x)
+            max_x_text = max(text_elements, key=lambda e: e.x)
+            min_y_text = min(text_elements, key=lambda e: e.y)
+            max_y_text = max(text_elements, key=lambda e: e.y)
+            edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text])  # type: ignore
+
+    # Keep track of element IDs to prevent duplication
+    selected_element_ids = set()
+    selected_elements = []
+
+    # Include edge elements first
+    for elem_type, elem, s, position in all_elements:
+        if elem in edge_elements and id(elem) not in selected_element_ids:
+            selected_elements.append((elem_type, elem, s, position))
+            selected_element_ids.add(id(elem))
+
+    # Calculate remaining length
+    current_length = len(result) + sum(len(s) for _, _, s, _ in selected_elements)
+    _remaining_length = max_length - current_length
+
+    # Exclude edge elements from the pool
+    remaining_elements = [(elem_type, elem, s, position) for elem_type, elem, s, position in all_elements if id(elem) not in selected_element_ids]
+
+    # Sort remaining elements by their positions (e.g., x-coordinate and then y-coordinate)
+    # remaining_elements.sort(key=lambda x: (x[3][0], x[3][1]))
+
+    # Shuffle remaining elements randomly
+    random.shuffle(remaining_elements)
+
+    # Add elements until reaching max_length
+    for elem_type, elem, s, position in remaining_elements:
+        if current_length + len(s) > max_length:
+            break
+        selected_elements.append((elem_type, elem, s, position))
+        selected_element_ids.add(id(elem))
+        current_length += len(s)
+
+    # Sort selected elements by their positions to maintain logical order
+    selected_elements.sort(key=lambda x: (x[3][0], x[3][1]))
+
+    # Build the final result
+    for _, _, s, _ in selected_elements:
+        result += s
+
+    return result
+
+
+def _cap_split_string(text: str, max_length: int) -> str:
+    if len(text) <= max_length:
+        return text
+
+    head_length = max_length // 2 - 3
+    tail_length = head_length
+
+    head = text[:head_length].rsplit(" ", 1)[0] or text[:head_length]
+    tail = text[-tail_length:].split(" ", 1)[-1] or text[-tail_length:]
+
+    return f"{head} ... {tail}"
+
+
+def _cleanup_element_text(element_text: str) -> str:
+    MAX_TEXT_ELEMENT_LENGTH = 250
+    TEXT_REPLACEMENTS = {"[": "\\[", "]": "\\]", "\n": "\\n", "\r": "\\r", "\t": "\\t"}
+    text_replacement_pattern = re.compile("|".join(re.escape(key) for key in TEXT_REPLACEMENTS.keys()))
+
+    element_text = ftfy.fix_text(element_text).strip()
+
+    # Replace square brackets with escaped brackets and other escaped chars
+    element_text = text_replacement_pattern.sub(lambda match: TEXT_REPLACEMENTS[match.group(0)], element_text)
+
+    return _cap_split_string(element_text, MAX_TEXT_ELEMENT_LENGTH)
+
+def _merge_image_elements(images: List[ImageElement], tolerance: float = 0.5) -> List[ImageElement]:
+    n = len(images)
+    parent = list(range(n))  # Initialize Union-Find parent pointers
+
+    def find(i):
+        # Find with path compression
+        root = i
+        while parent[root] != root:
+            root = parent[root]
+        while parent[i] != i:
+            parent_i = parent[i]
+            parent[i] = root
+            i = parent_i
+        return root
+
+    def union(i, j):
+        # Union by attaching root of one tree to another
+        root_i = find(i)
+        root_j = find(j)
+        if root_i != root_j:
+            parent[root_i] = root_j
+
+    def bboxes_overlap(b1: BoundingBox, b2: BoundingBox, tolerance: float) -> bool:
+        # Compute horizontal and vertical distances between boxes
+        h_dist = max(0, max(b1.x0, b2.x0) - min(b1.x1, b2.x1))
+        v_dist = max(0, max(b1.y0, b2.y0) - min(b1.y1, b2.y1))
+        # Check if distances are within tolerance
+        return h_dist <= tolerance and v_dist <= tolerance
+
+    # Union overlapping images
+    for i in range(n):
+        for j in range(i + 1, n):
+            if bboxes_overlap(images[i].bbox, images[j].bbox, tolerance):
+                union(i, j)
+
+    # Group images by their root parent
+    groups: dict[int, list[int]] = {}
+    for i in range(n):
+        root = find(i)
+        groups.setdefault(root, []).append(i)
+
+    # Merge images in the same group
+    merged_images = []
+    for indices in groups.values():
+        # Initialize merged bounding box
+        merged_bbox = images[indices[0]].bbox
+        merged_name = images[indices[0]].name
+
+        for idx in indices[1:]:
+            bbox = images[idx].bbox
+            # Expand merged_bbox to include the current bbox
+            merged_bbox = BoundingBox(
+                x0=min(merged_bbox.x0, bbox.x0),
+                y0=min(merged_bbox.y0, bbox.y0),
+                x1=max(merged_bbox.x1, bbox.x1),
+                y1=max(merged_bbox.y1, bbox.y1),
+            )
+            # Optionally, update the name
+            merged_name += f"+{images[idx].name}"
+
+        merged_images.append(ImageElement(name=merged_name, bbox=merged_bbox))
+
+    # Return the merged images along with other elements
+    return merged_images
+
+def _transform_point(x, y, m):
+    x_new = m[0] * x + m[2] * y + m[4]
+    y_new = m[1] * x + m[3] * y + m[5]
+    return x_new, y_new
+
+def _mult(m: List[float], n: List[float]) -> List[float]:
+    return [
+        m[0] * n[0] + m[1] * n[2],
+        m[0] * n[1] + m[1] * n[3],
+        m[2] * n[0] + m[3] * n[2],
+        m[2] * n[1] + m[3] * n[3],
+        m[4] * n[0] + m[5] * n[2] + n[4],
+        m[4] * n[1] + m[5] * n[3] + n[5],
+    ]
+
+def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
+    reader = PdfReader(local_pdf_path)
+    page = reader.pages[page_num - 1]
+    resources = page.get("/Resources", {})
+    xobjects = resources.get("/XObject", {})
+    text_elements, image_elements = [], []
+
+    def visitor_body(text, cm, tm, font_dict, font_size):
+        txt2user = _mult(tm, cm)
+        text_elements.append(TextElement(text, txt2user[4], txt2user[5]))
+
+    def visitor_op(op, args, cm, tm):
+        if op == b"Do":
+            xobject_name = args[0]
+            xobject = xobjects.get(xobject_name)
+            if xobject and xobject["/Subtype"] == "/Image":
+                # Compute the image bbox: the image is placed according to the CTM
+                _width = xobject.get("/Width")
+                _height = xobject.get("/Height")
+                x0, y0 = _transform_point(0, 0, cm)
+                x1, y1 = _transform_point(1, 1, cm)
+                image_elements.append(ImageElement(xobject_name, BoundingBox(min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))))
+
+    page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)
+
+    return PageReport(
+        mediabox=BoundingBox.from_rectangle(page.mediabox),
+        text_elements=text_elements,
+        image_elements=image_elements,
+    )
+
+def get_anchor_text(
+    local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
+) -> str:
+    assert page > 0, "Pages are 1-indexed in pdf-land"
+
+    if pdf_engine == "pdfreport":
+        return _linearize_pdf_report(_pdf_report(local_pdf_path, page), max_length=target_length)
+    else:
+        raise NotImplementedError("Unknown engine")
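A short usage sketch for the two entry points app.py relies on, assuming poppler's `pdfinfo` and `pdftoppm` are on PATH and `sample.pdf` is a hypothetical one-page file:

```python
from utils import get_anchor_text, render_pdf_to_base64png

pdf_path = "sample.pdf"  # hypothetical local file

# Page 1 rendered to a base64-encoded PNG whose longest side is ~1800 px.
b64_png = render_pdf_to_base64png(pdf_path, 1, target_longest_image_dim=1800)

# Compact layout report ("Page dimensions: ..." plus positioned text lines)
# used as the model's anchor text; only the "pdfreport" engine is implemented.
print(get_anchor_text(pdf_path, 1, pdf_engine="pdfreport", target_length=8000))
```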