api v1alpha1 (#17)
* api v1alpha1
Signed-off-by: Guillaume Moutier <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
* use actual types in request models and refactor
Signed-off-by: Michele Dolfi <[email protected]>
* make gradio optional and update README
Signed-off-by: Michele Dolfi <[email protected]>
* Run workflow jobs sequentially to avoid disk space outage (#19)
GitHub Actions runners were running out of disk space while
building both images in parallel.
This change builds the images sequentially and also
cleans up the CPU image before starting the GPU image build.
Signed-off-by: Anil Vishnoi <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
* Add GitHub job to build the image (and not publish) on PR creation (#20)
Signed-off-by: Anil Vishnoi <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
* add start_server script for local dev
Signed-off-by: Michele Dolfi <[email protected]>
* fix 3.12-only syntax
Signed-off-by: Michele Dolfi <[email protected]>
* fix more py3.10-11 compatibility
Signed-off-by: Michele Dolfi <[email protected]>
* rework output format and background tasks
Signed-off-by: Michele Dolfi <[email protected]>
* specify return schemas for openapi
Signed-off-by: Michele Dolfi <[email protected]>
* add processing time and update README
Signed-off-by: Michele Dolfi <[email protected]>
* lint markdown
Signed-off-by: Michele Dolfi <[email protected]>
* add MD033 to config
Signed-off-by: Michele Dolfi <[email protected]>
* use port 5000
Signed-off-by: Michele Dolfi <[email protected]>
* use port 5001 as default
Signed-off-by: Michele Dolfi <[email protected]>
* update deps
Signed-off-by: Michele Dolfi <[email protected]>
* refactor input request
Signed-off-by: Michele Dolfi <[email protected]>
* return docling document
Signed-off-by: Michele Dolfi <[email protected]>
* update new payload in README
Signed-off-by: Michele Dolfi <[email protected]>
* add base64 example
Signed-off-by: Michele Dolfi <[email protected]>
* wrap example in <details>
Signed-off-by: Michele Dolfi <[email protected]>
* rename /url to /source
Signed-off-by: Michele Dolfi <[email protected]>
* move main execution to __main__
Signed-off-by: Michele Dolfi <[email protected]>
---------
Signed-off-by: Guillaume Moutier <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Anil Vishnoi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Co-authored-by: Anil Vishnoi <[email protected]>
- .dockerignore +40 -0
- .markdownlint-cli2.yaml +2 -0
- .pre-commit-config.yaml +8 -0
- Containerfile +1 -1
- Makefile +2 -2
- README.md +332 -27
- docling_serve/.env.example +3 -0
- docling_serve/__main__.py +20 -0
- docling_serve/app.py +167 -222
- docling_serve/docling_conversion.py +400 -0
- docling_serve/gradio_ui.py +635 -0
- docling_serve/helper_functions.py +62 -0
- docling_serve/response_preparation.py +248 -0
- img/swagger.png +3 -0
- img/ui-input.png +3 -0
- img/ui-output.png +3 -0
- poetry.lock +0 -0
- pyproject.toml +27 -4
- start_server.sh +30 -0
- tests/2206.01062v1.pdf +3 -0
- tests/2408.09869v5.pdf +3 -0
- tests/test_1-file-all-outputs.py +129 -0
- tests/test_1-url-all-outputs.py +123 -0
- tests/test_2-files-all-outputs.py +74 -0
- tests/test_2-urls-all-outputs.py +67 -0
.dockerignore
@@ -0,0 +1,40 @@
+# Ignore Python cache files
+__pycache__/
+**/__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+# Ignore virtual environments
+env/
+venv/
+
+# Ignore development artifacts
+*.log
+*.db
+*.sqlite3
+
+# Ignore configuration and sensitive files
+**/.env
+*.env
+*.ini
+*.cfg
+
+# Ignore IDE and editor settings
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Ignore Git files
+.git/
+.gitignore
+
+# Ignore Docker files themselves (optional if not needed in the image)
+.dockerignore
+Dockerfile*
+
+# Ignore build artifacts (if applicable)
+build/
+dist/
+*.egg-info
.markdownlint-cli2.yaml
@@ -2,5 +2,7 @@ config:
   line-length: false
   no-emphasis-as-header: false
   first-line-heading: false
+  MD033:
+    allowed_elements: ["details", "summary"]
 globs:
   - "**/*.md"
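Allow-listing the `details` and `summary` elements keeps the MD033 (no inline HTML) rule from firing on the collapsible example blocks that the README below wraps in `<details>`/`<summary>` tags.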
.pre-commit-config.yaml
@@ -16,6 +16,14 @@ repos:
       pass_filenames: false
       language: system
       files: '\.py$'
+  - repo: local
+    hooks:
+      - id: autoflake
+        name: autoflake
+        entry: poetry run autoflake docling_serve tests
+        pass_filenames: false
+        language: system
+        files: '\.py$'
   - repo: local
     hooks:
       - id: system
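The new hook runs autoflake (which removes unused imports and unused variables) over `docling_serve` and `tests`; invoking it through `poetry run` resolves the tool from the project's own environment, consistent with the other local hooks.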
Containerfile
@@ -58,4 +58,4 @@ COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve

 EXPOSE 5001

-CMD ["
+CMD ["python", "-m", "docling_serve"]
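The exec-form `CMD ["python", "-m", "docling_serve"]` delegates startup to the new `docling_serve/__main__.py` module added later in this diff, which reads `PORT`, `UVICORN_WORKERS`, and `RELOAD` from the environment before invoking uvicorn.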
Makefile
@@ -25,14 +25,14 @@ md-lint-file:
 	$(CMD_PREFIX) touch .markdown-lint

 .PHONY: docling-serve-cpu-image
-docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only"
+docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve CPU ONLY]"
 	$(CMD_PREFIX) docker build --build-arg CPU_ONLY=true -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) ghcr.io/ds4sd/docling-serve-cpu:main
 	$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) quay.io/ds4sd/docling-serve-cpu:main

 .PHONY: docling-serve-gpu-image
-docling-serve-gpu-image: Containerfile ## Build docling-serve
+docling-serve-gpu-image: Containerfile ## Build docling-serve container image with GPU support
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve with GPU]"
 	$(CMD_PREFIX) docker build --build-arg CPU_ONLY=false -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main
README.md
@@ -2,55 +2,360 @@

 Running [Docling](https://github.com/DS4SD/docling) as an API service.

-> This is an unstable draft implementation which will quickly evolve.
-
-```sh
-# Install poetry if not already available
-curl -sSL https://install.python-poetry.org | python3 -
-```
-
-```sh
-curl -X 'POST' \
-  'http://
-  -H 'accept: application/json' \
-  -H 'Content-Type: application/json' \
-  -d '{
-  "
-  "
-}'
-```
-
-```sh
-#
-pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
-pip install -e .
-# Run the server
-```
+## Usage
+
+The API provides two endpoints: one for URLs and one for files. The file endpoint is necessary to send files directly in binary format instead of as base64-encoded strings.
+
+### Common parameters
+
+On top of the file source (see below), both endpoints support the same parameters, which match the Docling CLI.
+
+- `from_format` (List[str]): Input format(s) to convert from. Allowed values: `docx`, `pptx`, `html`, `image`, `pdf`, `asciidoc`, `md`. Defaults to all formats.
+- `to_format` (List[str]): Output format(s) to convert to. Allowed values: `md`, `json`, `html`, `text`, `doctags`. Defaults to `md`.
+- `do_ocr` (bool): If enabled, the bitmap content will be processed using OCR. Defaults to `True`.
+- `image_export_mode`: Image export mode for the document (only in case of JSON, Markdown or HTML). Allowed values: `embedded`, `placeholder`, `referenced`. Optional, defaults to `embedded`.
+- `force_ocr` (bool): If enabled, replace any existing text with OCR-generated text over the full content. Defaults to `False`.
+- `ocr_engine` (str): OCR engine to use. Allowed values: `easyocr`, `tesseract_cli`, `tesseract`, `rapidocr`, `ocrmac`. Defaults to `easyocr`.
+- `ocr_lang` (List[str]): List of languages used by the OCR engine. Note that each OCR engine has different values for the language names. Defaults to empty.
+- `pdf_backend` (str): PDF backend to use. Allowed values: `pypdfium2`, `dlparse_v1`, `dlparse_v2`. Defaults to `dlparse_v2`.
+- `table_mode` (str): Table mode to use. Allowed values: `fast`, `accurate`. Defaults to `fast`.
+- `abort_on_error` (bool): If enabled, abort on error. Defaults to `False`.
+- `return_as_file` (bool): If enabled, return the output as a file. Defaults to `False`.
+- `do_table_structure` (bool): If enabled, the table structure will be extracted. Defaults to `True`.
+- `include_images` (bool): If enabled, images will be extracted from the document. Defaults to `True`.
+- `images_scale` (float): Scale factor for images. Defaults to `2.0`.
+
+### URL endpoint
+
+The endpoint is `/v1alpha/convert/source`, listening for POST requests with JSON payloads.
+
+On top of the above parameters, you must send the document(s) you want to process with either the `http_sources` or the `file_sources` field.
+The first fetches URL(s) (optionally with extra headers); the second allows you to provide documents as base64-encoded strings.
+The `options` object is not required; its fields can be partially or completely omitted.
+
+Simple payload example:
+
+```json
+{
+  "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
+}
+```
+
+<details>
+
+<summary>Complete payload example:</summary>
+
+```json
+{
+  "options": {
+    "from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
+    "to_formats": ["md", "json", "html", "text", "doctags"],
+    "image_export_mode": "placeholder",
+    "do_ocr": true,
+    "force_ocr": false,
+    "ocr_engine": "easyocr",
+    "ocr_lang": ["en"],
+    "pdf_backend": "dlparse_v2",
+    "table_mode": "fast",
+    "abort_on_error": false,
+    "return_as_file": false
+  },
+  "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
+}
+```
+
+</details>
+
+<details>
+
+<summary>CURL example:</summary>

 ```sh
 curl -X 'POST' \
+  'http://localhost:5001/v1alpha/convert/source' \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \
   -d '{
+  "options": {
+    "from_formats": [
+      "docx",
+      "pptx",
+      "html",
+      "image",
+      "pdf",
+      "asciidoc",
+      "md",
+      "xlsx"
+    ],
+    "to_formats": ["md", "json", "html", "text", "doctags"],
+    "image_export_mode": "placeholder",
+    "do_ocr": true,
+    "force_ocr": false,
+    "ocr_engine": "easyocr",
+    "ocr_lang": [
+      "fr",
+      "de",
+      "es",
+      "en"
+    ],
+    "pdf_backend": "dlparse_v2",
+    "table_mode": "fast",
+    "abort_on_error": false,
+    "return_as_file": false,
+    "do_table_structure": true,
+    "include_images": true,
+    "images_scale": 2
+  },
+  "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
 }'
 ```

+</details>
+
+<details>
+<summary>Python example:</summary>
+
+```python
+import httpx
+
+async_client = httpx.AsyncClient(timeout=60.0)
+url = "http://localhost:5001/v1alpha/convert/source"
+payload = {
+    "options": {
+        "from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
+        "to_formats": ["md", "json", "html", "text", "doctags"],
+        "image_export_mode": "placeholder",
+        "do_ocr": True,
+        "force_ocr": False,
+        "ocr_engine": "easyocr",
+        "ocr_lang": ["en"],
+        "pdf_backend": "dlparse_v2",
+        "table_mode": "fast",
+        "abort_on_error": False,
+        "return_as_file": False,
+    },
+    "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]
+}
+
+response = await async_client.post(url, json=payload)
+
+data = response.json()
+```
+
+</details>
+
+#### File as base64
+
+The `file_sources` argument of the endpoint allows sending files as base64-encoded strings.
+When your PDF or other file type is too large, encoding it and passing it inline to curl
+can lead to an "Argument list too long" error on some systems. To avoid this, write
+the JSON request body to a file and have curl read from that file.
+
+<details>
+<summary>CURL steps:</summary>
+
+```sh
+# 1. Base64-encode the file
+B64_DATA=$(base64 -w 0 /path/to/file/pdf-to-convert.pdf)
+
+# 2. Build the JSON with your options
+cat <<EOF > /tmp/request_body.json
+{
+  "options": {
+  },
+  "file_sources": [{
+    "base64_string": "${B64_DATA}",
+    "filename": "pdf-to-convert.pdf"
+  }]
+}
+EOF
+
+# 3. POST the request to the docling service
+curl -X POST "localhost:5001/v1alpha/convert/source" \
+  -H "Content-Type: application/json" \
+  -d @/tmp/request_body.json
+```
+
+</details>
+
+### File endpoint
+
+The endpoint is `/v1alpha/convert/file`, listening for POST requests with form payloads (necessary as the files are sent as multipart/form-data). You can send one or multiple files.
+
+<details>
+<summary>CURL example:</summary>
+
+```sh
+curl -X 'POST' \
+  'http://127.0.0.1:5001/v1alpha/convert/file' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: multipart/form-data' \
+  -F 'ocr_engine=easyocr' \
+  -F 'pdf_backend=dlparse_v2' \
+  -F 'from_formats=pdf,docx' \
+  -F 'force_ocr=false' \
+  -F 'image_export_mode=embedded' \
+  -F 'ocr_lang=["en"]' \
+  -F 'table_mode=fast' \
+  -F '[email protected];type=application/pdf' \
+  -F 'abort_on_error=false' \
+  -F 'to_formats=md' \
+  -F 'return_as_file=false' \
+  -F 'do_ocr=true'
+```
+
+</details>
+
+<details>
+<summary>Python example:</summary>
+
+```python
+import json
+import os
+
+import httpx
+
+async_client = httpx.AsyncClient(timeout=60.0)
+url = "http://localhost:5001/v1alpha/convert/file"
+parameters = {
+    "from_formats": ["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
+    "to_formats": ["md", "json", "html", "text", "doctags"],
+    "image_export_mode": "placeholder",
+    "do_ocr": True,
+    "force_ocr": False,
+    "ocr_engine": "easyocr",
+    "ocr_lang": ["en"],
+    "pdf_backend": "dlparse_v2",
+    "table_mode": "fast",
+    "abort_on_error": False,
+    "return_as_file": False
+}
+
+current_dir = os.path.dirname(__file__)
+file_path = os.path.join(current_dir, '2206.01062v1.pdf')
+
+files = {
+    'files': ('2206.01062v1.pdf', open(file_path, 'rb'), 'application/pdf'),
+}
+
+response = await async_client.post(url, files=files, data={"parameters": json.dumps(parameters)})
+assert response.status_code == 200, "Response should be 200 OK"
+
+data = response.json()
+```
+
+</details>
+
+### Response format
+
+The response can be a JSON document or a file.
+
+- If you process only one file, the response will be a JSON document with the following format:
+
+```jsonc
+{
+  "document": {
+    "md_content": "",
+    "json_content": {},
+    "html_content": "",
+    "text_content": "",
+    "doctags_content": ""
+  },
+  "status": "<success|partial_success|skipped|failure>",
+  "processing_time": 0.0,
+  "timings": {},
+  "errors": []
+}
+```
+
+Depending on the value you set in `to_formats`, the different items will be populated with their respective results or left empty.
+
+`processing_time` is the Docling processing time in seconds, and `timings` (when enabled in the backend) provides the detailed
+timing of all the internal Docling components.
+
+- If you set the parameter `return_as_file` to `True`, the response will be a zip file.
+- If multiple files are generated (multiple inputs, or one input but multiple outputs with `return_as_file` set to `True`), the response will be a zip file.

+## Helpers
+
+- A full Swagger UI is available at the `/docs` endpoint.
+
+![swagger](img/swagger.png)
+
+- An easy-to-use UI is available at the `/ui` endpoint.
+
+![ui-input](img/ui-input.png)
+
+![ui-output](img/ui-output.png)
+
+## Development
+
+### CPU only

 ```sh
+# Install poetry if not already available
+curl -sSL https://install.python-poetry.org | python3 -
+
+# Install dependencies
+poetry install --with cpu
+```
+
+### Cuda GPU
+
+For GPU support use the following command:
+
+```sh
+# Install dependencies
+poetry install
+```
+
+### Run the server
+
+The [start_server.sh](./start_server.sh) executable is a convenient script for launching the local webserver.
+
+```sh
 # Run the server
+bash start_server.sh
+
+# Run the server with live reload
+RELOAD=true bash start_server.sh
+```
+
+### Environment variables
+
+The following variables are available:
+
+- `TESSDATA_PREFIX`: Tesseract data location, for example `/usr/share/tesseract/tessdata/`.
+- `UVICORN_WORKERS`: Number of workers to use.
+- `RELOAD`: If `True`, enable auto-reload when you modify files, useful for development.
+- `WITH_UI`: If `True`, the Gradio UI will be available at `/ui`.
+
+## Get help and support
+
+Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
+
+## Contributing
+
+Please read [Contributing to Docling Serve](https://github.com/DS4SD/docling-serve/blob/main/CONTRIBUTING.md) for details.
+
+## References
+
+If you use Docling in your projects, please consider citing the following:
+
+```bib
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {Docling Technical Report},
+  url = {https://arxiv.org/abs/2408.09869},
+  eprint = {2408.09869},
+  doi = {10.48550/arXiv.2408.09869},
+  version = {1.0.0},
+  year = {2024}
+}
 ```
+
+## License
+
+The Docling Serve codebase is under MIT license.
+
+## IBM ❤️ Open Source AI
+
+Docling has been brought to you by IBM.
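The README above documents that the response becomes a zip file when `return_as_file` is enabled or multiple outputs are produced, but shows no client code for that path. A minimal sketch of handling both response shapes (the local output filename is an arbitrary choice here, not part of the API):

```python
import httpx

# Request markdown + HTML for one document and ask for a file response,
# which the server returns as application/zip per the README above.
payload = {
    "options": {"to_formats": ["md", "html"], "return_as_file": True},
    "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
}
response = httpx.post(
    "http://localhost:5001/v1alpha/convert/source", json=payload, timeout=300.0
)
response.raise_for_status()

if response.headers.get("content-type", "").startswith("application/zip"):
    with open("converted_outputs.zip", "wb") as f:  # arbitrary local filename
        f.write(response.content)
else:
    # Single JSON document response, as shown in "Response format"
    print(response.json()["document"]["md_content"][:200])
```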
docling_serve/.env.example
@@ -0,0 +1,3 @@
+TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
+UVICORN_WORKERS=2
+RELOAD=True
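These values are picked up at startup: `load_dotenv()` in `docling_serve/app.py` loads a local `.env` if present, and `docling_serve/__main__.py` (next) consumes `UVICORN_WORKERS` and `RELOAD`.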
docling_serve/__main__.py
@@ -0,0 +1,20 @@
+import os
+
+from docling_serve.app import app
+from docling_serve.helper_functions import _str_to_bool
+
+# Launch the FastAPI server
+if __name__ == "__main__":
+    from uvicorn import run
+
+    port = int(os.getenv("PORT", "5001"))
+    workers = int(os.getenv("UVICORN_WORKERS", "1"))
+    reload = _str_to_bool(os.getenv("RELOAD", "False"))
+    run(
+        app,
+        host="0.0.0.0",
+        port=port,
+        workers=workers,
+        timeout_keep_alive=600,
+        reload=reload,
+    )
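Since the uvicorn launch is guarded behind `__main__`, the service starts with `python -m docling_serve`. A sketch of driving that from Python with the environment variables the module reads (the specific values are arbitrary):

```python
import os
import subprocess

# Mirror the container CMD locally; PORT, UVICORN_WORKERS, and RELOAD are
# the variables read by docling_serve/__main__.py above.
env = {**os.environ, "PORT": "5001", "UVICORN_WORKERS": "2", "RELOAD": "False"}

# Blocks until the server process exits.
subprocess.run(["python", "-m", "docling_serve"], env=env, check=True)
```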
docling_serve/app.py
@@ -1,177 +1,83 @@
+import logging
+import os
+import tempfile
 from contextlib import asynccontextmanager
-from enum import Enum
 from io import BytesIO
+from pathlib import Path
+from typing import Annotated, Any, Dict, List, Optional, Union
+
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.document_converter import DocumentConverter
+from dotenv import load_dotenv
+from fastapi import BackgroundTasks, FastAPI, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import RedirectResponse
+from pydantic import BaseModel
+
+from docling_serve.docling_conversion import (
+    ConvertDocumentFileSourcesRequest,
+    ConvertDocumentsOptions,
+    ConvertDocumentsRequest,
+    convert_documents,
+    converters,
+    get_pdf_pipeline_opts,
+)
+from docling_serve.helper_functions import FormDepends, _str_to_bool
+from docling_serve.response_preparation import ConvertDocumentResponse, process_results
-from docling.datamodel.base_models import
-    EasyOcrOptions,
-    OcrOptions,
-    PdfPipelineOptions,
-    RapidOcrOptions,
-    TesseractOcrOptions,
-)
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.utils.profiling import ProfilingItem
-from docling_core.types.doc import DoclingDocument, ImageRefMode
-from docling_core.utils.file import resolve_remote_filename
-from fastapi import FastAPI, HTTPException, Response
-from pydantic import AnyHttpUrl, BaseModel
-
-
-# TODO: import enum from Docling, once it is exposed
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT = "tesseract"
-    RAPIDOCR = "rapidocr"
-
-
-class ConvertOptions(BaseModel):
-    output_docling_document: bool = True
-    output_markdown: bool = False
-    output_html: bool = False
-    do_ocr: bool = True
-    ocr_engine: OcrEngine = OcrEngine.EASYOCR
-    ocr_lang: Optional[List[str]] = None
-    force_ocr: bool = False
-    do_table_structure: bool = True
-    include_images: bool = True
-    images_scale: float = 2.0
-
-
-class DocumentConvertBase(BaseModel):
-    options: ConvertOptions = ConvertOptions()
-
-
-class HttpSource(BaseModel):
-    url: str
-    headers: Dict[str, Any] = {}
-
-
-class FileSource(BaseModel):
-    base64_string: str
-    filename: str
-
-
-class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
-    http_source: HttpSource
-
-
-class DocumentResponse(BaseModel):
-    markdown: Optional[str] = None
-    docling_document: Optional[DoclingDocument] = None
-    html: Optional[str] = None
-    timings: Dict[str, ProfilingItem] = {}
-
-
-class ConvertDocumentErrorResponse(BaseModel):
-    status: ConversionStatus
-    # errors: List[ErrorItem] = []
-
-
-ConvertDocumentRequest = Union[
-    ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
-]
-
-
-class MarkdownTextResponse(Response):
-    media_type = "text/markdown"
-
-
-class HealthCheckResponse(BaseModel):
-    status: str = "ok"
-
-
-def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
-
-    if options.ocr_engine == OcrEngine.EASYOCR:
-        try:
-            import easyocr  # noqa: F401
-        except ImportError:
-            raise HTTPException(
-                status_code=400,
-                detail="The requested OCR engine"
-                f" (ocr_engine={options.ocr_engine.value})"
-                " is not available on this system. Please choose another OCR engine "
-                "or contact your system administrator.",
-            )
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
-    elif options.ocr_engine == OcrEngine.TESSERACT:
-        try:
-            import tesserocr  # noqa: F401
-        except ImportError:
-            raise HTTPException(
-                status_code=400,
-                detail="The requested OCR engine"
-                f" (ocr_engine={options.ocr_engine.value})"
-                " is not available on this system. Please choose another OCR engine "
-                "or contact your system administrator.",
-            )
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
-    elif options.ocr_engine == OcrEngine.RAPIDOCR:
-        try:
-            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
-        except ImportError:
-            raise HTTPException(
-                status_code=400,
-                detail="The requested OCR engine"
-                f" (ocr_engine={options.ocr_engine.value})"
-                " is not available on this system. Please choose another OCR engine "
-                "or contact your system administrator.",
-            )
-        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
-
-    if options.ocr_lang is not None:
-        ocr_options.lang = options.ocr_lang
-
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=options.do_ocr,
-        ocr_options=ocr_options,
-        do_table_structure=options.do_table_structure,
-        generate_page_images=options.include_images,
-        generate_picture_images=options.include_images,
-        images_scale=options.images_scale,
-    )
-
-    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
+
+# Load local env vars if present
+load_dotenv()
+
+WITH_UI = _str_to_bool(os.getenv("WITH_UI", "False"))
+if WITH_UI:
+    import gradio as gr
+
+    from docling_serve.gradio_ui import ui as gradio_ui
+
+
+# Set up custom logging, as it will be intermixed with FastAPI/Uvicorn's logging
+class ColoredLogFormatter(logging.Formatter):
+    COLOR_CODES = {
+        logging.DEBUG: "\033[94m",  # Blue
+        logging.INFO: "\033[92m",  # Green
+        logging.WARNING: "\033[93m",  # Yellow
+        logging.ERROR: "\033[91m",  # Red
+        logging.CRITICAL: "\033[95m",  # Magenta
+    }
+    RESET_CODE = "\033[0m"
+
+    def format(self, record):
+        color = self.COLOR_CODES.get(record.levelno, "")
+        record.levelname = f"{color}{record.levelname}{self.RESET_CODE}"
+        return super().format(record)
+
+
+logging.basicConfig(
+    level=logging.INFO,  # Set the logging level
+    format="%(levelname)s:\t%(asctime)s - %(name)s - %(message)s",
+    datefmt="%H:%M:%S",
+)
+
+# Override the formatter with the custom ColoredLogFormatter
+root_logger = logging.getLogger()  # Get the root logger
+for handler in root_logger.handlers:  # Iterate through existing handlers
+    if handler.formatter:
+        handler.setFormatter(ColoredLogFormatter(handler.formatter._fmt))
+
+_log = logging.getLogger(__name__)
+
+
+# Context manager to initialize and clean up the lifespan of the FastAPI app
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # settings = Settings()

     # Converter with default options
+    pdf_format_option, options_hash = get_pdf_pipeline_opts(ConvertDocumentsOptions())
     converters[options_hash] = DocumentConverter(
         format_options={
-            InputFormat.PDF:
-            InputFormat.IMAGE:
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
         }
     )

@@ -180,100 +86,139 @@ async def lifespan(app: FastAPI):
     yield

     converters.clear()
+    if WITH_UI:
+        gradio_ui.close()


+##################################
+# App creation and configuration #
+##################################
+
 app = FastAPI(
     title="Docling Serve",
     lifespan=lifespan,
 )

+origins = ["*"]
+methods = ["*"]
+headers = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=methods,
+    allow_headers=headers,
+)
+
+# Mount the Gradio app
+if WITH_UI:
+    tmp_output_dir = Path(tempfile.mkdtemp())
+    gradio_ui.gradio_output_dir = tmp_output_dir
+    app = gr.mount_gradio_app(
+        app, gradio_ui, path="/ui", allowed_paths=["./logo.png", tmp_output_dir]
+    )

-    filename: str
-    buf: BytesIO
-
-    buf = BytesIO(http_res.content)
-    filename = resolve_remote_filename(
-        http_url=AnyHttpUrl(body.http_source.url),
-        response_headers=dict(**http_res.headers),
-    )
-    doc_input = DocumentStream(name=filename, stream=buf)
-
-        format_options={
-            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
-            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
-        }
-    )
-
-    result: ConversionResult = converters[options_hash].convert(doc_input)
-
-    if result is None or result.status not in {
-        ConversionStatus.SUCCESS,
-    }:
-        raise HTTPException(
-            status_code=500, detail={"errors": result.errors, "status": result.status}
-        )

+#############################
+# API Endpoints definitions #
+#############################

+# Favicon
+@app.get("/favicon.ico", include_in_schema=False)
+async def favicon():
+    response = RedirectResponse(url="https://ds4sd.github.io/docling/assets/logo.png")
+    return response

+# Status
+class HealthCheckResponse(BaseModel):
+    status: str = "ok"

+@app.get("/health")
+def health() -> HealthCheckResponse:
+    return HealthCheckResponse()

+# API readiness compatibility for OpenShift AI Workbench
+@app.get("/api", include_in_schema=False)
+def api_check() -> HealthCheckResponse:
+    return HealthCheckResponse()


+# Convert a document from URL(s)
 @app.post(
-    "/convert",
+    "/v1alpha/convert/source",
+    response_model=ConvertDocumentResponse,
+    responses={
+        200: {
+            "content": {"application/zip": {}},
+            # "description": "Return the JSON item or an image.",
+        }
+    },
 )
+def process_url(
+    background_tasks: BackgroundTasks, conversion_request: ConvertDocumentsRequest
+):
+    sources: List[Union[str, DocumentStream]] = []
+    headers: Optional[Dict[str, Any]] = None
+    if isinstance(conversion_request, ConvertDocumentFileSourcesRequest):
+        for file_source in conversion_request.file_sources:
+            sources.append(file_source.to_document_stream())
+    else:
+        for http_source in conversion_request.http_sources:
+            sources.append(http_source.url)
+            if headers is None and http_source.headers:
+                headers = http_source.headers
+
+    # Note: results are only an iterator -> lazy evaluation
+    results = convert_documents(
+        sources=sources, options=conversion_request.options, headers=headers
+    )
-    doc_resp.html = result.document.export_to_html(image_mode=image_mode)
-
-    return ConvertDocumentResponse(
-        document=doc_resp, status=result.status, timings=result.timings
-    )
+
+    # The real processing will happen here
+    response = process_results(
+        background_tasks=background_tasks,
+        conversion_options=conversion_request.options,
+        conv_results=results,
+    )
+
+    return response


+# Convert a document from file(s)
+@app.post(
+    "/v1alpha/convert/file",
+    response_model=ConvertDocumentResponse,
+    responses={
+        200: {
+            "content": {"application/zip": {}},
+        }
+    },
+)
+async def process_file(
+    background_tasks: BackgroundTasks,
+    files: List[UploadFile],
+    options: Annotated[ConvertDocumentsOptions, FormDepends(ConvertDocumentsOptions)],
+):
+
+    _log.info(f"Received {len(files)} files for processing.")
+
+    # Load the uploaded files to Docling DocumentStream
+    file_sources = []
+    for file in files:
+        buf = BytesIO(file.file.read())
+        name = file.filename if file.filename else "file.pdf"
+        file_sources.append(DocumentStream(name=name, stream=buf))
+
+    results = convert_documents(sources=file_sources, options=options)
+
+    response = process_results(
+        background_tasks=background_tasks,
+        conversion_options=options,
+        conv_results=results,
+    )
+
+    return response
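A short smoke test of the endpoints defined above, assuming the server is running locally on the default port:

```python
import requests

BASE = "http://localhost:5001"

# /health returns the HealthCheckResponse model defined above
print(requests.get(f"{BASE}/health").json())  # {'status': 'ok'}

# process_url accepts the same JSON payloads shown in the README
payload = {"http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}]}
resp = requests.post(f"{BASE}/v1alpha/convert/source", json=payload, timeout=300)
print(resp.json()["status"])
```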
docling_serve/docling_conversion.py
@@ -0,0 +1,400 @@
+import base64
+import hashlib
+import json
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import (
+    Annotated,
+    Any,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import DocumentStream, InputFormat, OutputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrEngine,
+    OcrOptions,
+    PdfBackend,
+    PdfPipelineOptions,
+    RapidOcrOptions,
+    TableFormerMode,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+from fastapi import HTTPException
+from pydantic import BaseModel, Field
+
+from docling_serve.helper_functions import _to_list_of_strings
+
+_log = logging.getLogger(__name__)
+
+
+# Define the input options for the API
+class ConvertDocumentsOptions(BaseModel):
+    from_formats: Annotated[
+        List[InputFormat],
+        Field(
+            description=(
+                "Input format(s) to convert from. String or list of strings. "
+                f"Allowed values: {', '.join([v.value for v in InputFormat])}. "
+                "Optional, defaults to all formats."
+            ),
+            examples=[[v.value for v in InputFormat]],
+        ),
+    ] = [v for v in InputFormat]
+
+    to_formats: Annotated[
+        List[OutputFormat],
+        Field(
+            description=(
+                "Output format(s) to convert to. String or list of strings. "
+                f"Allowed values: {', '.join([v.value for v in OutputFormat])}. "
+                "Optional, defaults to Markdown."
+            ),
+            examples=[[OutputFormat.MARKDOWN]],
+        ),
+    ] = [OutputFormat.MARKDOWN]
+
+    image_export_mode: Annotated[
+        ImageRefMode,
+        Field(
+            description=(
+                "Image export mode for the document (in case of JSON,"
+                " Markdown or HTML). "
+                f"Allowed values: {', '.join([v.value for v in ImageRefMode])}. "
+                "Optional, defaults to Embedded."
+            ),
+            examples=[ImageRefMode.EMBEDDED.value],
+            # pattern="embedded|placeholder|referenced",
+        ),
+    ] = ImageRefMode.EMBEDDED
+
+    do_ocr: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, the bitmap content will be processed using OCR. "
+                "Boolean. Optional, defaults to true"
+            ),
+            # examples=[True],
+        ),
+    ] = True
+
+    force_ocr: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, replace existing text with OCR-generated "
+                "text over content. Boolean. Optional, defaults to false."
+            ),
+            # examples=[False],
+        ),
+    ] = False
+
+    # TODO: use a restricted list based on what is installed on the system
+    ocr_engine: Annotated[
+        OcrEngine,
+        Field(
+            description=(
+                "The OCR engine to use. String. "
+                "Allowed values: easyocr, tesseract, rapidocr. "
+                "Optional, defaults to easyocr."
+            ),
+            examples=[OcrEngine.EASYOCR],
+        ),
+    ] = OcrEngine.EASYOCR
+
+    ocr_lang: Annotated[
+        Optional[List[str]],
+        Field(
+            description=(
+                "List of languages used by the OCR engine. "
+                "Note that each OCR engine has "
+                "different values for the language names. String or list of strings. "
+                "Optional, defaults to empty."
+            ),
+            examples=[["fr", "de", "es", "en"]],
+        ),
+    ] = None
+
+    pdf_backend: Annotated[
+        PdfBackend,
+        Field(
+            description=(
+                "The PDF backend to use. String. "
+                f"Allowed values: {', '.join([v.value for v in PdfBackend])}. "
+                f"Optional, defaults to {PdfBackend.DLPARSE_V2.value}."
+            ),
+            examples=[PdfBackend.DLPARSE_V2],
+        ),
+    ] = PdfBackend.DLPARSE_V2
+
+    table_mode: Annotated[
+        TableFormerMode,
+        Field(
+            TableFormerMode.FAST,
+            description=(
+                "Mode to use for table structure. String. "
+                f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
+                "Optional, defaults to fast."
+            ),
+            examples=[TableFormerMode.FAST],
+            # pattern="fast|accurate",
+        ),
+    ] = TableFormerMode.FAST
+
+    abort_on_error: Annotated[
+        bool,
+        Field(
+            description=(
+                "Abort on error if enabled. Boolean. Optional, defaults to false."
+            ),
+            # examples=[False],
+        ),
+    ] = False
+
+    return_as_file: Annotated[
+        bool,
+        Field(
+            description=(
+                "Return the output as a zip file "
+                "(will happen anyway if multiple files are generated). "
+                "Boolean. Optional, defaults to false."
+            ),
+            examples=[False],
+        ),
+    ] = False
+
+    do_table_structure: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, the table structure will be extracted. "
+                "Boolean. Optional, defaults to true."
+            ),
+            examples=[True],
+        ),
+    ] = True
+
+    include_images: Annotated[
+        bool,
+        Field(
+            description=(
+                "If enabled, images will be extracted from the document. "
+                "Boolean. Optional, defaults to true."
+            ),
+            examples=[True],
+        ),
+    ] = True
+
+    images_scale: Annotated[
+        float,
+        Field(
+            description="Scale factor for images. Float. Optional, defaults to 2.0.",
+            examples=[2.0],
+        ),
+    ] = 2.0
+
+
+class DocumentsConvertBase(BaseModel):
+    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
+
+
+class HttpSource(BaseModel):
+    url: Annotated[
+        str,
+        Field(
+            description="HTTP url to process",
+            examples=["https://arxiv.org/pdf/2206.01062"],
+        ),
+    ]
+    headers: Annotated[
+        Dict[str, Any],
+        Field(
+            description="Additional headers used to fetch the urls, "
+            "e.g. authorization, agent, etc"
+        ),
+    ] = {}
+
+
+class FileSource(BaseModel):
+    base64_string: Annotated[
+        str,
+        Field(
+            description="Content of the file serialized in base64. "
+            "For example it can be obtained via "
+            "`base64 -w 0 /path/to/file/pdf-to-convert.pdf`."
+        ),
+    ]
+    filename: Annotated[
+        str,
+        Field(description="Filename of the uploaded document", examples=["file.pdf"]),
+    ]
+
+    def to_document_stream(self) -> DocumentStream:
+        buf = BytesIO(base64.b64decode(self.base64_string))
+        return DocumentStream(stream=buf, name=self.filename)
+
+
+class ConvertDocumentHttpSourcesRequest(DocumentsConvertBase):
+    http_sources: List[HttpSource]
+
+
+class ConvertDocumentFileSourcesRequest(DocumentsConvertBase):
+    file_sources: List[FileSource]
+
+
+ConvertDocumentsRequest = Union[
+    ConvertDocumentFileSourcesRequest, ConvertDocumentHttpSourcesRequest
+]
+
+
+# Document converters will be preloaded and stored in a dictionary
+converters: Dict[str, DocumentConverter] = {}
+
+
+# Custom serializer for PdfFormatOption
+# (model_dump_json does not work with some classes)
+def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
+    data = pdf_format_option.model_dump()
+
+    # pipeline_options are not fully serialized by model_dump, dedicated pass
+    if pdf_format_option.pipeline_options:
+        data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump()
+
+    # Replace `pipeline_cls` with a string representation
+    data["pipeline_cls"] = repr(data["pipeline_cls"])
+
+    # Replace `backend` with a string representation
+    data["backend"] = repr(data["backend"])
+
+    # Handle `device` in `accelerator_options`
+    if "accelerator_options" in data and "device" in data["accelerator_options"]:
+        data["accelerator_options"]["device"] = repr(
+            data["accelerator_options"]["device"]
+        )
+
+    # Serialize the dictionary to JSON with sorted keys to have consistent hashes
+    return json.dumps(data, sort_keys=True)
+
+
+# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
+def get_pdf_pipeline_opts(
+    request: ConvertDocumentsOptions,
+) -> Tuple[PdfFormatOption, str]:
+
+    if request.ocr_engine == OcrEngine.EASYOCR:
+        try:
+            import easyocr  # noqa: F401
+        except ImportError:
+            raise HTTPException(
+                status_code=400,
+                detail="The requested OCR engine"
+                f" (ocr_engine={request.ocr_engine.value})"
+                " is not available on this system. Please choose another OCR engine "
+                "or contact your system administrator.",
+            )
+        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr)
+    elif request.ocr_engine == OcrEngine.TESSERACT:
+        try:
+            import tesserocr  # noqa: F401
+        except ImportError:
+            raise HTTPException(
+                status_code=400,
+                detail="The requested OCR engine"
+                f" (ocr_engine={request.ocr_engine.value})"
+                " is not available on this system. Please choose another OCR engine "
+                "or contact your system administrator.",
+            )
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr)
+    elif request.ocr_engine == OcrEngine.RAPIDOCR:
+        try:
+            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
+        except ImportError:
+            raise HTTPException(
+                status_code=400,
+                detail="The requested OCR engine"
+                f" (ocr_engine={request.ocr_engine.value})"
+                " is not available on this system. Please choose another OCR engine "
+                "or contact your system administrator.",
+            )
+        ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr)
+    else:
+        raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}")
+
+    if request.ocr_lang is not None:
+        if isinstance(request.ocr_lang, str):
+            ocr_options.lang = _to_list_of_strings(request.ocr_lang)
+        else:
+            ocr_options.lang = request.ocr_lang
+
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=request.do_ocr,
+        ocr_options=ocr_options,
+        do_table_structure=request.do_table_structure,
+    )
+    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+    pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)
+
+    if request.image_export_mode != ImageRefMode.PLACEHOLDER:
+        pipeline_options.generate_page_images = True
+        if request.images_scale:
+            pipeline_options.images_scale = request.images_scale
+
+    if request.pdf_backend == PdfBackend.DLPARSE_V1:
+        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+    elif request.pdf_backend == PdfBackend.DLPARSE_V2:
+        backend = DoclingParseV2DocumentBackend
+    elif request.pdf_backend == PdfBackend.PYPDFIUM2:
+        backend = PyPdfiumDocumentBackend
+    else:
+        raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")
+
+    pdf_format_option = PdfFormatOption(
+        pipeline_options=pipeline_options,
+        backend=backend,
+    )
+
+    serialized_data = _serialize_pdf_format_option(pdf_format_option)
+
+    options_hash = hashlib.sha1(serialized_data.encode()).hexdigest()
+
+    return pdf_format_option, options_hash
+
+
+def convert_documents(
+    sources: Iterable[Union[Path, str, DocumentStream]],
+    options: ConvertDocumentsOptions,
+    headers: Optional[Dict[str, Any]] = None,
+):
+    pdf_format_option, options_hash = get_pdf_pipeline_opts(options)
+
+    if options_hash not in converters:
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+        converters[options_hash] = DocumentConverter(format_options=format_options)
+        _log.info(f"We now have {len(converters)} converters in memory.")
+
+    results: Iterator[ConversionResult] = converters[options_hash].convert_all(
+        sources,
+        headers=headers,
+    )
+
+    return results
import importlib
import json
import logging
import os
import tempfile
from pathlib import Path

import gradio as gr
import requests

from docling_serve.helper_functions import _to_list_of_strings

logger = logging.getLogger(__name__)

#################
# CSS and theme #
#################

css = """
#logo {
    border-style: none;
    background: none;
    box-shadow: none;
    min-width: 80px;
}
#dark_mode_column {
    display: flex;
    align-content: flex-end;
}
#title {
    text-align: left;
    display: block;
    height: auto;
    padding-top: 5px;
    line-height: 0;
}
.title-text h1 > p, .title-text p {
    margin-top: 0px !important;
    margin-bottom: 2px !important;
}
#custom-container {
    border: 0.909091px solid;
    padding: 10px;
    border-radius: 4px;
}
#custom-container h4 {
    font-size: 14px;
}
#file_input_zone {
    height: 140px;
}
"""

theme = gr.themes.Default(
    text_size="md",
    spacing_size="md",
    font=[
        gr.themes.GoogleFont("Red Hat Display"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
    font_mono=[
        gr.themes.GoogleFont("Red Hat Mono"),
        "ui-monospace",
        "Consolas",
        "monospace",
    ],
)

#############
# Variables #
#############

gradio_output_dir = None  # Will be set by FastAPI when mounted
file_output_path = None  # Will be set when a new file is generated

#############
# Functions #
#############


def health_check():
    response = requests.get(f"http://localhost:{int(os.getenv('PORT', '5001'))}/health")
    if response.status_code == 200:
        return "Healthy"
    return "Unhealthy"


def set_options_visibility(x):
    return gr.Accordion("Options", open=x)


def set_outputs_visibility_direct(x, y):
    content = gr.Row(visible=x)
    file = gr.Row(visible=y)
    return content, file


def set_outputs_visibility_process(x):
    content = gr.Row(visible=not x)
    file = gr.Row(visible=x)
    return content, file


def set_download_button_label(label_text: gr.State):
    return gr.DownloadButton(label=str(label_text), scale=1)


def clear_outputs():
    markdown_content = ""
    json_content = ""
    html_content = ""
    text_content = ""
    doctags_content = ""

    return (
        markdown_content,
        markdown_content,
        json_content,
        html_content,
        html_content,
        text_content,
        doctags_content,
    )


def clear_url_input():
    return ""


def clear_file_input():
    return None


def auto_set_return_as_file(url_input, file_input, image_export_mode):
    # If more than one input source is provided, return as file
    if (
        (len(url_input.split(",")) > 1)
        or (file_input and len(file_input) > 1)
        or (image_export_mode == "referenced")
    ):
        return True
    else:
        return False


def change_ocr_lang(ocr_engine):
    if ocr_engine == "easyocr":
        return "en,fr,de,es"
    elif ocr_engine == "tesseract_cli":
        return "eng,fra,deu,spa"
    elif ocr_engine == "tesseract":
        return "eng,fra,deu,spa"
    elif ocr_engine == "rapidocr":
        return "english,chinese"


def process_url(
    input_sources,
    to_formats,
    image_export_mode,
    ocr,
    force_ocr,
    ocr_engine,
    ocr_lang,
    pdf_backend,
    table_mode,
    abort_on_error,
    return_as_file,
):
    parameters = {
        "http_sources": [{"url": source} for source in input_sources.split(",")],
        "options": {
            "to_formats": to_formats,
            "image_export_mode": image_export_mode,
            "ocr": ocr,
            "force_ocr": force_ocr,
            "ocr_engine": ocr_engine,
            "ocr_lang": _to_list_of_strings(ocr_lang),
            "pdf_backend": pdf_backend,
            "table_mode": table_mode,
            "abort_on_error": abort_on_error,
            "return_as_file": return_as_file,
        },
    }
    if (
        not parameters["http_sources"]
        or len(parameters["http_sources"]) == 0
        or parameters["http_sources"][0]["url"] == ""
    ):
        logger.error("No input sources provided.")
        raise gr.Error("No input sources provided.", print_exception=False)
    try:
        response = requests.post(
            f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/source",
            json=parameters,
        )
    except Exception as e:
        logger.error(f"Error processing URL: {e}")
        raise gr.Error(f"Error processing URL: {e}", print_exception=False)
    if response.status_code != 200:
        data = response.json()
        error_message = data.get("detail", "An unknown error occurred.")
        logger.error(f"Error processing file: {error_message}")
        raise gr.Error(f"Error processing file: {error_message}", print_exception=False)
    output = response_to_output(response, return_as_file)
    return output


def process_file(
    files,
    to_formats,
    image_export_mode,
    ocr,
    force_ocr,
    ocr_engine,
    ocr_lang,
    pdf_backend,
    table_mode,
    abort_on_error,
    return_as_file,
):
    if not files or len(files) == 0 or files[0] == "":
        logger.error("No files provided.")
        raise gr.Error("No files provided.", print_exception=False)
    files_data = [("files", (file.name, open(file.name, "rb"))) for file in files]

    parameters = {
        "to_formats": to_formats,
        "image_export_mode": image_export_mode,
        "ocr": str(ocr).lower(),
        "force_ocr": str(force_ocr).lower(),
        "ocr_engine": ocr_engine,
        "ocr_lang": _to_list_of_strings(ocr_lang),
        "pdf_backend": pdf_backend,
        "table_mode": table_mode,
        "abort_on_error": str(abort_on_error).lower(),
        "return_as_file": str(return_as_file).lower(),
    }

    try:
        response = requests.post(
            f"http://localhost:{int(os.getenv('PORT', '5001'))}/v1alpha/convert/file",
            files=files_data,
            data=parameters,
        )
    except Exception as e:
        logger.error(f"Error processing file(s): {e}")
        raise gr.Error(f"Error processing file(s): {e}", print_exception=False)
    if response.status_code != 200:
        data = response.json()
        error_message = data.get("detail", "An unknown error occurred.")
        logger.error(f"Error processing file: {error_message}")
        raise gr.Error(f"Error processing file: {error_message}", print_exception=False)
    output = response_to_output(response, return_as_file)
    return output


def response_to_output(response, return_as_file):
    markdown_content = ""
    json_content = ""
    html_content = ""
    text_content = ""
    doctags_content = ""
    download_button = gr.DownloadButton(visible=False, label="Download Output", scale=1)
    if return_as_file:
        filename = (
            response.headers.get("Content-Disposition").split("filename=")[1].strip('"')
        )
        tmp_output_dir = Path(tempfile.mkdtemp(dir=gradio_output_dir, prefix="ui_"))
        file_output_path = f"{tmp_output_dir}/{filename}"
        # logger.info(f"Saving file to: {file_output_path}")
        with open(file_output_path, "wb") as f:
            f.write(response.content)
        download_button = gr.DownloadButton(
            visible=True, label=f"Download {filename}", scale=1, value=file_output_path
        )
    else:
        full_content = response.json()
        markdown_content = full_content.get("document").get("md_content")
        json_content = json.dumps(
            full_content.get("document").get("json_content"), indent=2
        )
        html_content = full_content.get("document").get("html_content")
        text_content = full_content.get("document").get("text_content")
        doctags_content = full_content.get("document").get("doctags_content")
    return (
        markdown_content,
        markdown_content,
        json_content,
        html_content,
        html_content,
        text_content,
        doctags_content,
        download_button,
    )


############
# UI Setup #
############

with gr.Blocks(
    css=css,
    theme=theme,
    title="Docling Serve",
    delete_cache=(3600, 3600),  # Delete all files older than 1 hour every hour
) as ui:

    # Constants stored in states to be able to pass them as inputs to functions
    processing_text = gr.State("Processing your document(s), please wait...")
    true_bool = gr.State(True)
    false_bool = gr.State(False)

    # Banner
    with gr.Row(elem_id="check_health"):
        # Logo
        with gr.Column(scale=1, min_width=90):
            gr.Image(
                "https://ds4sd.github.io/docling/assets/logo.png",
                height=80,
                width=80,
                show_download_button=False,
                show_label=False,
                show_fullscreen_button=False,
                container=False,
                elem_id="logo",
                scale=0,
            )
        # Title
        with gr.Column(scale=1, min_width=200):
            gr.Markdown(
                f"# Docling Serve \n(docling version: "
                f"{importlib.metadata.version('docling')})",
                elem_id="title",
                elem_classes=["title-text"],
            )
        # Dark mode button
        with gr.Column(scale=16, elem_id="dark_mode_column"):
            dark_mode_btn = gr.Button("Dark/Light Mode", scale=0)
            dark_mode_btn.click(
                None,
                None,
                None,
                js="""() => {
                    if (document.querySelectorAll('.dark').length) {
                        document.querySelectorAll('.dark').forEach(
                            el => el.classList.remove('dark')
                        );
                    } else {
                        document.querySelector('body').classList.add('dark');
                    }
                }""",
                show_api=False,
            )

    # URL Processing Tab
    with gr.Tab("Convert URL(s)"):
        with gr.Row():
            with gr.Column(scale=4):
                url_input = gr.Textbox(
                    label="Input Sources (comma-separated URLs)",
                    placeholder="https://arxiv.org/pdf/2206.01062",
                )
            with gr.Column(scale=1):
                url_process_btn = gr.Button("Process URL(s)", scale=1)
                url_reset_btn = gr.Button("Reset", scale=1)

    # File Processing Tab
    with gr.Tab("Convert File(s)"):
        with gr.Row():
            with gr.Column(scale=4):
                file_input = gr.File(
                    elem_id="file_input_zone",
                    label="Upload Files",
                    file_types=[
                        ".pdf",
                        ".docx",
                        ".pptx",
                        ".html",
                        ".xlsx",
                        ".asciidoc",
                        ".txt",
                        ".md",
                        ".jpg",
                        ".jpeg",
                        ".png",
                        ".gif",
                    ],
                    file_count="multiple",
                    scale=4,
                )
            with gr.Column(scale=1):
                file_process_btn = gr.Button("Process File(s)", scale=1)
                file_reset_btn = gr.Button("Reset", scale=1)

    # Options
    with gr.Accordion("Options") as options:
        with gr.Row():
            with gr.Column(scale=1):
                to_formats = gr.CheckboxGroup(
                    [
                        ("Markdown", "md"),
                        ("Docling (JSON)", "json"),
                        ("HTML", "html"),
                        ("Plain Text", "text"),
                        ("Doc Tags", "doctags"),
                    ],
                    label="To Formats",
                    value=["md"],
                )
            with gr.Column(scale=1):
                image_export_mode = gr.Radio(
                    [
                        ("Embedded", "embedded"),
                        ("Placeholder", "placeholder"),
                        ("Referenced", "referenced"),
                    ],
                    label="Image Export Mode",
                    value="embedded",
                )
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
                ocr = gr.Checkbox(label="Enable OCR", value=True)
                force_ocr = gr.Checkbox(label="Force OCR", value=False)
            with gr.Column(scale=1):
                ocr_engine = gr.Radio(
                    [
                        ("EasyOCR", "easyocr"),
                        ("Tesseract", "tesseract"),
                        ("RapidOCR", "rapidocr"),
                    ],
                    label="OCR Engine",
                    value="easyocr",
                )
            with gr.Column(scale=1, min_width=200):
                ocr_lang = gr.Textbox(
                    label="OCR Language (beware of the format)", value="en,fr,de,es"
                )
        ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
        with gr.Row():
            with gr.Column(scale=2):
                pdf_backend = gr.Radio(
                    ["pypdfium2", "dlparse_v1", "dlparse_v2"],
                    label="PDF Backend",
                    value="dlparse_v2",
                )
            with gr.Column(scale=2):
                table_mode = gr.Radio(
                    ["fast", "accurate"], label="Table Mode", value="fast"
                )
            with gr.Column(scale=1):
                abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
                return_as_file = gr.Checkbox(label="Return as File", value=False)

    # Document output
    with gr.Row(visible=False) as content_output:
        with gr.Tab("Markdown"):
            output_markdown = gr.Code(
                language="markdown", wrap_lines=True, show_label=False
            )
        with gr.Tab("Markdown-Rendered"):
            output_markdown_rendered = gr.Markdown(label="Response")
        with gr.Tab("Docling (JSON)"):
            output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
        with gr.Tab("HTML"):
            output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
        with gr.Tab("HTML-Rendered"):
            output_html_rendered = gr.HTML(label="Response")
        with gr.Tab("Text"):
            output_text = gr.Code(wrap_lines=True, show_label=False)
        with gr.Tab("DocTags"):
            output_doctags = gr.Code(wrap_lines=True, show_label=False)

    # File download output
    with gr.Row(visible=False) as file_output:
        download_file_btn = gr.DownloadButton(label="Placeholder", scale=1)

    ##############
    # UI Actions #
    ##############

    # Handle Return as File
    url_input.change(
        auto_set_return_as_file,
        inputs=[url_input, file_input, image_export_mode],
        outputs=[return_as_file],
    )
    file_input.change(
        auto_set_return_as_file,
        inputs=[url_input, file_input, image_export_mode],
        outputs=[return_as_file],
    )
    image_export_mode.change(
        auto_set_return_as_file,
        inputs=[url_input, file_input, image_export_mode],
        outputs=[return_as_file],
    )

    # URL processing
    url_process_btn.click(
        set_options_visibility, inputs=[false_bool], outputs=[options]
    ).then(
        set_download_button_label, inputs=[processing_text], outputs=[download_file_btn]
    ).then(
        set_outputs_visibility_process,
        inputs=[return_as_file],
        outputs=[content_output, file_output],
    ).then(
        clear_outputs,
        inputs=None,
        outputs=[
            output_markdown,
            output_markdown_rendered,
            output_json,
            output_html,
            output_html_rendered,
            output_text,
            output_doctags,
        ],
    ).then(
        process_url,
        inputs=[
            url_input,
            to_formats,
            image_export_mode,
            ocr,
            force_ocr,
            ocr_engine,
            ocr_lang,
            pdf_backend,
            table_mode,
            abort_on_error,
            return_as_file,
        ],
        outputs=[
            output_markdown,
            output_markdown_rendered,
            output_json,
            output_html,
            output_html_rendered,
            output_text,
            output_doctags,
            download_file_btn,
        ],
    )

    url_reset_btn.click(
        clear_outputs,
        inputs=None,
        outputs=[
            output_markdown,
            output_markdown_rendered,
            output_json,
            output_html,
            output_html_rendered,
            output_text,
            output_doctags,
        ],
    ).then(set_options_visibility, inputs=[true_bool], outputs=[options]).then(
        set_outputs_visibility_direct,
        inputs=[false_bool, false_bool],
        outputs=[content_output, file_output],
    ).then(
        clear_url_input, inputs=None, outputs=[url_input]
    )

    # File processing
    file_process_btn.click(
        set_options_visibility, inputs=[false_bool], outputs=[options]
    ).then(
        set_download_button_label, inputs=[processing_text], outputs=[download_file_btn]
    ).then(
        set_outputs_visibility_process,
        inputs=[return_as_file],
        outputs=[content_output, file_output],
    ).then(
        clear_outputs,
        inputs=None,
        outputs=[
            output_markdown,
            output_markdown_rendered,
            output_json,
            output_html,
            output_html_rendered,
            output_text,
            output_doctags,
        ],
    ).then(
        process_file,
        inputs=[
            file_input,
            to_formats,
            image_export_mode,
            ocr,
            force_ocr,
            ocr_engine,
            ocr_lang,
            pdf_backend,
            table_mode,
            abort_on_error,
            return_as_file,
        ],
        outputs=[
            output_markdown,
            output_markdown_rendered,
            output_json,
            output_html,
            output_html_rendered,
            output_text,
            output_doctags,
            download_file_btn,
        ],
    )

    file_reset_btn.click(
        clear_outputs,
        inputs=None,
        outputs=[
            output_markdown,
            output_markdown_rendered,
            output_json,
            output_html,
            output_html_rendered,
            output_text,
            output_doctags,
        ],
    ).then(set_options_visibility, inputs=[true_bool], outputs=[options]).then(
        set_outputs_visibility_direct,
        inputs=[false_bool, false_bool],
        outputs=[content_output, file_output],
    ).then(
        clear_file_input, inputs=None, outputs=[file_input]
    )
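The UI talks to the API over plain HTTP, so the same endpoint that `process_url` targets can be exercised without Gradio. A minimal sketch, assuming a server listening on localhost:5001 and the payload shape the UI code above builds:

import requests

# Mirrors the request process_url constructs; only Markdown output is requested.
payload = {
    "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
    "options": {"to_formats": ["md"], "image_export_mode": "embedded"},
}
response = requests.post(
    "http://localhost:5001/v1alpha/convert/source", json=payload
)
response.raise_for_status()
print(response.json()["document"]["md_content"][:200])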
docling_serve/helper_functions.py (new file) @@ -0,0 +1,62 @@
import inspect
import re
from typing import List, Type, Union

from fastapi import Depends, Form
from pydantic import BaseModel


# Adapted from
# https://github.com/fastapi/fastapi/discussions/8971#discussioncomment-7892972
def FormDepends(cls: Type[BaseModel]):
    new_parameters = []

    for field_name, model_field in cls.model_fields.items():
        new_parameters.append(
            inspect.Parameter(
                name=field_name,
                kind=inspect.Parameter.POSITIONAL_ONLY,
                default=(
                    Form(...)
                    if model_field.is_required()
                    else Form(model_field.default)
                ),
                annotation=model_field.annotation,
            )
        )

    async def as_form_func(**data):
        return cls(**data)

    sig = inspect.signature(as_form_func)
    sig = sig.replace(parameters=new_parameters)
    as_form_func.__signature__ = sig  # type: ignore
    return Depends(as_form_func)


def _to_list_of_strings(input_value: Union[str, List[str]]) -> List[str]:
    def split_and_strip(value: str) -> List[str]:
        if re.search(r"[;,]", value):
            return [item.strip() for item in re.split(r"[;,]", value)]
        else:
            return [value.strip()]

    if isinstance(input_value, str):
        return split_and_strip(input_value)
    elif isinstance(input_value, list):
        result = []
        for item in input_value:
            result.extend(split_and_strip(str(item)))
        return result
    else:
        raise ValueError("Invalid input: must be a string or a list of strings.")


# Helper functions to parse inputs coming as Form objects
def _str_to_bool(value: Union[str, bool]) -> bool:
    if isinstance(value, bool):
        return value  # Already a boolean, return as-is
    if isinstance(value, str):
        value = value.strip().lower()  # Normalize input
        return value in ("true", "1", "yes")
    return False  # Default to False if none of the above matches
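Illustrative expectations for the two parsing helpers above, derived directly from their code; the asserts below are examples, not tests shipped in this change:

from docling_serve.helper_functions import _str_to_bool, _to_list_of_strings

# Splitting happens on both commas and semicolons, with whitespace stripped.
assert _to_list_of_strings("en,fr; de") == ["en", "fr", "de"]
assert _to_list_of_strings(["en", "fr,de"]) == ["en", "fr", "de"]

# Booleans arriving as form strings are normalized case-insensitively.
assert _str_to_bool("True") is True
assert _str_to_bool("0") is False

`FormDepends` rebuilds the dependency's signature so FastAPI treats each Pydantic field as an individual form field. A hedged sketch of wiring it into an endpoint; the model and route names here are hypothetical:

from fastapi import FastAPI
from pydantic import BaseModel

from docling_serve.helper_functions import FormDepends


class ExampleOptions(BaseModel):  # hypothetical model for illustration
    ocr: bool = True
    table_mode: str = "fast"


app = FastAPI()


@app.post("/example")
async def example(options: ExampleOptions = FormDepends(ExampleOptions)):
    # Each model field arrives as a separate multipart form field.
    return options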
docling_serve/response_preparation.py (new file) @@ -0,0 +1,248 @@
import logging
import os
import shutil
import tempfile
import time
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Union

from docling.datamodel.base_models import OutputFormat
from docling.datamodel.document import ConversionResult, ConversionStatus, ErrorItem
from docling.utils.profiling import ProfilingItem
from docling_core.types.doc import DoclingDocument, ImageRefMode
from fastapi import BackgroundTasks, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel

from docling_serve.docling_conversion import ConvertDocumentsOptions

_log = logging.getLogger(__name__)


class DocumentResponse(BaseModel):
    filename: str
    md_content: Optional[str] = None
    json_content: Optional[DoclingDocument] = None
    html_content: Optional[str] = None
    text_content: Optional[str] = None
    doctags_content: Optional[str] = None


class ConvertDocumentResponse(BaseModel):
    document: DocumentResponse
    status: ConversionStatus
    errors: List[ErrorItem] = []
    processing_time: float
    timings: Dict[str, ProfilingItem] = {}


class ConvertDocumentErrorResponse(BaseModel):
    status: ConversionStatus


def _export_document_as_content(
    conv_res: ConversionResult,
    export_json: bool,
    export_html: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    image_mode: ImageRefMode,
):

    document = DocumentResponse(filename=conv_res.input.file.name)

    if conv_res.status == ConversionStatus.SUCCESS:
        new_doc = conv_res.document._make_copy_with_refmode(Path(), image_mode)

        # Create the different formats
        if export_json:
            document.json_content = new_doc
        if export_html:
            document.html_content = new_doc.export_to_html(image_mode=image_mode)
        if export_txt:
            document.text_content = new_doc.export_to_markdown(
                strict_text=True, image_mode=image_mode
            )
        if export_md:
            document.md_content = new_doc.export_to_markdown(image_mode=image_mode)
        if export_doctags:
            document.doctags_content = new_doc.export_to_document_tokens()
    elif conv_res.status == ConversionStatus.SKIPPED:
        raise HTTPException(status_code=400, detail=conv_res.errors)
    else:
        raise HTTPException(status_code=500, detail=conv_res.errors)

    return document


def _export_documents_as_files(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_html: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    image_export_mode: ImageRefMode,
):

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export JSON format:
            if export_json:
                fname = output_dir / f"{doc_filename}.json"
                _log.info(f"writing JSON output to {fname}")
                conv_res.document.save_as_json(
                    filename=fname, image_mode=image_export_mode
                )

            # Export HTML format:
            if export_html:
                fname = output_dir / f"{doc_filename}.html"
                _log.info(f"writing HTML output to {fname}")
                conv_res.document.save_as_html(
                    filename=fname, image_mode=image_export_mode
                )

            # Export Text format:
            if export_txt:
                fname = output_dir / f"{doc_filename}.txt"
                _log.info(f"writing TXT output to {fname}")
                conv_res.document.save_as_markdown(
                    filename=fname,
                    strict_text=True,
                    image_mode=ImageRefMode.PLACEHOLDER,
                )

            # Export Markdown format:
            if export_md:
                fname = output_dir / f"{doc_filename}.md"
                _log.info(f"writing Markdown output to {fname}")
                conv_res.document.save_as_markdown(
                    filename=fname, image_mode=image_export_mode
                )

            # Export Document Tags format:
            if export_doctags:
                fname = output_dir / f"{doc_filename}.doctags"
                _log.info(f"writing Doc Tags output to {fname}")
                conv_res.document.save_as_document_tokens(filename=fname)

        else:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, "
        f"of which {failure_count} failed"
    )


def process_results(
    background_tasks: BackgroundTasks,
    conversion_options: ConvertDocumentsOptions,
    conv_results: Iterable[ConversionResult],
) -> Union[ConvertDocumentResponse, FileResponse]:

    # Let's start by processing the documents
    try:
        start_time = time.monotonic()

        # Convert the iterator to a list to count the number of results and get timings
        # As it's an iterator (lazy evaluation), it will also start the conversion
        conv_results = list(conv_results)

        processing_time = time.monotonic() - start_time

        _log.info(
            f"Processed {len(conv_results)} docs in {processing_time:.2f} seconds."
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    if len(conv_results) == 0:
        raise HTTPException(
            status_code=500, detail="No documents were generated by Docling."
        )

    # We have some results, let's prepare the response
    response: Union[FileResponse, ConvertDocumentResponse]

    # Booleans to know what to export
    export_json = OutputFormat.JSON in conversion_options.to_formats
    export_html = OutputFormat.HTML in conversion_options.to_formats
    export_md = OutputFormat.MARKDOWN in conversion_options.to_formats
    export_txt = OutputFormat.TEXT in conversion_options.to_formats
    export_doctags = OutputFormat.DOCTAGS in conversion_options.to_formats

    # Only 1 document was processed, and we are not returning it as a file
    if len(conv_results) == 1 and not conversion_options.return_as_file:
        conv_res = conv_results[0]
        document = _export_document_as_content(
            conv_res,
            export_json=export_json,
            export_html=export_html,
            export_md=export_md,
            export_txt=export_txt,
            export_doctags=export_doctags,
            image_mode=conversion_options.image_export_mode,
        )

        response = ConvertDocumentResponse(
            document=document,
            status=conv_res.status,
            processing_time=processing_time,
            timings=conv_res.timings,
        )

    # Multiple documents were processed, or we are forced returning as a file
    else:
        # Temporary directory to store the outputs
        work_dir = Path(tempfile.mkdtemp(prefix="docling_"))
        output_dir = work_dir / "output"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Worker pid to use in archive identification as we may have multiple workers
        os.getpid()

        # Export the documents
        _export_documents_as_files(
            conv_results=conv_results,
            output_dir=output_dir,
            export_json=export_json,
            export_html=export_html,
            export_md=export_md,
            export_txt=export_txt,
            export_doctags=export_doctags,
            image_export_mode=conversion_options.image_export_mode,
        )

        files = os.listdir(output_dir)

        if len(files) == 0:
            raise HTTPException(status_code=500, detail="No documents were exported.")

        file_path = work_dir / "converted_docs.zip"
        shutil.make_archive(
            base_name=str(file_path.with_suffix("")),
            format="zip",
            root_dir=output_dir,
        )

        # Other cleanups after the response is sent
        # Output directory
        background_tasks.add_task(shutil.rmtree, work_dir, ignore_errors=True)

        response = FileResponse(
            file_path, filename=file_path.name, media_type="application/zip"
        )

    return response
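Since `process_results` returns either a JSON `ConvertDocumentResponse` or a zip `FileResponse`, clients need to branch on the response content type. An illustrative client-side sketch, not part of this change:

import requests

# Hypothetical request; with return_as_file=True the server replies with a zip.
payload = {
    "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
    "options": {"to_formats": ["md"], "return_as_file": True},
}
response = requests.post(
    "http://localhost:5001/v1alpha/convert/source", json=payload
)
if response.headers.get("content-type", "").startswith("application/zip"):
    # Multiple documents, or return_as_file=True: save the archive.
    with open("converted_docs.zip", "wb") as f:
        f.write(response.content)
else:
    # Single document: a ConvertDocumentResponse body.
    body = response.json()
    print(body["status"], f"{body['processing_time']:.2f}s")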
Three new binary files are tracked via Git LFS (pointer details not rendered here).

The diff for the following file is too large to render; see the raw diff.
pyproject.toml (modified):

@@ -30,11 +30,14 @@ classifiers = [
 ]

 [tool.poetry.dependencies]
-python = "
-docling = "^2.
+python = ">=3.10,<3.13" # 3.10 needed for Gradio, and no torchvision build for 3.13 yet
+docling = "^2.14.0"
 fastapi = {version = "^0.115.6", extras = ["standard"]}
+gradio = { version = "^5.9.1", optional = true }
+uvicorn = "~0.29.0"
+pydantic = "^2.10.3"
 pydantic-settings = "^2.4.0"
+python-multipart = "^0.0.19"
 httpx = "^0.28.1"
 tesserocr = { version = "^2.7.1", optional = true }
 rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }

@@ -47,6 +50,7 @@ onnxruntime = [
 [tool.poetry.extras]
+ui = ["gradio"]
 tesserocr = ["tesserocr"]
 rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]

@@ -89,7 +93,9 @@ isort = "^5.13.2"
 pre-commit = "^3.8.0"
 autoflake = "^2.3.1"
 flake8 = "^7.1.1"
-pytest = "^8.3.
+pytest = "^8.3.4"
+pytest-asyncio = "^0.24.0"
+pytest-check = "^2.4.1"
 mypy = "^1.11.2"

 [build-system]

@@ -125,5 +131,22 @@ module = [
 "easyocr.*",
 "tesserocr.*",
 "rapidocr_onnxruntime.*",
+"docling_conversion.*",
+"gradio_ui.*",
+"response_preparation.*",
+"helper_functions.*",
+"requests.*",
 ]
 ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+asyncio_default_fixture_loop_scope = "function"
+minversion = "8.2"
+testpaths = [
+    "tests",
+]
+addopts = "-rA --color=yes --tb=short --maxfail=5"
+markers = [
+    "asyncio",
+]
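With the `ui` extra introduced above, a local setup would presumably be `poetry install --extras ui`; a plain `poetry install` leaves Gradio out and keeps the API-only install smaller.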
start_server.sh (new file) @@ -0,0 +1,30 @@
#!/bin/bash
set -Eeuo pipefail

# Network settings
export PORT="${PORT:-5001}"
export HOST="${HOST:-"0.0.0.0"}"

# Performance settings
UVICORN_WORKERS="${UVICORN_WORKERS:-1}"

# Development settings
export WITH_UI="${WITH_UI:-"true"}"
export RELOAD=${RELOAD:-"false"}

# --------------------------------------
# Process env settings

EXTRA_ARGS=""
if [ "$RELOAD" == "true" ]; then
    EXTRA_ARGS="$EXTRA_ARGS --reload"
fi

# Launch
exec poetry run uvicorn \
    docling_serve.app:app \
    --host=${HOST} \
    --port=${PORT} \
    --timeout-keep-alive=600 \
    ${EXTRA_ARGS} \
    --workers=${UVICORN_WORKERS}
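All of these are plain environment variables, so a development run might look like `RELOAD=true PORT=8080 ./start_server.sh`. Note that uvicorn's `--reload` mode is meant for a single process, so `UVICORN_WORKERS` is best left at its default of 1 when reloading.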
tests/2206.01062v1.pdf (new Git LFS pointer) @@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
size 4310680

tests/2408.09869v5.pdf (new Git LFS pointer) @@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82dd470712ce8389f19f20eb9330475e2166a281f8c7990a9f1d0763d73b4d22
size 5566575
tests/test_1-file-all-outputs.py (new file) @@ -0,0 +1,129 @@
import json
import os

import httpx
import pytest
import pytest_asyncio
from pytest_check import check


@pytest_asyncio.fixture
async def async_client():
    async with httpx.AsyncClient(timeout=60.0) as client:
        yield client


@pytest.mark.asyncio
async def test_convert_file(async_client):
    """Test convert single file to all outputs"""
    url = "http://localhost:5001/v1alpha/convert/file"
    options = {
        "from_formats": [
            "docx",
            "pptx",
            "html",
            "image",
            "pdf",
            "asciidoc",
            "md",
            "xlsx",
        ],
        "to_formats": ["md", "json", "html", "text", "doctags"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "force_ocr": False,
        "ocr_engine": "easyocr",
        "ocr_lang": ["en"],
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
    file_path = os.path.join(current_dir, "2206.01062v1.pdf")

    files = {
        "files": ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf"),
    }

    response = await async_client.post(
        url, files=files, data={"options": json.dumps(options)}
    )
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    # Response content checks
    # Helper function to safely slice strings
    def safe_slice(value, length=100):
        if isinstance(value, str):
            return value[:length]
        return str(value)  # Convert non-string values to string for debug purposes

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )
    # MD check
    check.is_in(
        "md_content",
        data.get("document", {}),
        msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("md_content") is not None:
        check.is_in(
            "## DocLayNet: ",
            data["document"]["md_content"],
            msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
        )
    # JSON check
    check.is_in(
        "json_content",
        data.get("document", {}),
        msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("json_content") is not None:
        check.is_in(
            '{"schema_name": "DoclingDocument"',
            json.dumps(data["document"]["json_content"]),
            msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
        )
    # HTML check
    check.is_in(
        "html_content",
        data.get("document", {}),
        msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("html_content") is not None:
        check.is_in(
            '<!DOCTYPE html>\n<html lang="en">\n<head>',
            data["document"]["html_content"],
            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
        )
    # Text check
    check.is_in(
        "text_content",
        data.get("document", {}),
        msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("text_content") is not None:
        check.is_in(
            "DocLayNet: A Large Human-Annotated Dataset",
            data["document"]["text_content"],
            msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
        )
    # DocTags check
    check.is_in(
        "doctags_content",
        data.get("document", {}),
        msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("doctags_content") is not None:
        check.is_in(
            "<document>\n<section_header_level_1><location>",
            data["document"]["doctags_content"],
            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
        )
tests/test_1-url-all-outputs.py (new file) @@ -0,0 +1,123 @@
import json

import httpx
import pytest
import pytest_asyncio
from pytest_check import check


@pytest_asyncio.fixture
async def async_client():
    async with httpx.AsyncClient(timeout=60.0) as client:
        yield client


@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Test convert URL to all outputs"""
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [{"url": "https://arxiv.org/pdf/2206.01062"}],
    }
    print(json.dumps(payload, indent=2))

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    data = response.json()

    # Response content checks
    # Helper function to safely slice strings
    def safe_slice(value, length=100):
        if isinstance(value, str):
            return value[:length]
        return str(value)  # Convert non-string values to string for debug purposes

    # Document check
    check.is_in(
        "document",
        data,
        msg=f"Response should contain 'document' key. Received keys: {list(data.keys())}",
    )
    # MD check
    check.is_in(
        "md_content",
        data.get("document", {}),
        msg=f"Response should contain 'md_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("md_content") is not None:
        check.is_in(
            "## DocLayNet: ",
            data["document"]["md_content"],
            msg=f"Markdown document should contain 'DocLayNet: '. Received: {safe_slice(data['document']['md_content'])}",
        )
    # JSON check
    check.is_in(
        "json_content",
        data.get("document", {}),
        msg=f"Response should contain 'json_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("json_content") is not None:
        check.is_in(
            '{"schema_name": "DoclingDocument"',
            json.dumps(data["document"]["json_content"]),
            msg=f"JSON document should contain '{{\\n \"schema_name\": \"DoclingDocument'\". Received: {safe_slice(data['document']['json_content'])}",
        )
    # HTML check
    check.is_in(
        "html_content",
        data.get("document", {}),
        msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("html_content") is not None:
        check.is_in(
            '<!DOCTYPE html>\n<html lang="en">\n<head>',
            data["document"]["html_content"],
            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
        )
    # Text check
    check.is_in(
        "text_content",
        data.get("document", {}),
        msg=f"Response should contain 'text_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("text_content") is not None:
        check.is_in(
            "DocLayNet: A Large Human-Annotated Dataset",
            data["document"]["text_content"],
            msg=f"Text document should contain 'DocLayNet: A Large Human-Annotated Dataset'. Received: {safe_slice(data['document']['text_content'])}",
        )
    # DocTags check
    check.is_in(
        "doctags_content",
        data.get("document", {}),
        msg=f"Response should contain 'doctags_content' key. Received keys: {list(data.get('document', {}).keys())}",
    )
    if data.get("document", {}).get("doctags_content") is not None:
        check.is_in(
            "<document>\n<section_header_level_1><location>",
            data["document"]["doctags_content"],
            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
        )
tests/test_2-files-all-outputs.py (new file) @@ -0,0 +1,74 @@
import json
import os

import httpx
import pytest
import pytest_asyncio
from pytest_check import check


@pytest_asyncio.fixture
async def async_client():
    async with httpx.AsyncClient(timeout=60.0) as client:
        yield client


@pytest.mark.asyncio
async def test_convert_file(async_client):
    """Test convert two files, returned as a zip archive"""
    url = "http://localhost:5001/v1alpha/convert/file"
    options = {
        "from_formats": [
            "docx",
            "pptx",
            "html",
            "image",
            "pdf",
            "asciidoc",
            "md",
            "xlsx",
        ],
        "to_formats": ["md", "json", "html", "text", "doctags"],
        "image_export_mode": "placeholder",
        "ocr": True,
        "force_ocr": False,
        "ocr_engine": "easyocr",
        "ocr_lang": ["en"],
        "pdf_backend": "dlparse_v2",
        "table_mode": "fast",
        "abort_on_error": False,
        "return_as_file": False,
    }

    current_dir = os.path.dirname(__file__)
    file_path = os.path.join(current_dir, "2206.01062v1.pdf")

    # Both entries read the same local PDF; the second upload just reuses it
    # under the other test document's filename.
    files = [
        ("files", ("2206.01062v1.pdf", open(file_path, "rb"), "application/pdf")),
        ("files", ("2408.09869v5.pdf", open(file_path, "rb"), "application/pdf")),
    ]

    response = await async_client.post(
        url, files=files, data={"options": json.dumps(options)}
    )
    assert response.status_code == 200, "Response should be 200 OK"

    # Check for zip file attachment
    content_disposition = response.headers.get("content-disposition")

    with check:
        assert (
            content_disposition is not None
        ), "Content-Disposition header should be present"
    with check:
        assert "attachment" in content_disposition, "Response should be an attachment"
    with check:
        assert (
            'filename="converted_docs.zip"' in content_disposition
        ), "Attachment filename should be 'converted_docs.zip'"

    content_type = response.headers.get("content-type")
    with check:
        assert (
            content_type == "application/zip"
        ), "Content-Type should be 'application/zip'"
tests/test_2-urls-all-outputs.py (new file) @@ -0,0 +1,67 @@
import httpx
import pytest
import pytest_asyncio
from pytest_check import check


@pytest_asyncio.fixture
async def async_client():
    async with httpx.AsyncClient(timeout=60.0) as client:
        yield client


@pytest.mark.asyncio
async def test_convert_url(async_client):
    """Test convert two URLs, returned as a zip archive"""
    url = "http://localhost:5001/v1alpha/convert/source"
    payload = {
        "options": {
            "from_formats": [
                "docx",
                "pptx",
                "html",
                "image",
                "pdf",
                "asciidoc",
                "md",
                "xlsx",
            ],
            "to_formats": ["md", "json", "html", "text", "doctags"],
            "image_export_mode": "placeholder",
            "ocr": True,
            "force_ocr": False,
            "ocr_engine": "easyocr",
            "ocr_lang": ["en"],
            "pdf_backend": "dlparse_v2",
            "table_mode": "fast",
            "abort_on_error": False,
            "return_as_file": False,
        },
        "http_sources": [
            {"url": "https://arxiv.org/pdf/2206.01062"},
            {"url": "https://arxiv.org/pdf/2408.09869"},
        ],
    }

    response = await async_client.post(url, json=payload)
    assert response.status_code == 200, "Response should be 200 OK"

    # Check for zip file attachment
    content_disposition = response.headers.get("content-disposition")

    with check:
        assert (
            content_disposition is not None
        ), "Content-Disposition header should be present"
    with check:
        assert "attachment" in content_disposition, "Response should be an attachment"
    with check:
        assert (
            'filename="converted_docs.zip"' in content_disposition
        ), "Attachment filename should be 'converted_docs.zip'"

    content_type = response.headers.get("content-type")
    with check:
        assert (
            content_type == "application/zip"
        ), "Content-Type should be 'application/zip'"