Spaces:
Configuration error
Configuration error
Commit
·
1ea04fb
1
Parent(s):
8937613
make the image openshift-friendly
Browse files
Signed-off-by: Michele Dolfi <[email protected]>
- Containerfile +46 -17
- models_download.py +36 -0
- os-packages.txt +8 -0
Containerfile
CHANGED
@@ -1,32 +1,61 @@
|
|
1 |
-
|
|
|
|
|
2 |
|
3 |
ARG CPU_ONLY=false
|
4 |
-
WORKDIR /docling-serve
|
5 |
|
6 |
-
|
7 |
-
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
8 |
-
&& apt-get clean
|
9 |
|
10 |
-
|
|
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
-
|
15 |
-
poetry install --no-root --with cpu; \
|
16 |
-
else \
|
17 |
-
poetry install --no-root; \
|
18 |
-
fi
|
19 |
|
20 |
-
|
21 |
-
|
|
|
22 |
|
23 |
-
|
|
|
|
|
24 |
|
25 |
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
26 |
ENV OMP_NUM_THREADS=4
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
EXPOSE 5001
|
31 |
|
32 |
-
CMD ["
|
|
|
1 |
+
# The base image can be replaced at build time with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s

FROM ${BASE_IMAGE}

# Build-time switch: when set to "true", the poetry "cpu" dependency group is installed as well.
ARG CPU_ONLY=false

# Switch to root for the OS package installation below.
USER 0

# -------------------------------------------------------------------------------------------------
# OS layer
# -------------------------------------------------------------------------------------------------

# os-packages.txt is bind-mounted for this step only, so it never becomes part of an image layer.
# The dnf defaults (--best, no docs, no weak deps) are saved first, then the CRB repository is
# enabled, the system is updated, the listed packages are installed and the dnf caches are removed.
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
    dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
    dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
    dnf config-manager --enable crb && \
    dnf -y update && \
    dnf install -y $(cat /tmp/os-packages.txt) && \
    dnf -y clean all && \
    rm -rf /var/cache/dnf

# Directory where Tesseract looks up its language data.
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/

# -------------------------------------------------------------------------------------------------
# Docling layer
# -------------------------------------------------------------------------------------------------

# Everything from here on runs as the unprivileged user 1001.
USER 1001

WORKDIR /opt/app-root/src

# On container environments, always set a thread budget to avoid undesired thread congestion.
# The locale / encoding variables and the UI switch are set in the same instruction.
ENV OMP_NUM_THREADS=4 \
    LANG=en_US.UTF-8 \
    LC_ALL=en_US.UTF-8 \
    PYTHONIOENCODING=utf-8 \
    WITH_UI=True

# Only the files needed to resolve dependencies and download models are copied at this point.
COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./

# We already are in a virtual environment (/opt/app-root), so poetry must not create a new one;
# the existing one is only activated. After the dependencies are installed the model artifacts are
# downloaded, and group 0 is given the same permissions as the owner on the source tree
# (presumably so the image can run under an arbitrary UID in group 0 on OpenShift - confirm).
RUN pip install --no-cache-dir poetry && \
    poetry config virtualenvs.create false && \
    source /opt/app-root/bin/activate && \
    if [ "$CPU_ONLY" = "true" ]; then \
        poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
    else \
        poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
    fi && \
    echo "Downloading models..." && \
    python models_download.py && \
    chown -R 1001:0 /opt/app-root/src && \
    chmod -R g=u /opt/app-root/src

# The application package is copied last.
COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve

EXPOSE 5001

CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
|
models_download.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import zipfile
|
3 |
+
|
4 |
+
import requests
|
5 |
+
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
6 |
+
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
7 |
+
|
8 |
+
# Pre-fetch every model artifact so that it is already on disk after this
# script has run (the Containerfile invokes it during the image build).

# Download Docling models
StandardPdfPipeline.download_models_hf(force=True)
load_pretrained_nlp_models(verbose=True)

# Download EasyOCR models
urls = [
    "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
    "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip",
]

# Temporary location of each archive, derived from its URL so the two lists
# can never get out of step with each other.
download_dir = "/opt/app-root/src"
local_zip_paths = [os.path.join(download_dir, os.path.basename(url)) for url in urls]

extract_path = "/opt/app-root/src/.EasyOCR/model/"

# (connect, read) timeouts in seconds: without them a stalled connection
# would block the build forever.
request_timeout = (10, 60)

for url, local_zip_path in zip(urls, local_zip_paths):
    try:
        # Stream the archive to disk instead of holding it fully in memory,
        # and fail right away on an HTTP error rather than saving the error
        # page as if it were a zip file.
        with requests.get(url, stream=True, timeout=request_timeout) as response:
            response.raise_for_status()
            with open(local_zip_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Unzip the file
        with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Clean up the zip file, even when the download or extraction failed
        if os.path.exists(local_zip_path):
            os.remove(local_zip_path)
|
os-packages.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tesseract
|
2 |
+
tesseract-devel
|
3 |
+
tesseract-langpack-eng
|
4 |
+
leptonica-devel
|
5 |
+
libglvnd-glx
|
6 |
+
glib2
|
7 |
+
wget
|
8 |
+
git
|