dolfim-ibm committed
Commit 1ea04fb · 1 Parent(s): 8937613

make the image openshift-friendly


Signed-off-by: Michele Dolfi <[email protected]>

Files changed (3)
  1. Containerfile +46 -17
  2. models_download.py +36 -0
  3. os-packages.txt +8 -0
Containerfile CHANGED
@@ -1,32 +1,61 @@
- FROM python:3.11-slim-bookworm
+ ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
+
+ FROM ${BASE_IMAGE}

  ARG CPU_ONLY=false
- WORKDIR /docling-serve

- RUN apt-get update \
-     && apt-get install -y libgl1 libglib2.0-0 curl wget git \
-     && apt-get clean
+ USER 0

- RUN pip install --no-cache-dir poetry
+ ###################################################################################################
+ # OS Layer #
+ ###################################################################################################

- COPY pyproject.toml poetry.lock README.md /docling-serve/
+ RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
+     dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
+     dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
+     dnf config-manager --enable crb && \
+     dnf -y update && \
+     dnf install -y $(cat /tmp/os-packages.txt) && \
+     dnf -y clean all && \
+     rm -rf /var/cache/dnf

- RUN if [ "$CPU_ONLY" = "true" ]; then \
-     poetry install --no-root --with cpu; \
-     else \
-     poetry install --no-root; \
-     fi
+ ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/

- ENV HF_HOME=/tmp/
- ENV TORCH_HOME=/tmp/
+ ###################################################################################################
+ # Docling layer #
+ ###################################################################################################

- RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
+ USER 1001
+
+ WORKDIR /opt/app-root/src

  # On container environments, always set a thread budget to avoid undesired thread congestion.
  ENV OMP_NUM_THREADS=4

- COPY ./docling_serve /docling-serve/docling_serve
+ ENV LANG=en_US.UTF-8
+ ENV LC_ALL=en_US.UTF-8
+ ENV PYTHONIOENCODING=utf-8
+
+ ENV WITH_UI=True
+
+ COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./
+
+ RUN pip install --no-cache-dir poetry && \
+     # We already are in a virtual environment, so we don't need to create a new one, only activate it.
+     poetry config virtualenvs.create false && \
+     source /opt/app-root/bin/activate && \
+     if [ "$CPU_ONLY" = "true" ]; then \
+         poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
+     else \
+         poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
+     fi && \
+     echo "Downloading models..." && \
+     python models_download.py && \
+     chown -R 1001:0 /opt/app-root/src && \
+     chmod -R g=u /opt/app-root/src
+
+ COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve

  EXPOSE 5001

- CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
+ CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
models_download.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ import zipfile
+
+ import requests
+ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+ # Download Docling models
+ StandardPdfPipeline.download_models_hf(force=True)
+ load_pretrained_nlp_models(verbose=True)
+
+ # Download EasyOCR models
+ urls = [
+     "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
+     "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"
+ ]
+
+ local_zip_paths = [
+     "/opt/app-root/src/latin_g2.zip",
+     "/opt/app-root/src/craft_mlt_25k.zip"
+ ]
+
+ extract_path = "/opt/app-root/src/.EasyOCR/model/"
+
+ for url, local_zip_path in zip(urls, local_zip_paths):
+     # Download the file
+     response = requests.get(url)
+     with open(local_zip_path, "wb") as file:
+         file.write(response.content)
+
+     # Unzip the file
+     with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
+         zip_ref.extractall(extract_path)
+
+     # Clean up the zip file
+     os.remove(local_zip_path)
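
Because models_download.py runs during the image build, the weights should already be baked into the image. A quick spot check, reusing the image tag assumed in the sketch above (the Hugging Face cache path is the expected default under the sclorg HOME, not something this commit sets explicitly):

# EasyOCR detection/recognition weights extracted by the script
podman run --rm docling-serve:cpu ls /opt/app-root/src/.EasyOCR/model
# Docling models fetched through Hugging Face (default cache, since HF_HOME is no longer overridden)
podman run --rm docling-serve:cpu ls /opt/app-root/src/.cache/huggingface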
os-packages.txt ADDED
@@ -0,0 +1,8 @@
+ tesseract
+ tesseract-devel
+ tesseract-langpack-eng
+ leptonica-devel
+ libglvnd-glx
+ glib2
+ wget
+ git
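
These packages provide the Tesseract OCR backend (with the English language pack) that TESSDATA_PREFIX in the Containerfile points at; a hedged sanity check against the built image, again assuming the tag from the earlier sketch:

podman run --rm docling-serve:cpu tesseract --list-langs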