dolfim-ibm committed
Commit 1ea04fb · 1 Parent(s): 8937613

make the image openshift-friendly


Signed-off-by: Michele Dolfi <[email protected]>

Files changed (3)
  1. Containerfile +46 -17
  2. models_download.py +36 -0
  3. os-packages.txt +8 -0
Containerfile CHANGED
@@ -1,32 +1,61 @@
- FROM python:3.11-slim-bookworm
+ ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
+
+ FROM ${BASE_IMAGE}

  ARG CPU_ONLY=false
- WORKDIR /docling-serve

- RUN apt-get update \
-     && apt-get install -y libgl1 libglib2.0-0 curl wget git \
-     && apt-get clean
+ USER 0

- RUN pip install --no-cache-dir poetry
+ ###################################################################################################
+ # OS Layer #
+ ###################################################################################################

- COPY pyproject.toml poetry.lock README.md /docling-serve/
+ RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
+     dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
+     dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
+     dnf config-manager --enable crb && \
+     dnf -y update && \
+     dnf install -y $(cat /tmp/os-packages.txt) && \
+     dnf -y clean all && \
+     rm -rf /var/cache/dnf

- RUN if [ "$CPU_ONLY" = "true" ]; then \
-     poetry install --no-root --with cpu; \
-     else \
-     poetry install --no-root; \
-     fi
+ ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/

- ENV HF_HOME=/tmp/
- ENV TORCH_HOME=/tmp/
+ ###################################################################################################
+ # Docling layer #
+ ###################################################################################################

- RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
+ USER 1001
+
+ WORKDIR /opt/app-root/src

  # On container environments, always set a thread budget to avoid undesired thread congestion.
  ENV OMP_NUM_THREADS=4

- COPY ./docling_serve /docling-serve/docling_serve
+ ENV LANG=en_US.UTF-8
+ ENV LC_ALL=en_US.UTF-8
+ ENV PYTHONIOENCODING=utf-8
+
+ ENV WITH_UI=True
+
+ COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./
+
+ RUN pip install --no-cache-dir poetry && \
+     # We already are in a virtual environment, so we don't need to create a new one, only activate it.
+     poetry config virtualenvs.create false && \
+     source /opt/app-root/bin/activate && \
+     if [ "$CPU_ONLY" = "true" ]; then \
+         poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
+     else \
+         poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
+     fi && \
+     echo "Downloading models..." && \
+     python models_download.py && \
+     chown -R 1001:0 /opt/app-root/src && \
+     chmod -R g=u /opt/app-root/src
+
+ COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve

  EXPOSE 5001

- CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
+ CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
models_download.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ import zipfile
+
+ import requests
+ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+ # Download Docling models
+ StandardPdfPipeline.download_models_hf(force=True)
+ load_pretrained_nlp_models(verbose=True)
+
+ # Download EasyOCR models
+ urls = [
+     "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
+     "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"
+ ]
+
+ local_zip_paths = [
+     "/opt/app-root/src/latin_g2.zip",
+     "/opt/app-root/src/craft_mlt_25k.zip"
+ ]
+
+ extract_path = "/opt/app-root/src/.EasyOCR/model/"
+
+ for url, local_zip_path in zip(urls, local_zip_paths):
+     # Download the file
+     response = requests.get(url)
+     with open(local_zip_path, "wb") as file:
+         file.write(response.content)
+
+     # Unzip the file
+     with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
+         zip_ref.extractall(extract_path)
+
+     # Clean up the zip file
+     os.remove(local_zip_path)
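
Because models_download.py runs during the image build, the weights should already be baked into the image. A quick spot check, reusing the image tag assumed in the sketch above (the Hugging Face cache path is the expected default under the sclorg HOME, not something this commit sets explicitly):

# EasyOCR detection/recognition weights extracted by the script
podman run --rm docling-serve:cpu ls /opt/app-root/src/.EasyOCR/model
# Docling models fetched through Hugging Face (default cache, since HF_HOME is no longer overridden)
podman run --rm docling-serve:cpu ls /opt/app-root/src/.cache/huggingface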
os-packages.txt ADDED
@@ -0,0 +1,8 @@
+ tesseract
+ tesseract-devel
+ tesseract-langpack-eng
+ leptonica-devel
+ libglvnd-glx
+ glib2
+ wget
+ git
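
These packages provide the Tesseract OCR backend (with the English language pack) that TESSDATA_PREFIX in the Containerfile points at; a hedged sanity check against the built image, again assuming the tag from the earlier sketch:

podman run --rm docling-serve:cpu tesseract --list-langs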