docling / Dockerfile
levalencia's picture
Update Dockerfile to include test_permissions.py and enhance startup script for environment verification
442515d
# Base image: the slim variant of the official CPython 3.9 image.
FROM python:3.9-slim
# Relative paths used by later COPY instructions and by start.sh
# (test_permissions.py, src/streamlit_app.py) resolve against /app.
WORKDIR /app
# Home and temp lookups all resolve to one scratch directory under /tmp,
# keeping writes away from the rest of the root filesystem.
ENV TEMP_DIR=/tmp/docling_temp \
    HOME=/tmp/docling_temp \
    USERPROFILE=/tmp/docling_temp \
    TMPDIR=/tmp/docling_temp \
    TEMP=/tmp/docling_temp \
    TMP=/tmp/docling_temp

# Hugging Face caches: pinned explicitly so the libraries never fall back
# to a cache directory directly under / (e.g. /.cache).
ENV HF_HOME=/tmp/docling_temp/huggingface \
    HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache \
    HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache \
    TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache \
    HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache \
    DIFFUSERS_CACHE=/tmp/docling_temp/diffusers_cache \
    ACCELERATE_CACHE=/tmp/docling_temp/accelerate_cache

# Hugging Face Hub behaviour switches: telemetry off, no implicit token,
# offline mode disabled (downloads allowed).
ENV HF_HUB_DISABLE_TELEMETRY=1 \
    HF_HUB_DISABLE_IMPLICIT_TOKEN=1 \
    HF_HUB_OFFLINE=0

# Cache roots for the other ML frameworks.
ENV TORCH_HOME=/tmp/docling_temp/torch \
    TENSORFLOW_HOME=/tmp/docling_temp/tensorflow \
    KERAS_HOME=/tmp/docling_temp/keras

# XDG base directories.
ENV XDG_CACHE_HOME=/tmp/docling_temp/cache \
    XDG_CONFIG_HOME=/tmp/docling_temp/config \
    XDG_DATA_HOME=/tmp/docling_temp/data

# EasyOCR model storage.
ENV EASYOCR_MODULE_PATH=/tmp/docling_temp/easyocr_models

# Generic cache/model/data/config locations.
ENV CACHE_DIR=/tmp/docling_temp/cache \
    MODEL_CACHE_DIR=/tmp/docling_temp/models \
    CACHE=/tmp/docling_temp/cache \
    MODELS=/tmp/docling_temp/models \
    DATA=/tmp/docling_temp/data \
    CONFIG=/tmp/docling_temp/config

# Extra module search path for the Python interpreter.
ENV PYTHONPATH=/tmp/docling_temp
# OS packages: a C/C++ toolchain (build-essential), curl (used by the
# HEALTHCHECK probe below) and git.
# - --no-install-recommends keeps optional "recommended" packages out of the
#   image.
# - software-properties-common was dropped: no step in this Dockerfile calls
#   add-apt-repository, and the package is not shipped in Debian 13 (trixie)
#   based images, where asking for it fails the build
#   (NOTE(review): re-add only if something outside this file needs it).
# - The apt package lists are removed in the same layer so they never reach
#   the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*
# Pre-create every writable location with open permissions so the app can
# write there regardless of which user the container runs as.
#
# All cache-related ENV variables in this file point at
# /tmp/docling_temp/<name>, so those are the directories that must exist.
# The sibling /tmp/<name> directories that were created previously are kept
# as well, in case application code outside this file still refers to them.
#
# /app/.streamlit only holds the generated config.toml and stays 755.
RUN set -eu; \
    mkdir -p /app/.streamlit /tmp/docling_temp; \
    chmod 755 /app/.streamlit; \
    chmod 777 /tmp/docling_temp; \
    for name in \
        accelerate_cache \
        cache \
        config \
        data \
        datasets_cache \
        diffusers_cache \
        easyocr_models \
        huggingface \
        huggingface_cache \
        keras \
        models \
        tensorflow \
        torch \
        transformers_cache \
    ; do \
        mkdir -p "/tmp/docling_temp/${name}" "/tmp/${name}"; \
        chmod 777 "/tmp/docling_temp/${name}" "/tmp/${name}"; \
    done
# Top-level project files (dependency list, project metadata, the permissions
# self-test run by start.sh, and the README) go straight into /app ...
COPY requirements.txt pyproject.toml test_permissions.py README.md ./
# ... and the application sources into /app/src.
COPY src/ ./src/
# Write /app/.streamlit/config.toml during the build rather than copying it
# from the build context. printf emits one output line per argument; an empty
# argument ('') yields the blank line that separates the TOML tables.
RUN printf '%s\n' \
    '[global]' \
    'developmentMode = false' \
    '' \
    '[server]' \
    'fileWatcherType = "none"' \
    'headless = true' \
    'enableCORS = false' \
    'enableXsrfProtection = false' \
    '' \
    '[browser]' \
    'gatherUsageStats = false' \
    'serverAddress = "0.0.0.0"' \
    'serverPort = 8501' \
    '' \
    '[theme]' \
    'primaryColor = "#1f77b4"' \
    'backgroundColor = "#ffffff"' \
    'secondaryBackgroundColor = "#f0f2f6"' \
    'textColor = "#262730"' \
    > /app/.streamlit/config.toml
# Install the Python dependencies.
# --no-cache-dir: HOME and XDG_CACHE_HOME are redirected to /tmp/docling_temp
# above, so pip's download/wheel cache would otherwise be written there at
# build time, inflating the image and leaving build-user-owned files inside
# the directory the application later uses as its own cache.
RUN pip3 install --no-cache-dir -r requirements.txt
# Streamlit is started on port 8501 (see --server.port in start.sh).
EXPOSE 8501
# Probe Streamlit's health endpoint. curl reports failures with its own exit
# codes (e.g. 7 for connection refused, 22 for an HTTP error with --fail);
# `|| exit 1` maps all of them onto 1, the exit status Docker documents for
# "unhealthy".
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1
# Generate /app/start.sh, the container entrypoint. The script:
#   1. re-exports the same cache/temp variables that the ENV instructions set,
#   2. prints a few of them for diagnostics,
#   3. runs test_permissions.py and only launches Streamlit if it succeeds
#      (exec replaces the shell so Streamlit becomes the main process),
#      otherwise exits with status 1.
# The single quotes keep $HF_HUB_CACHE, $? etc. literal at build time so they
# are expanded when the script runs, not while the image is built.
RUN { \
      printf '%s\n' '#!/bin/bash' && \
      printf 'export %s\n' \
        'TEMP_DIR=/tmp/docling_temp' \
        'HOME=/tmp/docling_temp' \
        'USERPROFILE=/tmp/docling_temp' \
        'TMPDIR=/tmp/docling_temp' \
        'TEMP=/tmp/docling_temp' \
        'TMP=/tmp/docling_temp' \
        'HF_HOME=/tmp/docling_temp/huggingface' \
        'HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache' \
        'HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache' \
        'TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache' \
        'HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache' \
        'DIFFUSERS_CACHE=/tmp/docling_temp/diffusers_cache' \
        'ACCELERATE_CACHE=/tmp/docling_temp/accelerate_cache' \
        'HF_HUB_DISABLE_TELEMETRY=1' \
        'HF_HUB_DISABLE_IMPLICIT_TOKEN=1' \
        'HF_HUB_OFFLINE=0' \
        'TORCH_HOME=/tmp/docling_temp/torch' \
        'TENSORFLOW_HOME=/tmp/docling_temp/tensorflow' \
        'KERAS_HOME=/tmp/docling_temp/keras' \
        'XDG_CACHE_HOME=/tmp/docling_temp/cache' \
        'XDG_CONFIG_HOME=/tmp/docling_temp/config' \
        'XDG_DATA_HOME=/tmp/docling_temp/data' \
        'EASYOCR_MODULE_PATH=/tmp/docling_temp/easyocr_models' \
        'CACHE_DIR=/tmp/docling_temp/cache' \
        'MODEL_CACHE_DIR=/tmp/docling_temp/models' \
        'CACHE=/tmp/docling_temp/cache' \
        'MODELS=/tmp/docling_temp/models' \
        'DATA=/tmp/docling_temp/data' \
        'CONFIG=/tmp/docling_temp/config' \
        'PYTHONPATH=/tmp/docling_temp' && \
      printf '%s\n' \
        'echo "Environment variables set for Hugging Face Hub cache directories"' \
        'echo "HF_HUB_CACHE: $HF_HUB_CACHE"' \
        'echo "HF_CACHE_HOME: $HF_CACHE_HOME"' \
        'echo "TEMP_DIR: $TEMP_DIR"' \
        'echo "Running environment test..."' \
        'python test_permissions.py' \
        'if [ $? -eq 0 ]; then' \
        ' echo "Environment test passed, starting Streamlit app..."' \
        ' exec streamlit run src/streamlit_app.py --server.port=8501 --server.address=0.0.0.0' \
        'else' \
        ' echo "Environment test failed, exiting..."' \
        ' exit 1' \
        'fi'; \
    } > /app/start.sh && \
    chmod +x /app/start.sh
# Exec form: start.sh is run directly, without an extra /bin/sh -c wrapper.
ENTRYPOINT ["/app/start.sh"]