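# Page-header residue captured with this file ("Spaces: Sleeping"); apparently hosting-UI status text, not a Dockerfile instruction.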
# Base image: Python 3.10.14 on Debian slim. The Debian suite is pinned
# explicitly (bookworm) because the apt source configuration further down
# hard-codes "bookworm"; leaving the suite implicit in the tag would let the
# two drift apart.
FROM python:3.10.14-slim-bookworm

# All relative paths in later COPY/RUN/CMD instructions resolve against /app.
# WORKDIR creates the directory if it does not exist.
WORKDIR /app
# Enable the contrib and non-free components for apt.
# When the image ships its repositories in deb822 form
# (/etc/apt/sources.list.d/debian.sources), widen the component list in place;
# writing a second, classic sources.list next to it would configure every
# suite twice. Only if that file is absent, fall back to writing
# /etc/apt/sources.list with the three bookworm entries.
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/^Components: main$/Components: main contrib non-free/' \
            /etc/apt/sources.list.d/debian.sources; \
    else \
        printf '%s\n' \
            "deb http://deb.debian.org/debian bookworm main contrib non-free" \
            "deb http://deb.debian.org/debian-security bookworm-security main contrib non-free" \
            "deb http://deb.debian.org/debian bookworm-updates main contrib non-free" \
            > /etc/apt/sources.list; \
    fi
# System packages, installed in one layer together with the index refresh and
# the cleanup so that neither the package lists nor the download cache remain
# in the image. Packages are listed alphabetically, one per line.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        chromium \
        chromium-driver \
        curl \
        g++ \
        gcc \
        libjpeg-dev \
        libpng-dev \
        libpoppler-dev \
        libpython3-dev \
        poppler-utils \
        procps \
        python3-dev \
        tesseract-ocr-ben \
        "tesseract-ocr=5.3.*" \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Move pip to the pinned release 25.1.1 (a fixed version, not "latest"), and
# keep pip's download cache out of the layer.
RUN python -m pip install --no-cache-dir --upgrade "pip==25.1.1"
# Location of Tesseract's language data; exported for the running container
# and reused by the download step below so the path is written only once.
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

# Make sure the Bengali model exists in the tessdata directory. If the file is
# not already present, fetch it from the tessdata_best repository.
#   -f        fail on HTTP errors instead of saving the error page as the model
#   -sS       no progress meter, but still print errors
#   -L        follow redirects (the raw URL redirects)
#   --retry   retry transient network failures
RUN mkdir -p "$TESSDATA_PREFIX" && \
    if [ ! -f "$TESSDATA_PREFIX/ben.traineddata" ]; then \
        curl -fsSL --retry 3 \
            -o "$TESSDATA_PREFIX/ben.traineddata" \
            https://github.com/tesseract-ocr/tessdata_best/raw/main/ben.traineddata; \
    fi
# Make sure an executable chromedriver is reachable at /usr/bin/chromedriver.
# The symlink is created only when nothing executable is already at that path
# and the /usr/lib/chromium copy exists, so an existing driver binary is never
# replaced by a (possibly dangling) link. The final check fails the build when
# no usable driver is found.
RUN if [ ! -x /usr/bin/chromedriver ] && [ -x /usr/lib/chromium/chromedriver ]; then \
        ln -sf /usr/lib/chromium/chromedriver /usr/bin/chromedriver; \
    fi; \
    if [ -x /usr/bin/chromedriver ]; then \
        echo "chromedriver found"; \
    else \
        echo "chromedriver not found" >&2; \
        exit 1; \
    fi
# Runtime environment for the browser stack: where the Chromium binary and
# chromedriver live, plus a flag requesting headless operation.
ENV CHROMIUM_PATH=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
ENV SELENIUM_HEADLESS=1
# Hand /app to nobody:nogroup and open it to all users (mode 777).
# This runs before any files are copied in, so it affects the directory as it
# exists at this point in the build.
RUN chown -R nobody:nogroup /app \
    && chmod -R 777 /app
# Python dependencies. requirements.txt is copied on its own, ahead of the
# application sources, so this layer is rebuilt only when the requirements
# change. Compiled *.pyc files under the interpreter's library tree are
# removed afterwards to trim the layer.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt \
    && find /usr/local/lib/python3.10 -name '*.pyc' -delete
# Point both Hugging Face cache variables at /data/models, then create that
# directory and make it writable by any user.
ENV HF_HOME=/data/models
ENV HF_HUB_CACHE=/data/models
RUN mkdir -p /data/models \
    && chmod -R 777 /data/models
# Pre-download the Helsinki-NLP/opus-mt-bn-en tokenizer and model into
# /data/models at build time, trying up to three times with a 10-second pause
# between attempts.
# Success is tracked in an explicit flag: the exit status of a `for` loop is
# that of the last command it ran, which after a failed attempt is `sleep`
# (status 0), so the loop's own status cannot be used to detect failure.
RUN downloaded=0; \
    for attempt in 1 2 3; do \
        echo "Attempt $attempt to download model..."; \
        if python -c "import os; os.environ['TRANSFORMERS_VERBOSITY']='error'; \
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; \
AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-bn-en', cache_dir='/data/models'); \
AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-bn-en', cache_dir='/data/models');"; then \
            downloaded=1; \
            break; \
        fi; \
        sleep 10; \
    done; \
    if [ "$downloaded" -ne 1 ]; then \
        echo "Model download failed after 3 attempts" >&2; \
        exit 1; \
    fi
# Application code goes in last so that source edits do not invalidate the
# dependency and model layers above. Destinations are relative to WORKDIR.
COPY app.py init_db.py db_utils.py ./
COPY templates/ ./templates/
COPY static/ ./static/
# Scratch directory for cached OCR results, writable by any user.
RUN mkdir -p /tmp/ocr_cache \
    && chmod -R 777 /tmp/ocr_cache
# Port the server listens on (documentation only; it does not publish the port).
EXPOSE 7860

# Start gunicorn in exec form: one worker, application preloaded, bound to all
# interfaces on 7860, with a 900-second worker timeout and info-level logging.
CMD ["gunicorn", \
     "--workers", "1", \
     "--bind", "0.0.0.0:7860", \
     "--preload", \
     "--timeout", "900", \
     "--log-level", "info", \
     "app:app"]