# bangla-translator / Dockerfile
# Use Python 3.10.14 slim for reproducibility and compatibility
FROM python:3.10.14-slim
# Set the working directory early; subsequent COPY and RUN instructions resolve paths relative to /app
WORKDIR /app
# Update package sources to include contrib and non-free repositories
RUN echo "deb http://deb.debian.org/debian bookworm main contrib non-free" > /etc/apt/sources.list && \
    echo "deb http://deb.debian.org/debian-security bookworm-security main contrib non-free" >> /etc/apt/sources.list && \
    echo "deb http://deb.debian.org/debian bookworm-updates main contrib non-free" >> /etc/apt/sources.list
# Install system dependencies in a single layer to reduce size
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr=5.3.* \
    tesseract-ocr-ben \
    poppler-utils \
    libpoppler-dev \
    curl \
    chromium \
    chromium-driver \
    libjpeg-dev \
    zlib1g-dev \
    libpng-dev \
    gcc \
    python3-dev \
    libpython3-dev \
    g++ \
    procps \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Upgrade pip to a pinned release for reproducible builds
RUN pip install --no-cache-dir --upgrade pip==25.1.1
# Ensure the tessdata directory exists and download ben.traineddata if it is missing
RUN mkdir -p /usr/share/tesseract-ocr/5/tessdata && \
    if [ ! -f /usr/share/tesseract-ocr/5/tessdata/ben.traineddata ]; then \
        curl -L -o /usr/share/tesseract-ocr/5/tessdata/ben.traineddata \
            https://github.com/tesseract-ocr/tessdata_best/raw/main/ben.traineddata; \
    fi
# Set TESSDATA_PREFIX
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
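# For reference, the app is expected to consume TESSDATA_PREFIX through an OCR call such as
# the hypothetical pytesseract sketch below (pytesseract and Pillow are assumed to be in requirements.txt):
#   import pytesseract
#   from PIL import Image
#   # lang='ben' selects ben.traineddata from $TESSDATA_PREFIX
#   text = pytesseract.image_to_string(Image.open("page.png"), lang="ben")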
# Ensure chromedriver is available at /usr/bin/chromedriver without clobbering an existing binary
RUN if [ ! -x /usr/bin/chromedriver ] && [ -x /usr/lib/chromium/chromedriver ]; then \
        ln -s /usr/lib/chromium/chromedriver /usr/bin/chromedriver; \
    fi && \
    [ -x /usr/bin/chromedriver ] || ( echo "chromedriver not found" && exit 1 )
# Set environment variables for Chromium and chromedriver
ENV CHROMIUM_PATH=/usr/bin/chromium \
    CHROMEDRIVER_PATH=/usr/bin/chromedriver \
    SELENIUM_HEADLESS=1
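# The paths above are intended for Selenium inside the app; a minimal sketch, assuming
# Selenium 4 is listed in requirements.txt (variable names here are illustrative only):
#   import os
#   from selenium import webdriver
#   from selenium.webdriver.chrome.service import Service
#   opts = webdriver.ChromeOptions()
#   opts.binary_location = os.environ["CHROMIUM_PATH"]
#   if os.environ.get("SELENIUM_HEADLESS") == "1":
#       opts.add_argument("--headless=new")
#   driver = webdriver.Chrome(service=Service(os.environ["CHROMEDRIVER_PATH"]), options=opts)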
# Make /app writable for the Space's non-root runtime user
RUN chmod -R 777 /app && chown -R nobody:nogroup /app
# Copy and install requirements
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
    find /usr/local/lib/python3.10 -name "*.pyc" -delete
# Set Hugging Face cache and create model directory
ENV HF_HOME=/data/models \
    HF_HUB_CACHE=/data/models
RUN mkdir -p /data/models && chmod -R 777 /data/models
# Pre-download the model with retries
RUN downloaded=0 && \
    for attempt in 1 2 3; do \
        echo "Attempt $attempt to download model..." && \
        python -c "import os; os.environ['TRANSFORMERS_VERBOSITY']='error'; \
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; \
            AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-bn-en', cache_dir='/data/models'); \
            AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-bn-en', cache_dir='/data/models')" && \
        downloaded=1 && break || sleep 10; \
    done && \
    [ "$downloaded" = "1" ] || ( echo "Model download failed after 3 attempts" && exit 1 )
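# At runtime the app can load the pre-downloaded weights from the same cache without network
# access; a sketch of the expected translation call, assuming the app uses transformers directly
# as the pre-download step above does (the actual app.py is not shown here):
#   from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#   tok = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-bn-en", cache_dir="/data/models")
#   model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-bn-en", cache_dir="/data/models")
#   batch = tok(["আমি বাংলায় গান গাই"], return_tensors="pt", padding=True)
#   english = tok.batch_decode(model.generate(**batch), skip_special_tokens=True)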
# Copy application files
COPY app.py init_db.py db_utils.py ./
COPY templates/ templates/
COPY static/ static/
# Create cache directory for OCR
RUN mkdir -p /tmp/ocr_cache && chmod -R 777 /tmp/ocr_cache
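# /tmp/ocr_cache is presumably where the app keeps intermediate page images; for example,
# with pdf2image (an assumed dependency) driving the poppler-utils installed above:
#   from pdf2image import convert_from_path
#   pages = convert_from_path("input.pdf", dpi=300, output_folder="/tmp/ocr_cache")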
# Expose port
EXPOSE 7860
# Run with gunicorn, single worker, 900-second timeout
CMD ["gunicorn", "--workers", "1", "--bind", "0.0.0.0:7860", "--preload", "--timeout", "900", "--log-level", "info", "app:app"]