Spaces:

Chaitanya895
/

bangla-translator

Sleeping

App Files Files Community

bangla-translator / Dockerfile

Chaitanya895

Update Dockerfile

4e7abd2 verified 3 months ago

raw

history blame contribute delete

3.34 kB

	# Base image: CPython 3.10.14 on Debian slim.
	# The Debian release is pinned explicitly ("bookworm") because the apt
	# sources configured further down hard-code bookworm repositories; leaving
	# the release implicit would let the base and the package sources drift apart.
	FROM python:3.10.14-slim-bookworm

	# All relative paths in later instructions (COPY, RUN, CMD) resolve against /app.
	# WORKDIR creates the directory if it does not exist.
	WORKDIR /app

	# Replace /etc/apt/sources.list with the bookworm, bookworm-security and
	# bookworm-updates repositories, each with main, contrib and non-free enabled.
	# printf emits one line per argument, so the file ends up with exactly these
	# three entries.
	RUN printf '%s\n' \
	        "deb http://deb.debian.org/debian bookworm main contrib non-free" \
	        "deb http://deb.debian.org/debian-security bookworm-security main contrib non-free" \
	        "deb http://deb.debian.org/debian bookworm-updates main contrib non-free" \
	        > /etc/apt/sources.list

	# OS-level dependencies, installed and cleaned up in one layer so the apt
	# index and package cache never persist in the image.
	# Packages are listed alphabetically, one per line, to keep diffs small.
	#   - tesseract-ocr / tesseract-ocr-ben: OCR engine and Bengali language data
	#   - poppler-utils / libpoppler-dev:     PDF tooling
	#   - chromium / chromium-driver:         browser and WebDriver binary
	#   - gcc, g++, *-dev:                    toolchain and headers for building wheels
	# NOTE(review): python3-dev / libpython3-dev are Debian's own Python packages,
	# not the interpreter shipped by the base image - confirm they are still needed.
	RUN apt-get update \
	    && apt-get install -y --no-install-recommends \
	        chromium \
	        chromium-driver \
	        curl \
	        g++ \
	        gcc \
	        libjpeg-dev \
	        libpng-dev \
	        libpoppler-dev \
	        libpython3-dev \
	        poppler-utils \
	        procps \
	        python3-dev \
	        tesseract-ocr=5.3.* \
	        tesseract-ocr-ben \
	        zlib1g-dev \
	    && apt-get clean \
	    && rm -rf /var/lib/apt/lists/*

	# Move pip to the pinned release 25.1.1 (a fixed version, not "latest"),
	# without keeping pip's download cache in the layer.
	RUN pip install --upgrade --no-cache-dir pip==25.1.1

	# Make sure the Bengali trained-data file exists in the Tesseract 5 tessdata
	# directory. The download only runs when the file is absent (presumably the
	# tesseract-ocr-ben package normally provides it - verify).
	# curl flags:
	#   -f         fail on HTTP errors instead of saving the error page as
	#              ben.traineddata and letting the build continue with a corrupt model
	#   -L         follow the GitHub redirect to the raw file
	#   --retry 3  retry transient network failures
	RUN mkdir -p /usr/share/tesseract-ocr/5/tessdata && \
	    if [ ! -f /usr/share/tesseract-ocr/5/tessdata/ben.traineddata ]; then \
	        curl -fL --retry 3 \
	            -o /usr/share/tesseract-ocr/5/tessdata/ben.traineddata \
	            https://github.com/tesseract-ocr/tessdata_best/raw/main/ben.traineddata; \
	    fi

	# Point Tesseract at the directory populated above.
	ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

	# Guarantee an executable chromedriver at /usr/bin/chromedriver, or fail the build.
	# The checks run before any linking because `ln -sf` exits 0 even when its
	# source is missing and replaces whatever already sits at the destination:
	# linking unconditionally could swap a working binary for a dangling symlink
	# and would never reach the failure branch.
	#   1. already present and executable -> keep it
	#   2. present under /usr/lib/chromium -> symlink it into /usr/bin
	#   3. neither                        -> abort the build
	RUN if [ -x /usr/bin/chromedriver ]; then \
	        echo "chromedriver found"; \
	    elif [ -x /usr/lib/chromium/chromedriver ]; then \
	        ln -sf /usr/lib/chromium/chromedriver /usr/bin/chromedriver; \
	    else \
	        echo "chromedriver not found" >&2; \
	        exit 1; \
	    fi

	# Runtime environment for the browser automation code, one variable per
	# instruction: the Chromium binary, the WebDriver binary, and a headless flag.
	# NOTE(review): how the application consumes these is not visible in this file.
	ENV CHROMIUM_PATH=/usr/bin/chromium
	ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
	ENV SELENIUM_HEADLESS=1

	# Hand /app to nobody:nogroup and open it up to every user.
	# NOTE(review): this runs before any file is copied into /app, so it only
	# affects the directory itself; files added by later COPY instructions keep
	# their own ownership. No USER instruction is visible in this file, so the
	# runtime user is presumably chosen by the hosting platform - confirm, and
	# consider narrowing the 777 mode once the runtime UID is known.
	RUN chown -R nobody:nogroup /app \
	    && chmod -R 777 /app

	# Python dependencies. Only the manifest is copied here so this layer is
	# rebuilt when requirements.txt changes, not on every source edit.
	COPY requirements.txt ./
	# Install without pip's cache, then drop compiled bytecode files from the
	# interpreter's library tree in the same layer.
	RUN pip install --no-cache-dir --requirement requirements.txt \
	    && find /usr/local/lib/python3.10 -name "*.pyc" -delete

	# Hugging Face home and hub cache both point at /data/models.
	ENV HF_HOME=/data/models
	ENV HF_HUB_CACHE=/data/models

	# Create the cache directory and make it writable by any user.
	RUN mkdir -p /data/models \
	    && chmod -R 777 /data/models

	# Pre-download the Helsinki-NLP/opus-mt-bn-en tokenizer and model into
	# /data/models at build time. Up to three attempts are made, with a
	# 10-second pause after each failed one.
	# Success is tracked in an explicit flag rather than via the loop's exit
	# status: with a `cmd && break || sleep 10` body, the last command of a
	# failed final attempt is `sleep`, which exits 0, so the loop would report
	# success and the build would continue without the model.
	RUN downloaded=0; \
	    for attempt in 1 2 3; do \
	        echo "Attempt $attempt to download model..."; \
	        if python -c "import os; os.environ['TRANSFORMERS_VERBOSITY']='error'; \
	            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; \
	            AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-bn-en', cache_dir='/data/models'); \
	            AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-bn-en', cache_dir='/data/models');"; then \
	            downloaded=1; \
	            break; \
	        fi; \
	        sleep 10; \
	    done; \
	    if [ "$downloaded" -ne 1 ]; then \
	        echo "Model download failed after 3 attempts" >&2; \
	        exit 1; \
	    fi

	# Application code and assets, copied after the dependency layers so a
	# source change does not invalidate them. Destinations are spelled out as
	# absolute paths under /app, which is the WORKDIR set at the top of the file.
	COPY app.py init_db.py db_utils.py /app/
	COPY templates/ /app/templates/
	COPY static/ /app/static/

	# Scratch directory for cached OCR output, writable by any user.
	RUN mkdir -p /tmp/ocr_cache \
	    && chmod -R 777 /tmp/ocr_cache

	# Document the port gunicorn binds to below.
	EXPOSE 7860

	# Start the WSGI app `app` from module `app` under gunicorn (exec form):
	# bound to all interfaces on 7860, a single worker, a 900-second worker
	# timeout, the application preloaded before the worker forks, info logging.
	CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--timeout", "900", "--preload", "--log-level", "info", "app:app"]