File size: 4,865 Bytes
cfeb3a6 17e3d1d cfeb3a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# Base image: the official Python 3.11 "slim" variant.
FROM python:3.11-slim

# Settings shared by the remaining build steps and by the running container:
#   PYTHONUNBUFFERED              - unbuffered stdout/stderr so logs show up immediately
#   PYTHONDONTWRITEBYTECODE       - do not write .pyc files
#   PIP_NO_CACHE_DIR              - keep pip's download cache out of the image
#   PIP_DISABLE_PIP_VERSION_CHECK - skip pip's "new version available" check
#   DEBIAN_FRONTEND               - keep apt/debconf from prompting during the build
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    DEBIAN_FRONTEND=noninteractive

# Unprivileged "app" account that later owns the application files.
# The UID is pinned to 1000 instead of being left to useradd's default
# allocation: Hugging Face Docker Spaces document that containers run as
# user ID 1000, so file ownership must match that number exactly.
# A later USER instruction in this file decides which account runs the app.
RUN useradd --create-home --shell /bin/bash --uid 1000 app

# All following relative paths (COPY, RUN, CMD) resolve against /app.
WORKDIR /app
# System packages, installed and cleaned up in one layer so the apt index
# never persists in the image. The list is sorted alphabetically to keep
# diffs small. Rough grouping:
#   build toolchain : build-essential cmake g++ gcc make pkg-config
#   download tools  : curl git wget
#   crypto / FFI    : libffi-dev libssl-dev
#   imaging         : libfreetype-dev libjpeg-dev libopenjp2-7-dev libpng-dev libtiff-dev
#   XML / zlib      : libxml2-dev libxslt1-dev zlib1g-dev
#   OCR / PDF       : poppler-utils tesseract-ocr tesseract-ocr-eng
#   SQLite          : libsqlite3-dev sqlite3
# NOTE: libtiff-dev and libfreetype-dev replace the versioned names
# libtiff5-dev and libfreetype6-dev, which are legacy transitional aliases
# on current Debian releases and are not guaranteed to remain available.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        libffi-dev \
        libfreetype-dev \
        libjpeg-dev \
        libopenjp2-7-dev \
        libpng-dev \
        libsqlite3-dev \
        libssl-dev \
        libtiff-dev \
        libxml2-dev \
        libxslt1-dev \
        make \
        pkg-config \
        poppler-utils \
        sqlite3 \
        tesseract-ocr \
        tesseract-ocr-eng \
        wget \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && rm -rf /var/cache/apt/*
# Bring the packaging toolchain (pip, setuptools, wheel) up to date before
# any project dependency is installed.
RUN python -m pip install --upgrade pip setuptools wheel

# Persist pip defaults in pip's configuration file: no download cache and no
# self-update check.
# The former "global.trusted-host" entry for pypi.org / files.pythonhosted.org /
# pypi.python.org was removed on purpose: marking a host as trusted tells pip
# to accept it without valid HTTPS, which switches off certificate
# verification for every later install.
# NOTE(review): if builds must run behind a TLS-intercepting proxy, install
# the proxy's CA certificate instead of re-adding trusted-host.
RUN pip config set global.no-cache-dir true \
    && pip config set global.disable-pip-version-check true
# Copy only the dependency manifest first, so the install layer below is
# rebuilt only when requirements.txt changes, not on every source edit.
COPY requirements.txt .

# 1. Install the project's declared dependencies.
# 2. Install extra packages that requirements.txt is not relied on to provide:
#    gunicorn, psutil and uvloop.
#    NOTE(review): these three are unpinned; pin versions for reproducible builds.
# 3. Print pip's outdated-package report into the build log. The report is
#    informational only, so a failure of that query (for example a transient
#    index error) must not abort the build - hence "|| true".
RUN pip install --no-cache-dir --upgrade -r requirements.txt \
    && pip install --no-cache-dir \
        gunicorn \
        psutil \
        uvloop \
    && (pip list --outdated || true)
# Copy the application source into /app (the current WORKDIR).
COPY . .

# Create the runtime directories and set permissions/ownership in ONE layer.
# Doing the recursive chmod/chown in separate RUN steps would store a second
# and third copy of every touched file in the image.
#
#   temp logs uploads downloads cache  - working directories, world-writable (777)
#   static                             - 755
#   data                               - 755 via the /app-wide chmod
#   /tmp/data_extractor_temp           - temp area, world-writable (777)
#
# /tmp itself is set to 1777 rather than being swept up in "chmod -R 777":
# a plain 777 clears the sticky bit, which would let any user delete or
# rename other users' files in /tmp.
#
# app.py and the config/utils/workflow/models directories must exist in the
# build context; chmod fails (and so does the build) if any of them is missing.
RUN mkdir -p \
        temp logs uploads downloads cache \
        static \
        data \
        /tmp/data_extractor_temp \
    && chmod -R 755 /app \
    && chmod -R 777 temp logs uploads downloads cache /tmp/data_extractor_temp \
    && chmod 1777 /tmp \
    && chmod -R 755 static \
    && chmod 755 app.py \
    && chmod -R 755 config utils workflow models \
    && chown -R app:app /app /tmp/data_extractor_temp
# ---- Runtime configuration -------------------------------------------------

# Module search root plus the address/port Gradio binds to.
ENV PYTHONPATH=/app \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860
# ENV GRADIO_SHARE=false
# ENV GRADIO_DEBUG=false

# Matplotlib: non-interactive backend for headless use, config dir under /tmp.
ENV MPLBACKEND=Agg \
    MPLCONFIGDIR=/tmp/mpl_cache

# Concurrency limits for multi-user access.
ENV GRADIO_QUEUE_DEFAULT_CONCURRENCY=10 \
    GRADIO_MAX_THREADS=20

# WebSocket listener address and port.
ENV WEBSOCKET_HOST=0.0.0.0 \
    WEBSOCKET_PORT=8765

# Session and temporary-file settings.
# NOTE(review): SESSION_TIMEOUT is presumably in seconds - confirm in the app.
ENV TEMP_DIR=/tmp/data_extractor_temp \
    SESSION_TIMEOUT=1800 \
    MAX_FILE_SIZE_MB=50

# Default model per agent role; meant to be overridden by user-supplied
# environment variables.
ENV COORDINATOR_MODEL=gemini-2.5-pro \
    DATA_EXTRACTOR_MODEL=gemini-2.5-pro \
    DATA_ARRANGER_MODEL=gemini-2.5-pro \
    CODE_GENERATOR_MODEL=gemini-2.5-pro

# Interpreter hardening: do not auto-prepend the script directory to
# sys.path (PYTHONPATH above still lists /app), and randomize str hashing.
ENV PYTHONSAFEPATH=1 \
    PYTHONHASHSEED=random

# Ports the container listens on: 7860 (Gradio) and 8765 (WebSocket).
EXPOSE 7860 8765
# Container health probe: after a 60 s start-up grace period, request the
# Gradio root page every 30 s (30 s timeout per attempt); three consecutive
# failures mark the container unhealthy.
HEALTHCHECK --start-period=60s --interval=30s --timeout=30s --retries=3 \
    CMD curl --fail http://localhost:7860/ || exit 1

# Remaining build steps and the container's default process run as root
# (chosen by the author for Hugging Face Spaces compatibility).
USER root
# Generate /app/start.sh, the container entry script.
#
# The script is written with printf '%s\n', which emits each argument on its
# own line and performs no escape processing on the arguments. The previous
# form, echo '...\n...', only produced a multi-line file when /bin/sh's echo
# happened to expand "\n" - behaviour that differs between shells. In that
# form the script's own "# ..." comment lines also began a Dockerfile line
# with "#", so the Dockerfile parser removed them as Dockerfile comments.
# Here every script line starts with a quote, so nothing is dropped.
#
# Everything is single-quoted, so $(python --version) and $TEMP_DIR are
# written literally and expanded when the script runs, not at build time.
#
# What the script does at container start:
#   - set -e: abort on the first failing command
#   - print a start-up banner
#   - create "$TEMP_DIR" and /tmp/mpl_cache and make them world-writable
#   - exec python app.py, replacing the shell with the Python process
RUN printf '%s\n' \
        '#!/bin/bash' \
        'set -e' \
        'echo "π Starting Data Extractor Multi-User Application..."' \
        'echo "π Python version: $(python --version)"' \
        'echo "π Server: 0.0.0.0:7860"' \
        'echo "π₯ Multi-user concurrency: Enabled"' \
        'echo "π Session isolation: Active"' \
        'echo "πΎ Temp directory: $TEMP_DIR"' \
        '' \
        '# Create runtime directories' \
        'mkdir -p "$TEMP_DIR"' \
        'mkdir -p /tmp/mpl_cache' \
        'chmod 777 "$TEMP_DIR" /tmp/mpl_cache' \
        '' \
        '# Start the application' \
        'exec python app.py' \
        > /app/start.sh \
    && chmod +x /app/start.sh

# Default container command (exec form, so start.sh is PID 1).
CMD ["/app/start.sh"]
|