File size: 4,865 Bytes
cfeb3a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17e3d1d
 
cfeb3a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Official Python 3.11 slim image, pinned to the Debian "bookworm" release.
# The bare `3.11-slim` tag moves to a new Debian release whenever upstream
# rebases it, which can change which apt package names resolve in the
# system-dependency step of this file.
FROM python:3.11-slim-bookworm

# Python / pip behaviour that should hold both at build time and at runtime:
#   PYTHONUNBUFFERED              - stdout/stderr are written immediately (useful for container logs)
#   PYTHONDONTWRITEBYTECODE       - do not write .pyc files
#   PIP_NO_CACHE_DIR              - pip keeps no download/wheel cache
#   PIP_DISABLE_PIP_VERSION_CHECK - pip skips its self-update check
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Build-time only: keeps apt/debconf from prompting during `RUN apt-get ...`.
# Declared as ARG (visible to later RUN steps in this stage) rather than ENV so
# it is not baked into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Dedicated `app` account with a home directory and bash as its login shell.
# It is only created here; the container is still run as root for HF Spaces
# compatibility.
RUN useradd -m -s /bin/bash app

# Working directory for the instructions that follow and for the running container
WORKDIR /app

# Install system dependencies (alphabetical for easy diffing):
#   - build tools:            build-essential, cmake, g++, gcc, make, pkg-config
#   - network / download:     curl (also used by the HEALTHCHECK), git, wget
#   - development headers:    libffi-dev, libssl-dev
#   - image processing:       libfreetype-dev, libjpeg-dev, libopenjp2-7-dev, libpng-dev, libtiff-dev
#   - document processing:    libxml2-dev, libxslt1-dev, zlib1g-dev
#   - OCR / PDF:              poppler-utils, tesseract-ocr, tesseract-ocr-eng
#   - SQLite:                 libsqlite3-dev, sqlite3
#
# libfreetype-dev and libtiff-dev are the current package names; the previous
# libfreetype6-dev / libtiff5-dev are legacy transitional names that are not
# guaranteed to exist on newer Debian releases behind the base image tag.
#
# update + install + cleanup stay in one RUN so the package index is never
# stale and the apt caches never end up in a layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        libffi-dev \
        libfreetype-dev \
        libjpeg-dev \
        libopenjp2-7-dev \
        libpng-dev \
        libsqlite3-dev \
        libssl-dev \
        libtiff-dev \
        libxml2-dev \
        libxslt1-dev \
        make \
        pkg-config \
        poppler-utils \
        sqlite3 \
        tesseract-ocr \
        tesseract-ocr-eng \
        wget \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/*

# Upgrade the packaging toolchain, then persist pip defaults in pip's global
# config (no cache directory, no self-update check).
#
# Deliberately NOT configured: `global.trusted-host` for pypi.org /
# files.pythonhosted.org / pypi.python.org. Marking those hosts as trusted
# tells pip to accept them without valid HTTPS, which removes protection
# against tampered packages for every later `pip install` in this image.
RUN python -m pip install --upgrade pip setuptools wheel \
    && pip config set global.no-cache-dir true \
    && pip config set global.disable-pip-version-check true

# Copy only the requirements file first so the dependency layer below is
# reused from the build cache until requirements.txt itself changes.
COPY requirements.txt .

# Install the project's Python dependencies, then the extra packages this
# image adds on top: gunicorn, psutil and uvloop.
# The former trailing `pip list --outdated` was removed: it only printed a
# report, queried the package index for every installed distribution, and made
# the layer's result depend on index availability at build time.
RUN pip install --no-cache-dir --upgrade -r requirements.txt \
    && pip install --no-cache-dir \
        gunicorn \
        psutil \
        uvloop

# Copy the rest of the application code
COPY . .

# Create the working directories and set permissions/ownership in ONE layer
# (each separate recursive chmod/chown layer would store the /app tree again).
#
#   1. mkdir      - relative names are created under /app (the WORKDIR);
#                   /tmp/data_extractor_temp matches TEMP_DIR.
#   2. chmod 755  - baseline for everything under /app. This already covers
#                   app.py, static, config, utils, workflow and models, so they
#                   need no separate chmod.
#   3. chmod 777  - directories the application writes to at runtime.
#   4. chmod 1777 - /tmp keeps its sticky bit. A recursive `chmod 777 /tmp`
#                   would clear it and let any user delete other users' files.
#   5. chown      - hand /app and the temp directory to the `app` account
#                   (the container itself still runs as root).
RUN mkdir -p \
        cache \
        data \
        downloads \
        logs \
        static \
        temp \
        uploads \
        /tmp/data_extractor_temp \
    && chmod -R 755 /app \
    && chmod -R 777 temp logs uploads downloads cache /tmp/data_extractor_temp \
    && chmod 1777 /tmp \
    && chown -R app:app /app /tmp/data_extractor_temp

# Runtime configuration baked into the image. Every value is a default that
# can be overridden when the container is started.

# Module search path and Gradio bind address/port
ENV PYTHONPATH=/app \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860
# ENV GRADIO_SHARE=false
# ENV GRADIO_DEBUG=false

# Matplotlib: non-interactive backend and a config/cache directory under /tmp
ENV MPLBACKEND=Agg \
    MPLCONFIGDIR=/tmp/mpl_cache

# Concurrency limits for multi-user access
# NOTE(review): presumably read by Gradio or the application - confirm the names
ENV GRADIO_QUEUE_DEFAULT_CONCURRENCY=10 \
    GRADIO_MAX_THREADS=20

# WebSocket listener address/port
ENV WEBSOCKET_HOST=0.0.0.0 \
    WEBSOCKET_PORT=8765

# Session and temporary-file settings (TEMP_DIR is also used by /app/start.sh)
# NOTE(review): SESSION_TIMEOUT looks like seconds - confirm against the app
ENV TEMP_DIR=/tmp/data_extractor_temp \
    SESSION_TIMEOUT=1800 \
    MAX_FILE_SIZE_MB=50

# Default AI model per role (will be overridden by user env vars)
ENV COORDINATOR_MODEL=gemini-2.5-pro \
    DATA_EXTRACTOR_MODEL=gemini-2.5-pro \
    DATA_ARRANGER_MODEL=gemini-2.5-pro \
    CODE_GENERATOR_MODEL=gemini-2.5-pro

# Interpreter hardening: PYTHONSAFEPATH stops Python from prepending the
# script's directory to sys.path (PYTHONPATH=/app above still makes the
# project importable); PYTHONHASHSEED=random keeps hash randomization on.
ENV PYTHONSAFEPATH=1 \
    PYTHONHASHSEED=random

# Document the listening ports: 7860 (GRADIO_SERVER_PORT) and 8765 (WEBSOCKET_PORT)
EXPOSE 7860 8765

# Health check for container monitoring.
# Probes the web UI on the configured Gradio port (falls back to 7860 when
# GRADIO_SERVER_PORT is unset). The command runs through the shell when the
# container is running, so a port overridden at `docker run` is honoured.
#   -f  fail on HTTP error status     -sS  no progress meter, but print errors
#   -o /dev/null  discard the page body so it is not stored in the health log
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
    CMD curl -fsS -o /dev/null "http://localhost:${GRADIO_SERVER_PORT:-7860}/" || exit 1

# Run as root for Hugging Face Spaces compatibility.
# No earlier instruction in the visible Dockerfile selects another user, so
# this makes the runtime user explicit rather than changing it.
# NOTE(review): the `app` account created earlier is therefore not used to run
# the process - confirm this is intended.
USER root

# Generate /app/start.sh, the container's entry script.
#
# Each single-quoted argument below becomes one line of the script:
# `printf '%s\n'` prints its arguments verbatim, one per line. Unlike
# `echo '...\n...'` this does not depend on the build shell's echo expanding
# backslash escapes. Single quotes keep $TEMP_DIR, $(python --version) etc.
# unexpanded at build time, so they are evaluated when the container starts.
#
# What the script does at start-up:
#   - prints a short banner (Python version, bind address, temp directory);
#   - creates $TEMP_DIR and /tmp/mpl_cache (MPLCONFIGDIR) and makes them
#     world-writable;
#   - `exec`s the application so Python replaces the shell as the container's
#     main process and receives stop signals directly.
# `set -e` aborts start-up if any of the preparation steps fails.
RUN printf '%s\n' \
        '#!/bin/bash' \
        'set -e' \
        'echo "🚀 Starting Data Extractor Multi-User Application..."' \
        'echo "📊 Python version: $(python --version)"' \
        'echo "🌐 Server: ${GRADIO_SERVER_NAME:-0.0.0.0}:${GRADIO_SERVER_PORT:-7860}"' \
        'echo "👥 Multi-user concurrency: Enabled"' \
        'echo "🔒 Session isolation: Active"' \
        'echo "💾 Temp directory: $TEMP_DIR"' \
        '' \
        '# Create runtime directories' \
        'mkdir -p "$TEMP_DIR"' \
        'mkdir -p /tmp/mpl_cache' \
        'chmod 777 "$TEMP_DIR" /tmp/mpl_cache' \
        '' \
        '# Start the application' \
        'exec python app.py' \
        > /app/start.sh \
    && chmod +x /app/start.sh

# The command to run when the container starts (exec form: no wrapping shell)
CMD ["/app/start.sh"]