levalencia committed on
Commit
98aae70
·
1 Parent(s): 3db2fae

Enhance Dockerfile and Streamlit app for comprehensive environment setup and permission testing

Browse files

- Implemented extensive environment variable configurations to redirect cache directories for Hugging Face and other ML libraries to writable locations.
- Added a startup script in the Dockerfile to ensure environment variables are set before the application runs.
- Updated the Streamlit app to configure environment variables immediately and perform necessary directory creation with error handling.
- Introduced a test script to verify environment setup and permissions, ensuring the application runs smoothly in Hugging Face Spaces.

Dockerfile CHANGED
@@ -2,6 +2,52 @@ FROM python:3.9-slim
2
 
3
  WORKDIR /app
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
@@ -10,7 +56,7 @@ RUN apt-get update && apt-get install -y \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # Create necessary directories with proper permissions
13
- RUN mkdir -p /app/.streamlit /tmp/docling_temp /tmp/easyocr_models /tmp/cache /tmp/config /tmp/data /tmp/huggingface /tmp/huggingface_cache /tmp/transformers_cache /tmp/datasets_cache /tmp/torch /tmp/tensorflow /tmp/keras && \
14
  chmod 755 /app/.streamlit && \
15
  chmod 777 /tmp/docling_temp && \
16
  chmod 777 /tmp/easyocr_models && \
@@ -23,7 +69,10 @@ RUN mkdir -p /app/.streamlit /tmp/docling_temp /tmp/easyocr_models /tmp/cache /t
23
  chmod 777 /tmp/datasets_cache && \
24
  chmod 777 /tmp/torch && \
25
  chmod 777 /tmp/tensorflow && \
26
- chmod 777 /tmp/keras
 
 
 
27
 
28
  COPY requirements.txt ./
29
  COPY src/ ./src/
@@ -55,4 +104,51 @@ EXPOSE 8501
55
 
56
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
57
 
58
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Set environment variables to prevent root filesystem access
6
+ ENV TEMP_DIR=/tmp/docling_temp
7
+ ENV HOME=/tmp/docling_temp
8
+ ENV USERPROFILE=/tmp/docling_temp
9
+ ENV TMPDIR=/tmp/docling_temp
10
+ ENV TEMP=/tmp/docling_temp
11
+ ENV TMP=/tmp/docling_temp
12
+
13
+ # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
14
+ ENV HF_HOME=/tmp/docling_temp/huggingface
15
+ ENV HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache
16
+ ENV HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache
17
+ ENV TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache
18
+ ENV HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache
19
+ ENV DIFFUSERS_CACHE=/tmp/docling_temp/diffusers_cache
20
+ ENV ACCELERATE_CACHE=/tmp/docling_temp/accelerate_cache
21
+
22
+ # Additional Hugging Face specific variables
23
+ ENV HF_HUB_DISABLE_TELEMETRY=1
24
+ ENV HF_HUB_DISABLE_IMPLICIT_TOKEN=1
25
+ ENV HF_HUB_OFFLINE=0
26
+
27
+ # Other ML libraries
28
+ ENV TORCH_HOME=/tmp/docling_temp/torch
29
+ ENV TENSORFLOW_HOME=/tmp/docling_temp/tensorflow
30
+ ENV KERAS_HOME=/tmp/docling_temp/keras
31
+
32
+ # XDG directories
33
+ ENV XDG_CACHE_HOME=/tmp/docling_temp/cache
34
+ ENV XDG_CONFIG_HOME=/tmp/docling_temp/config
35
+ ENV XDG_DATA_HOME=/tmp/docling_temp/data
36
+
37
+ # EasyOCR configuration
38
+ ENV EASYOCR_MODULE_PATH=/tmp/docling_temp/easyocr_models
39
+
40
+ # Additional cache directories
41
+ ENV CACHE_DIR=/tmp/docling_temp/cache
42
+ ENV MODEL_CACHE_DIR=/tmp/docling_temp/models
43
+ ENV CACHE=/tmp/docling_temp/cache
44
+ ENV MODELS=/tmp/docling_temp/models
45
+ ENV DATA=/tmp/docling_temp/data
46
+ ENV CONFIG=/tmp/docling_temp/config
47
+
48
+ # Python path
49
+ ENV PYTHONPATH=/tmp/docling_temp
50
+
51
  RUN apt-get update && apt-get install -y \
52
  build-essential \
53
  curl \
 
56
  && rm -rf /var/lib/apt/lists/*
57
 
58
  # Create necessary directories with proper permissions
59
+ RUN mkdir -p /app/.streamlit /tmp/docling_temp /tmp/easyocr_models /tmp/cache /tmp/config /tmp/data /tmp/huggingface /tmp/huggingface_cache /tmp/transformers_cache /tmp/datasets_cache /tmp/torch /tmp/tensorflow /tmp/keras /tmp/accelerate_cache /tmp/diffusers_cache /tmp/models && \
60
  chmod 755 /app/.streamlit && \
61
  chmod 777 /tmp/docling_temp && \
62
  chmod 777 /tmp/easyocr_models && \
 
69
  chmod 777 /tmp/datasets_cache && \
70
  chmod 777 /tmp/torch && \
71
  chmod 777 /tmp/tensorflow && \
72
+ chmod 777 /tmp/keras && \
73
+ chmod 777 /tmp/accelerate_cache && \
74
+ chmod 777 /tmp/diffusers_cache && \
75
+ chmod 777 /tmp/models
76
 
77
  COPY requirements.txt ./
78
  COPY src/ ./src/
 
104
 
105
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
106
 
107
+ # Create a startup script to ensure environment variables are set
108
+ RUN echo '#!/bin/bash' > /app/start.sh && \
109
+ echo 'export TEMP_DIR=/tmp/docling_temp' >> /app/start.sh && \
110
+ echo 'export HOME=/tmp/docling_temp' >> /app/start.sh && \
111
+ echo 'export USERPROFILE=/tmp/docling_temp' >> /app/start.sh && \
112
+ echo 'export TMPDIR=/tmp/docling_temp' >> /app/start.sh && \
113
+ echo 'export TEMP=/tmp/docling_temp' >> /app/start.sh && \
114
+ echo 'export TMP=/tmp/docling_temp' >> /app/start.sh && \
115
+ echo 'export HF_HOME=/tmp/docling_temp/huggingface' >> /app/start.sh && \
116
+ echo 'export HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache' >> /app/start.sh && \
117
+ echo 'export HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache' >> /app/start.sh && \
118
+ echo 'export TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache' >> /app/start.sh && \
119
+ echo 'export HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache' >> /app/start.sh && \
120
+ echo 'export DIFFUSERS_CACHE=/tmp/docling_temp/diffusers_cache' >> /app/start.sh && \
121
+ echo 'export ACCELERATE_CACHE=/tmp/docling_temp/accelerate_cache' >> /app/start.sh && \
122
+ echo 'export HF_HUB_DISABLE_TELEMETRY=1' >> /app/start.sh && \
123
+ echo 'export HF_HUB_DISABLE_IMPLICIT_TOKEN=1' >> /app/start.sh && \
124
+ echo 'export HF_HUB_OFFLINE=0' >> /app/start.sh && \
125
+ echo 'export TORCH_HOME=/tmp/docling_temp/torch' >> /app/start.sh && \
126
+ echo 'export TENSORFLOW_HOME=/tmp/docling_temp/tensorflow' >> /app/start.sh && \
127
+ echo 'export KERAS_HOME=/tmp/docling_temp/keras' >> /app/start.sh && \
128
+ echo 'export XDG_CACHE_HOME=/tmp/docling_temp/cache' >> /app/start.sh && \
129
+ echo 'export XDG_CONFIG_HOME=/tmp/docling_temp/config' >> /app/start.sh && \
130
+ echo 'export XDG_DATA_HOME=/tmp/docling_temp/data' >> /app/start.sh && \
131
+ echo 'export EASYOCR_MODULE_PATH=/tmp/docling_temp/easyocr_models' >> /app/start.sh && \
132
+ echo 'export CACHE_DIR=/tmp/docling_temp/cache' >> /app/start.sh && \
133
+ echo 'export MODEL_CACHE_DIR=/tmp/docling_temp/models' >> /app/start.sh && \
134
+ echo 'export CACHE=/tmp/docling_temp/cache' >> /app/start.sh && \
135
+ echo 'export MODELS=/tmp/docling_temp/models' >> /app/start.sh && \
136
+ echo 'export DATA=/tmp/docling_temp/data' >> /app/start.sh && \
137
+ echo 'export CONFIG=/tmp/docling_temp/config' >> /app/start.sh && \
138
+ echo 'export PYTHONPATH=/tmp/docling_temp' >> /app/start.sh && \
139
+ echo 'echo "Environment variables set for Hugging Face Hub cache directories"' >> /app/start.sh && \
140
+ echo 'echo "HF_HUB_CACHE: $HF_HUB_CACHE"' >> /app/start.sh && \
141
+ echo 'echo "HF_CACHE_HOME: $HF_CACHE_HOME"' >> /app/start.sh && \
142
+ echo 'echo "TEMP_DIR: $TEMP_DIR"' >> /app/start.sh && \
143
+ echo 'echo "Running environment test..."' >> /app/start.sh && \
144
+ echo 'python test_permissions.py' >> /app/start.sh && \
145
+ echo 'if [ $? -eq 0 ]; then' >> /app/start.sh && \
146
+ echo ' echo "Environment test passed, starting Streamlit app..."' >> /app/start.sh && \
147
+ echo ' exec streamlit run src/streamlit_app.py --server.port=8501 --server.address=0.0.0.0' >> /app/start.sh && \
148
+ echo 'else' >> /app/start.sh && \
149
+ echo ' echo "Environment test failed, exiting..."' >> /app/start.sh && \
150
+ echo ' exit 1' >> /app/start.sh && \
151
+ echo 'fi' >> /app/start.sh && \
152
+ chmod +x /app/start.sh
153
+
154
+ ENTRYPOINT ["/app/start.sh"]
README.md CHANGED
@@ -202,152 +202,112 @@ print(f"Removing {len(result['indices_to_remove'])} elements")
202
  4. Updates document structure
203
  5. Returns redacted JSON
204
 
205
- ### UI and Visualization Functions
206
 
207
- #### `create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str`
208
- **Purpose**: Generates HTML content for side-by-side diff view.
209
 
210
- **Parameters**:
211
- - `original_text`: Original document content
212
- - `redacted_text`: Redacted document content
213
- - `view_type`: 'original' or 'redacted'
214
-
215
- **Returns**:
216
- - HTML string for diff display
217
-
218
- **Features**:
219
- - Text normalization (headers, quotes)
220
- - Synchronized scrolling
221
- - Color-coded highlighting
222
- - Git-style diff visualization
223
 
224
- #### `save_uploaded_file(uploaded_file, filename) -> str`
225
- **Purpose**: Safely saves uploaded files to temporary directory.
226
 
227
- **Parameters**:
228
- - `uploaded_file`: Streamlit uploaded file object
229
- - `filename`: Target filename
230
 
231
- **Returns**:
232
- - Path to saved temporary file
 
 
233
 
234
- **Features**:
235
- - File pointer reset handling
236
- - Temporary directory management
237
- - Error handling and logging
 
238
 
239
- ## 🔄 Sequence Diagram
 
 
 
240
 
 
241
  ```
242
- User Uploads PDF
244
- │
245
- ▼
246
- ┌─────────────────┐
247
- │ Streamlit App │
248
- │ - File Upload │
249
- │ - Validation │
250
- └─────────────────┘
251
- │
252
- ▼
253
- ┌─────────────────┐
254
- │ save_uploaded_ │
255
- │ file() │
256
- │ - Reset pointer │
257
- │ - Save to temp │
258
- └─────────────────┘
259
- │
260
- ▼
261
- ┌─────────────────┐
262
- │ DocumentProcessor│
263
- │ .process() │
264
- │ - Docling conv │
265
- │ - Export JSON │
266
- │ - Export MD │
267
- └─────────────────┘
268
- │
269
- ▼
270
- ┌─────────────────┐
271
- │ ReasoningSection│
272
- │ Extractor │
273
- │ .remove_sections│
274
- │ _from_json() │
275
- └─────────────────┘
276
- │
277
- ▼
278
- ┌─────────────────┐
279
- │ AzureO1Medication│
280
- │ Extractor │
281
- │ .extract_medicat│
282
- │ ion_sections() │
283
- │ - API call │
284
- │ - JSON parsing │
285
- └─────────────────┘
286
- │
287
- ▼
288
- ┌─────────────────┐
289
- │ DocumentProcessor│
290
- │ _export_redacted│
291
- │ _markdown() │
292
- │ - Filter texts │
293
- │ - Reconstruct MD│
294
- └─────────────────┘
295
- │
296
- ▼
297
- ┌─────────────────┐
298
- │ Streamlit App │
299
- │ - Store results │
300
- │ - Update UI │
301
- │ - Show diff │
302
- └─────────────────┘
303
- │
304
- ▼
305
- ┌─────────────────┐
306
- │ create_diff_ │
307
- │ content() │
308
- │ - Normalize text│
309
- │ - Generate HTML │
310
- │ - Sync scrolling│
311
- └─────────────────┘
311
  ```
312
 
313
- ## 🚀 Setup and Installation
314
-
315
- ### Prerequisites
316
- - Python 3.11+
317
- - Azure OpenAI account with API access
318
- - Docling library
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  ### Environment Variables
321
- Create a `.env` file with:
322
- ```env
323
- AZURE_OPENAI_ENDPOINT=your_endpoint
324
- AZURE_OPENAI_KEY=your_api_key
325
- AZURE_OPENAI_VERSION=2024-12-01-preview
326
- AZURE_OPENAI_DEPLOYMENT=your_deployment_name
327
- ```
328
 
329
- ### Installation
330
  ```bash
331
- # Clone repository
332
- git clone <repository-url>
333
- cd docling
334
-
335
- # Install dependencies
336
- pip install -r requirements.txt
337
-
338
- # Run application
339
- streamlit run src/streamlit_app.py
 
 
 
 
 
 
 
 
 
 
340
  ```
341
 
342
- ## 🔧 Configuration
 
 
 
 
 
343
 
344
- ### Azure OpenAI Settings
345
- - **Model**: O1-mini (recommended for medication extraction)
346
- - **Max Tokens**: 100,000 (for large documents)
347
- - **Temperature**: 0 (for consistent results)
348
 
349
- ### Processing Settings
350
- - **Max Elements to Remove**: 10 (safety limit)
351
- - **Temp Directory**: `temp_files/` (auto-created)
352
- - **Log Level**: INFO (configurable)
353
 
 
202
  4. Updates document structure
203
  5. Returns redacted JSON
204
 
205
+ ## 🚨 Troubleshooting
206
 
207
+ ### Permission Error: `[Errno 13] Permission denied: '/.cache'`
 
208
 
209
+ **Problem**: When deploying to Hugging Face Spaces, you may encounter a permission error where the application tries to create cache directories in the root filesystem (`/.cache`).
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
+ **Root Cause**: Hugging Face Hub and other ML libraries try to create cache directories in the root filesystem by default, but containers in Hugging Face Spaces don't have permission to write to the root directory.
 
212
 
213
+ **Solution**: This application includes comprehensive environment variable configuration to redirect all cache directories to writable locations:
 
 
214
 
215
+ 1. **Environment Variables**: All cache directories are redirected to `/tmp/docling_temp/`
216
+ 2. **Lazy Initialization**: DocumentConverter is initialized lazily to ensure environment variables are set first
217
+ 3. **Startup Script**: Docker container uses a startup script that sets all necessary environment variables
218
+ 4. **Test Script**: `test_permissions.py` verifies the environment setup
219
 
220
+ **Files Modified**:
221
+ - `src/streamlit_app.py`: Environment variables set at the very beginning
222
+ - `src/processing/document_processor.py`: Lazy initialization of DocumentConverter
223
+ - `Dockerfile`: Environment variables and startup script
224
+ - `test_permissions.py`: Environment verification script
225
 
226
+ **Testing**: Run the test script to verify the environment:
227
+ ```bash
228
+ python test_permissions.py
229
+ ```
230
 
231
+ **Expected Output**:
232
  ```
233
+ ✅ ALL TESTS PASSED
234
+ 🎉 All tests passed! The environment is ready for Docling.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  ```
236
 
237
+ ### Other Common Issues
238
+
239
+ #### Memory Issues
240
+ - **Problem**: Large PDF files may cause memory issues
241
+ - **Solution**: The application includes automatic cleanup of temporary files and memory management
242
+
243
+ #### Azure OpenAI Configuration
244
+ - **Problem**: Missing or incorrect Azure OpenAI credentials
245
+ - **Solution**: Ensure `.env` file contains:
246
+ ```
247
+ AZURE_OPENAI_ENDPOINT=your_endpoint
248
+ AZURE_OPENAI_KEY=your_key
249
+ AZURE_OPENAI_VERSION=your_version
250
+ AZURE_OPENAI_DEPLOYMENT=your_deployment
251
+ ```
252
+
253
+ #### File Upload Issues
254
+ - **Problem**: Files not uploading or processing
255
+ - **Solution**: Check file size limits and ensure PDF format is supported
256
+
257
+ ## 🔧 Development and Deployment
258
+
259
+ ### Local Development
260
+ 1. Clone the repository
261
+ 2. Install dependencies: `pip install -r requirements.txt`
262
+ 3. Set up environment variables in `.env`
263
+ 4. Run the test script: `python test_permissions.py`
264
+ 5. Start the app: `streamlit run src/streamlit_app.py`
265
+
266
+ ### Hugging Face Spaces Deployment
267
+ 1. Push code to repository
268
+ 2. Ensure `Dockerfile` is present
269
+ 3. Set environment variables in Spaces settings
270
+ 4. Deploy and monitor logs for any issues
271
 
272
  ### Environment Variables
273
+ The application uses these environment variables to control cache directories:
 
 
 
 
 
 
274
 
 
275
  ```bash
276
+ # Core temp directory
277
+ TEMP_DIR=/tmp/docling_temp
278
+
279
+ # Hugging Face Hub
280
+ HF_HOME=/tmp/docling_temp/huggingface
281
+ HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache
282
+ HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache
283
+
284
+ # ML Libraries
285
+ TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache
286
+ HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache
287
+ TORCH_HOME=/tmp/docling_temp/torch
288
+ TENSORFLOW_HOME=/tmp/docling_temp/tensorflow
289
+ KERAS_HOME=/tmp/docling_temp/keras
290
+
291
+ # XDG Directories
292
+ XDG_CACHE_HOME=/tmp/docling_temp/cache
293
+ XDG_CONFIG_HOME=/tmp/docling_temp/config
294
+ XDG_DATA_HOME=/tmp/docling_temp/data
295
  ```
296
 
297
+ ## 📊 Performance and Monitoring
298
+
299
+ ### Memory Management
300
+ - Automatic cleanup of temporary files
301
+ - Session state management
302
+ - File size monitoring
303
 
304
+ ### Logging
305
+ - Comprehensive logging throughout the application
306
+ - In-memory log capture for UI display
307
+ - Error tracking and debugging information
308
 
309
+ ### Caching
310
+ - Hugging Face model caching in temp directories
311
+ - Document processing result caching
312
+ - Session state persistence
313
 
src/processing/document_processor.py CHANGED
@@ -4,11 +4,13 @@ import logging
4
  import json
5
  from dataclasses import dataclass
6
  from typing import Optional
7
- from docling.document_converter import DocumentConverter
 
 
8
  from processing.sections import SectionExtractor
9
 
10
- # Initialize a Docling converter globally (can be reused for multiple docs)
11
- _docling_converter = DocumentConverter()
12
 
13
  logger = logging.getLogger(__name__) # Logger for this module
14
 
@@ -27,16 +29,30 @@ class DocumentProcessor:
27
  """
28
  Initialize with an optional SectionExtractor for removing specific sections.
29
  If None, no redaction will be performed (original structure only).
30
- The Docling DocumentConverter is taken as a dependency (global or injected).
31
  """
32
  self.section_extractor = section_extractor
33
- # Allow dependency injection of converter if needed (use global by default)
34
- self.converter = _docling_converter
 
 
 
 
 
 
 
 
 
 
35
 
36
  def process(self, file_path: str) -> DocumentResult:
37
  """Parse the document and optionally remove specified sections. Returns a DocumentResult."""
38
  logger.info(f"Starting processing for file: {file_path}")
39
  start_time = time.time()
 
 
 
 
40
  # Convert the document using Docling
41
  conv_result = self.converter.convert(file_path)
42
  elapsed = time.time() - start_time
@@ -100,6 +116,28 @@ class DocumentProcessor:
100
  logger.info(f"Finished processing for file: {file_path}")
101
  return result
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def _export_redacted_markdown(self, document, redacted_json):
104
  """Export redacted markdown using the redacted JSON structure."""
105
  # Simply convert the redacted JSON back to markdown
 
4
  import json
5
  from dataclasses import dataclass
6
  from typing import Optional
7
+
8
+ # Don't import DocumentConverter at module level to prevent early initialization
9
+ # from docling.document_converter import DocumentConverter
10
  from processing.sections import SectionExtractor
11
 
12
+ # Remove global converter initialization - will be done lazily
13
+ # _docling_converter = DocumentConverter()
14
 
15
  logger = logging.getLogger(__name__) # Logger for this module
16
 
 
29
  """
30
  Initialize with an optional SectionExtractor for removing specific sections.
31
  If None, no redaction will be performed (original structure only).
32
+ The Docling DocumentConverter will be initialized lazily when needed.
33
  """
34
  self.section_extractor = section_extractor
35
+ self._converter = None # Lazy initialization
36
+
37
+ @property
38
+ def converter(self):
39
+ """Lazy initialization of DocumentConverter to prevent early Hugging Face Hub initialization."""
40
+ if self._converter is None:
41
+ # Import here to ensure environment variables are set first
42
+ from docling.document_converter import DocumentConverter
43
+ logger.info("Initializing Docling DocumentConverter...")
44
+ self._converter = DocumentConverter()
45
+ logger.info("Docling DocumentConverter initialized successfully")
46
+ return self._converter
47
 
48
  def process(self, file_path: str) -> DocumentResult:
49
  """Parse the document and optionally remove specified sections. Returns a DocumentResult."""
50
  logger.info(f"Starting processing for file: {file_path}")
51
  start_time = time.time()
52
+
53
+ # Ensure environment variables are set before processing
54
+ self._ensure_cache_directories()
55
+
56
  # Convert the document using Docling
57
  conv_result = self.converter.convert(file_path)
58
  elapsed = time.time() - start_time
 
116
  logger.info(f"Finished processing for file: {file_path}")
117
  return result
118
 
119
+ def _ensure_cache_directories(self):
120
+ """Ensure all necessary cache directories exist before processing."""
121
+ cache_dirs = [
122
+ os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
123
+ os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
124
+ os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
125
+ os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
126
+ os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
127
+ os.environ.get('DIFFUSERS_CACHE', '/tmp/docling_temp/diffusers_cache'),
128
+ os.environ.get('ACCELERATE_CACHE', '/tmp/docling_temp/accelerate_cache'),
129
+ os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
130
+ os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
131
+ os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
132
+ ]
133
+
134
+ for cache_dir in cache_dirs:
135
+ try:
136
+ os.makedirs(cache_dir, exist_ok=True)
137
+ logger.debug(f"Ensured cache directory exists: {cache_dir}")
138
+ except Exception as e:
139
+ logger.warning(f"Could not create cache directory {cache_dir}: {e}")
140
+
141
  def _export_redacted_markdown(self, document, redacted_json):
142
  """Export redacted markdown using the redacted JSON structure."""
143
  # Simply convert the redacted JSON back to markdown
src/streamlit_app.py CHANGED
@@ -1,21 +1,9 @@
1
- import streamlit as st
2
- import logging
3
- import os
4
- import tempfile
5
- import shutil
6
- from processing.document_processor import DocumentProcessor
7
- from processing.sections import ReasoningSectionExtractor
8
- from utils.logging_utils import get_log_handler
9
- from dotenv import load_dotenv
10
- import sys
11
- import html
12
- import difflib
13
- import re
14
- import time
15
-
16
  # Set environment variables IMMEDIATELY to prevent root filesystem access
17
  # This must happen before any other imports or operations
18
 
 
 
 
19
  # Get a writable temp directory first
20
  try:
21
  TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
@@ -44,11 +32,19 @@ os.environ.update({
44
  'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
45
  'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
46
 
47
- # Hugging Face Hub configuration
48
  'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
49
  'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
 
50
  'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
51
  'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
 
 
 
 
 
 
 
52
 
53
  # Other ML libraries
54
  'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
@@ -69,13 +65,6 @@ os.environ.update({
69
  'MODELS': os.path.join(TEMP_DIR, 'models'),
70
  'DATA': os.path.join(TEMP_DIR, 'data'),
71
  'CONFIG': os.path.join(TEMP_DIR, 'config'),
72
-
73
- # Specific cache overrides
74
- 'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
75
- 'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
76
- 'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
77
- 'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
78
- 'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
79
  })
80
 
81
  # Create all necessary directories
@@ -125,6 +114,20 @@ for directory in directories_to_create:
125
  except Exception as e:
126
  print(f"Warning: Could not create directory {directory}: {e}")
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # Configure logging early to avoid issues
129
  logging.basicConfig(
130
  level=logging.INFO,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Set environment variables IMMEDIATELY to prevent root filesystem access
2
  # This must happen before any other imports or operations
3
 
4
+ import os
5
+ import tempfile
6
+
7
  # Get a writable temp directory first
8
  try:
9
  TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
 
32
  'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
33
  'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
34
 
35
+ # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
36
  'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
37
  'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
38
+ 'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
39
  'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
40
  'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
41
+ 'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
42
+ 'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
43
+
44
+ # Additional Hugging Face specific variables
45
+ 'HF_HUB_DISABLE_TELEMETRY': '1',
46
+ 'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
47
+ 'HF_HUB_OFFLINE': '0',
48
 
49
  # Other ML libraries
50
  'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
 
65
  'MODELS': os.path.join(TEMP_DIR, 'models'),
66
  'DATA': os.path.join(TEMP_DIR, 'data'),
67
  'CONFIG': os.path.join(TEMP_DIR, 'config'),
 
 
 
 
 
 
 
68
  })
69
 
70
  # Create all necessary directories
 
114
  except Exception as e:
115
  print(f"Warning: Could not create directory {directory}: {e}")
116
 
117
+ # Now import the rest of the modules
118
+ import streamlit as st
119
+ import logging
120
+ import shutil
121
+ from processing.document_processor import DocumentProcessor
122
+ from processing.sections import ReasoningSectionExtractor
123
+ from utils.logging_utils import get_log_handler
124
+ from dotenv import load_dotenv
125
+ import sys
126
+ import html
127
+ import difflib
128
+ import re
129
+ import time
130
+
131
  # Configure logging early to avoid issues
132
  logging.basicConfig(
133
  level=logging.INFO,
test_permissions.py CHANGED
@@ -1,80 +1,174 @@
1
  #!/usr/bin/env python3
2
  """
3
- Test script to check permissions and directory creation in Hugging Face environment.
 
4
  """
5
 
6
  import os
7
  import tempfile
8
- import logging
9
 
10
- def test_permissions():
11
- """Test if we can create directories and files in various locations."""
12
- print("=== Testing Permissions ===")
 
 
13
 
14
- # Test 1: System temp directory
15
- print("\n1. Testing system temp directory...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  try:
17
- temp_dir = os.path.join(tempfile.gettempdir(), "docling_test")
18
  os.makedirs(temp_dir, exist_ok=True)
19
- test_file = os.path.join(temp_dir, "test.txt")
20
- with open(test_file, "w") as f:
21
- f.write("test")
22
  os.remove(test_file)
23
- os.rmdir(temp_dir)
24
- print(f"βœ… Success: {temp_dir}")
25
  except Exception as e:
26
- print(f"❌ Failed: {e}")
 
 
 
 
 
 
27
 
28
- # Test 2: Current working directory
29
- print("\n2. Testing current working directory...")
30
- try:
31
- cwd = os.getcwd()
32
- test_dir = os.path.join(cwd, "test_temp")
33
- os.makedirs(test_dir, exist_ok=True)
34
- test_file = os.path.join(test_dir, "test.txt")
35
- with open(test_file, "w") as f:
36
- f.write("test")
37
- os.remove(test_file)
38
- os.rmdir(test_dir)
39
- print(f"βœ… Success: {test_dir}")
40
- except Exception as e:
41
- print(f"❌ Failed: {e}")
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Test 3: /tmp directory
44
- print("\n3. Testing /tmp directory...")
45
- try:
46
- tmp_dir = "/tmp/docling_test"
47
- os.makedirs(tmp_dir, exist_ok=True)
48
- test_file = os.path.join(tmp_dir, "test.txt")
49
- with open(test_file, "w") as f:
50
- f.write("test")
51
- os.remove(test_file)
52
- os.rmdir(tmp_dir)
53
- print(f"βœ… Success: {tmp_dir}")
54
- except Exception as e:
55
- print(f"❌ Failed: {e}")
56
 
57
- # Test 4: Environment variables
58
- print("\n4. Testing environment variables...")
59
- env_vars = [
60
- 'STREAMLIT_SERVER_FILE_WATCHER_TYPE',
61
- 'STREAMLIT_SERVER_HEADLESS',
62
- 'STREAMLIT_BROWSER_GATHER_USAGE_STATS'
63
- ]
64
- for var in env_vars:
65
- value = os.environ.get(var, 'NOT_SET')
66
- print(f" {var}: {value}")
67
 
68
- # Test 5: Current directory permissions
69
- print("\n5. Testing current directory permissions...")
70
- try:
71
- cwd = os.getcwd()
72
- print(f" Current directory: {cwd}")
73
- print(f" Readable: {os.access(cwd, os.R_OK)}")
74
- print(f" Writable: {os.access(cwd, os.W_OK)}")
75
- print(f" Executable: {os.access(cwd, os.X_OK)}")
76
- except Exception as e:
77
- print(f"❌ Failed: {e}")
78
 
79
  if __name__ == "__main__":
80
- test_permissions()
 
1
  #!/usr/bin/env python3
2
  """
3
+ Test script to verify environment variables and cache directory permissions.
4
+ This should be run before the main application to ensure everything is set up correctly.
5
  """
6
 
7
  import os
8
  import tempfile
9
+ import sys
10
 
11
+ def test_environment_setup():
12
+ """Test that environment variables are set correctly."""
13
+ print("=" * 60)
14
+ print("Testing Environment Setup")
15
+ print("=" * 60)
16
 
17
+ # Check critical environment variables
18
+ critical_vars = [
19
+ 'HF_HOME',
20
+ 'HF_CACHE_HOME',
21
+ 'HF_HUB_CACHE',
22
+ 'TRANSFORMERS_CACHE',
23
+ 'HF_DATASETS_CACHE',
24
+ 'TEMP_DIR',
25
+ 'HOME',
26
+ 'TMPDIR'
27
+ ]
28
+
29
+ all_good = True
30
+ for var in critical_vars:
31
+ value = os.environ.get(var)
32
+ if value:
33
+ print(f"βœ… {var}: {value}")
34
+ else:
35
+ print(f"❌ {var}: NOT SET")
36
+ all_good = False
37
+
38
+ return all_good
39
+
40
+ def test_cache_directories():
41
+ """Test that cache directories can be created and accessed."""
42
+ print("\n" + "=" * 60)
43
+ print("Testing Cache Directory Access")
44
+ print("=" * 60)
45
+
46
+ cache_dirs = [
47
+ os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
48
+ os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
49
+ os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
50
+ os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
51
+ os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
52
+ os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
53
+ os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
54
+ os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
55
+ ]
56
+
57
+ all_good = True
58
+ for cache_dir in cache_dirs:
59
+ try:
60
+ os.makedirs(cache_dir, exist_ok=True)
61
+ # Test writing a file
62
+ test_file = os.path.join(cache_dir, 'test_write.txt')
63
+ with open(test_file, 'w') as f:
64
+ f.write('test')
65
+ os.remove(test_file)
66
+ print(f"βœ… {cache_dir}: WRITABLE")
67
+ except Exception as e:
68
+ print(f"❌ {cache_dir}: ERROR - {e}")
69
+ all_good = False
70
+
71
+ return all_good
72
+
73
+ def test_root_filesystem_access():
74
+ """Test that we cannot access root filesystem."""
75
+ print("\n" + "=" * 60)
76
+ print("Testing Root Filesystem Access Prevention")
77
+ print("=" * 60)
78
+
79
+ root_paths = [
80
+ '/.cache',
81
+ '/root',
82
+ '/etc/test',
83
+ '/var/test'
84
+ ]
85
+
86
+ all_good = True
87
+ for path in root_paths:
88
+ try:
89
+ os.makedirs(path, exist_ok=True)
90
+ print(f"❌ {path}: SUCCESSFULLY CREATED (SHOULD FAIL)")
91
+ all_good = False
92
+ except PermissionError:
93
+ print(f"βœ… {path}: PERMISSION DENIED (GOOD)")
94
+ except Exception as e:
95
+ print(f"⚠️ {path}: OTHER ERROR - {e}")
96
+
97
+ return all_good
98
+
99
+ def test_temp_directory():
100
+ """Test temp directory access."""
101
+ print("\n" + "=" * 60)
102
+ print("Testing Temp Directory Access")
103
+ print("=" * 60)
104
+
105
+ temp_dir = os.environ.get('TEMP_DIR', '/tmp/docling_temp')
106
  try:
 
107
  os.makedirs(temp_dir, exist_ok=True)
108
+ test_file = os.path.join(temp_dir, 'test_temp.txt')
109
+ with open(test_file, 'w') as f:
110
+ f.write('temp test')
111
  os.remove(test_file)
112
+ print(f"βœ… {temp_dir}: WRITABLE")
113
+ return True
114
  except Exception as e:
115
+ print(f"❌ {temp_dir}: ERROR - {e}")
116
+ return False
117
+
118
+ def main():
119
+ """Run all tests."""
120
+ print("Docling Environment and Permission Test")
121
+ print("This script tests that the environment is set up correctly for Hugging Face Spaces")
122
 
123
+ # Set environment variables if not already set
124
+ if not os.environ.get('TEMP_DIR'):
125
+ temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp")
126
+ os.environ.update({
127
+ 'TEMP_DIR': temp_dir,
128
+ 'HOME': temp_dir,
129
+ 'USERPROFILE': temp_dir,
130
+ 'TMPDIR': temp_dir,
131
+ 'TEMP': temp_dir,
132
+ 'TMP': temp_dir,
133
+ 'HF_HOME': os.path.join(temp_dir, 'huggingface'),
134
+ 'HF_CACHE_HOME': os.path.join(temp_dir, 'huggingface_cache'),
135
+ 'HF_HUB_CACHE': os.path.join(temp_dir, 'huggingface_cache'),
136
+ 'TRANSFORMERS_CACHE': os.path.join(temp_dir, 'transformers_cache'),
137
+ 'HF_DATASETS_CACHE': os.path.join(temp_dir, 'datasets_cache'),
138
+ 'DIFFUSERS_CACHE': os.path.join(temp_dir, 'diffusers_cache'),
139
+ 'ACCELERATE_CACHE': os.path.join(temp_dir, 'accelerate_cache'),
140
+ 'TORCH_HOME': os.path.join(temp_dir, 'torch'),
141
+ 'TENSORFLOW_HOME': os.path.join(temp_dir, 'tensorflow'),
142
+ 'KERAS_HOME': os.path.join(temp_dir, 'keras'),
143
+ 'XDG_CACHE_HOME': os.path.join(temp_dir, 'cache'),
144
+ 'XDG_CONFIG_HOME': os.path.join(temp_dir, 'config'),
145
+ 'XDG_DATA_HOME': os.path.join(temp_dir, 'data'),
146
+ })
147
 
148
+ # Run tests
149
+ env_ok = test_environment_setup()
150
+ cache_ok = test_cache_directories()
151
+ root_ok = test_root_filesystem_access()
152
+ temp_ok = test_temp_directory()
 
 
 
 
 
 
 
 
153
 
154
+ # Summary
155
+ print("\n" + "=" * 60)
156
+ print("TEST SUMMARY")
157
+ print("=" * 60)
158
+ print(f"Environment Variables: {'βœ… PASS' if env_ok else '❌ FAIL'}")
159
+ print(f"Cache Directories: {'βœ… PASS' if cache_ok else '❌ FAIL'}")
160
+ print(f"Root Access Prevention: {'βœ… PASS' if root_ok else '❌ FAIL'}")
161
+ print(f"Temp Directory: {'βœ… PASS' if temp_ok else '❌ FAIL'}")
 
 
162
 
163
+ overall_success = env_ok and cache_ok and root_ok and temp_ok
164
+ print(f"\nOverall Result: {'βœ… ALL TESTS PASSED' if overall_success else '❌ SOME TESTS FAILED'}")
165
+
166
+ if not overall_success:
167
+ print("\n⚠️ Some tests failed. Please check the environment setup.")
168
+ sys.exit(1)
169
+ else:
170
+ print("\nπŸŽ‰ All tests passed! The environment is ready for Docling.")
171
+ sys.exit(0)
 
172
 
173
  if __name__ == "__main__":
174
+ main()