Spaces:
Sleeping
Sleeping
Commit
Β·
98aae70
1
Parent(s):
3db2fae
Enhance Dockerfile and Streamlit app for comprehensive environment setup and permission testing
Browse files- Implemented extensive environment variable configurations to redirect cache directories for Hugging Face and other ML libraries to writable locations.
- Added a startup script in the Dockerfile to ensure environment variables are set before the application runs.
- Updated the Streamlit app to configure environment variables immediately and perform necessary directory creation with error handling.
- Introduced a test script to verify environment setup and permissions, ensuring the application runs smoothly in Hugging Face Spaces.
- Dockerfile +99 -3
- README.md +89 -129
- src/processing/document_processor.py +44 -6
- src/streamlit_app.py +26 -23
- test_permissions.py +156 -62
Dockerfile
CHANGED
@@ -2,6 +2,52 @@ FROM python:3.9-slim
|
|
2 |
|
3 |
WORKDIR /app
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
RUN apt-get update && apt-get install -y \
|
6 |
build-essential \
|
7 |
curl \
|
@@ -10,7 +56,7 @@ RUN apt-get update && apt-get install -y \
|
|
10 |
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
12 |
# Create necessary directories with proper permissions
|
13 |
-
RUN mkdir -p /app/.streamlit /tmp/docling_temp /tmp/easyocr_models /tmp/cache /tmp/config /tmp/data /tmp/huggingface /tmp/huggingface_cache /tmp/transformers_cache /tmp/datasets_cache /tmp/torch /tmp/tensorflow /tmp/keras && \
|
14 |
chmod 755 /app/.streamlit && \
|
15 |
chmod 777 /tmp/docling_temp && \
|
16 |
chmod 777 /tmp/easyocr_models && \
|
@@ -23,7 +69,10 @@ RUN mkdir -p /app/.streamlit /tmp/docling_temp /tmp/easyocr_models /tmp/cache /t
|
|
23 |
chmod 777 /tmp/datasets_cache && \
|
24 |
chmod 777 /tmp/torch && \
|
25 |
chmod 777 /tmp/tensorflow && \
|
26 |
-
chmod 777 /tmp/keras
|
|
|
|
|
|
|
27 |
|
28 |
COPY requirements.txt ./
|
29 |
COPY src/ ./src/
|
@@ -55,4 +104,51 @@ EXPOSE 8501
|
|
55 |
|
56 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
57 |
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
WORKDIR /app
|
4 |
|
5 |
+
# Set environment variables to prevent root filesystem access
|
6 |
+
ENV TEMP_DIR=/tmp/docling_temp
|
7 |
+
ENV HOME=/tmp/docling_temp
|
8 |
+
ENV USERPROFILE=/tmp/docling_temp
|
9 |
+
ENV TMPDIR=/tmp/docling_temp
|
10 |
+
ENV TEMP=/tmp/docling_temp
|
11 |
+
ENV TMP=/tmp/docling_temp
|
12 |
+
|
13 |
+
# Hugging Face Hub configuration - CRITICAL for preventing /.cache access
|
14 |
+
ENV HF_HOME=/tmp/docling_temp/huggingface
|
15 |
+
ENV HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache
|
16 |
+
ENV HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache
|
17 |
+
ENV TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache
|
18 |
+
ENV HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache
|
19 |
+
ENV DIFFUSERS_CACHE=/tmp/docling_temp/diffusers_cache
|
20 |
+
ENV ACCELERATE_CACHE=/tmp/docling_temp/accelerate_cache
|
21 |
+
|
22 |
+
# Additional Hugging Face specific variables
|
23 |
+
ENV HF_HUB_DISABLE_TELEMETRY=1
|
24 |
+
ENV HF_HUB_DISABLE_IMPLICIT_TOKEN=1
|
25 |
+
ENV HF_HUB_OFFLINE=0
|
26 |
+
|
27 |
+
# Other ML libraries
|
28 |
+
ENV TORCH_HOME=/tmp/docling_temp/torch
|
29 |
+
ENV TENSORFLOW_HOME=/tmp/docling_temp/tensorflow
|
30 |
+
ENV KERAS_HOME=/tmp/docling_temp/keras
|
31 |
+
|
32 |
+
# XDG directories
|
33 |
+
ENV XDG_CACHE_HOME=/tmp/docling_temp/cache
|
34 |
+
ENV XDG_CONFIG_HOME=/tmp/docling_temp/config
|
35 |
+
ENV XDG_DATA_HOME=/tmp/docling_temp/data
|
36 |
+
|
37 |
+
# EasyOCR configuration
|
38 |
+
ENV EASYOCR_MODULE_PATH=/tmp/docling_temp/easyocr_models
|
39 |
+
|
40 |
+
# Additional cache directories
|
41 |
+
ENV CACHE_DIR=/tmp/docling_temp/cache
|
42 |
+
ENV MODEL_CACHE_DIR=/tmp/docling_temp/models
|
43 |
+
ENV CACHE=/tmp/docling_temp/cache
|
44 |
+
ENV MODELS=/tmp/docling_temp/models
|
45 |
+
ENV DATA=/tmp/docling_temp/data
|
46 |
+
ENV CONFIG=/tmp/docling_temp/config
|
47 |
+
|
48 |
+
# Python path
|
49 |
+
ENV PYTHONPATH=/tmp/docling_temp
|
50 |
+
|
51 |
RUN apt-get update && apt-get install -y \
|
52 |
build-essential \
|
53 |
curl \
|
|
|
56 |
&& rm -rf /var/lib/apt/lists/*
|
57 |
|
58 |
# Create necessary directories with proper permissions
|
59 |
+
RUN mkdir -p /app/.streamlit /tmp/docling_temp /tmp/easyocr_models /tmp/cache /tmp/config /tmp/data /tmp/huggingface /tmp/huggingface_cache /tmp/transformers_cache /tmp/datasets_cache /tmp/torch /tmp/tensorflow /tmp/keras /tmp/accelerate_cache /tmp/diffusers_cache /tmp/models && \
|
60 |
chmod 755 /app/.streamlit && \
|
61 |
chmod 777 /tmp/docling_temp && \
|
62 |
chmod 777 /tmp/easyocr_models && \
|
|
|
69 |
chmod 777 /tmp/datasets_cache && \
|
70 |
chmod 777 /tmp/torch && \
|
71 |
chmod 777 /tmp/tensorflow && \
|
72 |
+
chmod 777 /tmp/keras && \
|
73 |
+
chmod 777 /tmp/accelerate_cache && \
|
74 |
+
chmod 777 /tmp/diffusers_cache && \
|
75 |
+
chmod 777 /tmp/models
|
76 |
|
77 |
COPY requirements.txt ./
|
78 |
COPY src/ ./src/
|
|
|
104 |
|
105 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
106 |
|
107 |
+
# Create a startup script to ensure environment variables are set
|
108 |
+
RUN echo '#!/bin/bash' > /app/start.sh && \
|
109 |
+
echo 'export TEMP_DIR=/tmp/docling_temp' >> /app/start.sh && \
|
110 |
+
echo 'export HOME=/tmp/docling_temp' >> /app/start.sh && \
|
111 |
+
echo 'export USERPROFILE=/tmp/docling_temp' >> /app/start.sh && \
|
112 |
+
echo 'export TMPDIR=/tmp/docling_temp' >> /app/start.sh && \
|
113 |
+
echo 'export TEMP=/tmp/docling_temp' >> /app/start.sh && \
|
114 |
+
echo 'export TMP=/tmp/docling_temp' >> /app/start.sh && \
|
115 |
+
echo 'export HF_HOME=/tmp/docling_temp/huggingface' >> /app/start.sh && \
|
116 |
+
echo 'export HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache' >> /app/start.sh && \
|
117 |
+
echo 'export HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache' >> /app/start.sh && \
|
118 |
+
echo 'export TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache' >> /app/start.sh && \
|
119 |
+
echo 'export HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache' >> /app/start.sh && \
|
120 |
+
echo 'export DIFFUSERS_CACHE=/tmp/docling_temp/diffusers_cache' >> /app/start.sh && \
|
121 |
+
echo 'export ACCELERATE_CACHE=/tmp/docling_temp/accelerate_cache' >> /app/start.sh && \
|
122 |
+
echo 'export HF_HUB_DISABLE_TELEMETRY=1' >> /app/start.sh && \
|
123 |
+
echo 'export HF_HUB_DISABLE_IMPLICIT_TOKEN=1' >> /app/start.sh && \
|
124 |
+
echo 'export HF_HUB_OFFLINE=0' >> /app/start.sh && \
|
125 |
+
echo 'export TORCH_HOME=/tmp/docling_temp/torch' >> /app/start.sh && \
|
126 |
+
echo 'export TENSORFLOW_HOME=/tmp/docling_temp/tensorflow' >> /app/start.sh && \
|
127 |
+
echo 'export KERAS_HOME=/tmp/docling_temp/keras' >> /app/start.sh && \
|
128 |
+
echo 'export XDG_CACHE_HOME=/tmp/docling_temp/cache' >> /app/start.sh && \
|
129 |
+
echo 'export XDG_CONFIG_HOME=/tmp/docling_temp/config' >> /app/start.sh && \
|
130 |
+
echo 'export XDG_DATA_HOME=/tmp/docling_temp/data' >> /app/start.sh && \
|
131 |
+
echo 'export EASYOCR_MODULE_PATH=/tmp/docling_temp/easyocr_models' >> /app/start.sh && \
|
132 |
+
echo 'export CACHE_DIR=/tmp/docling_temp/cache' >> /app/start.sh && \
|
133 |
+
echo 'export MODEL_CACHE_DIR=/tmp/docling_temp/models' >> /app/start.sh && \
|
134 |
+
echo 'export CACHE=/tmp/docling_temp/cache' >> /app/start.sh && \
|
135 |
+
echo 'export MODELS=/tmp/docling_temp/models' >> /app/start.sh && \
|
136 |
+
echo 'export DATA=/tmp/docling_temp/data' >> /app/start.sh && \
|
137 |
+
echo 'export CONFIG=/tmp/docling_temp/config' >> /app/start.sh && \
|
138 |
+
echo 'export PYTHONPATH=/tmp/docling_temp' >> /app/start.sh && \
|
139 |
+
echo 'echo "Environment variables set for Hugging Face Hub cache directories"' >> /app/start.sh && \
|
140 |
+
echo 'echo "HF_HUB_CACHE: $HF_HUB_CACHE"' >> /app/start.sh && \
|
141 |
+
echo 'echo "HF_CACHE_HOME: $HF_CACHE_HOME"' >> /app/start.sh && \
|
142 |
+
echo 'echo "TEMP_DIR: $TEMP_DIR"' >> /app/start.sh && \
|
143 |
+
echo 'echo "Running environment test..."' >> /app/start.sh && \
|
144 |
+
echo 'python test_permissions.py' >> /app/start.sh && \
|
145 |
+
echo 'if [ $? -eq 0 ]; then' >> /app/start.sh && \
|
146 |
+
echo ' echo "Environment test passed, starting Streamlit app..."' >> /app/start.sh && \
|
147 |
+
echo ' exec streamlit run src/streamlit_app.py --server.port=8501 --server.address=0.0.0.0' >> /app/start.sh && \
|
148 |
+
echo 'else' >> /app/start.sh && \
|
149 |
+
echo ' echo "Environment test failed, exiting..."' >> /app/start.sh && \
|
150 |
+
echo ' exit 1' >> /app/start.sh && \
|
151 |
+
echo 'fi' >> /app/start.sh && \
|
152 |
+
chmod +x /app/start.sh
|
153 |
+
|
154 |
+
ENTRYPOINT ["/app/start.sh"]
|
README.md
CHANGED
@@ -202,152 +202,112 @@ print(f"Removing {len(result['indices_to_remove'])} elements")
|
|
202 |
4. Updates document structure
|
203 |
5. Returns redacted JSON
|
204 |
|
205 |
-
|
206 |
|
207 |
-
|
208 |
-
**Purpose**: Generates HTML content for side-by-side diff view.
|
209 |
|
210 |
-
**
|
211 |
-
- `original_text`: Original document content
|
212 |
-
- `redacted_text`: Redacted document content
|
213 |
-
- `view_type`: 'original' or 'redacted'
|
214 |
-
|
215 |
-
**Returns**:
|
216 |
-
- HTML string for diff display
|
217 |
-
|
218 |
-
**Features**:
|
219 |
-
- Text normalization (headers, quotes)
|
220 |
-
- Synchronized scrolling
|
221 |
-
- Color-coded highlighting
|
222 |
-
- Git-style diff visualization
|
223 |
|
224 |
-
|
225 |
-
**Purpose**: Safely saves uploaded files to temporary directory.
|
226 |
|
227 |
-
**
|
228 |
-
- `uploaded_file`: Streamlit uploaded file object
|
229 |
-
- `filename`: Target filename
|
230 |
|
231 |
-
**
|
232 |
-
|
|
|
|
|
233 |
|
234 |
-
**
|
235 |
-
-
|
236 |
-
-
|
237 |
-
-
|
|
|
238 |
|
239 |
-
|
|
|
|
|
|
|
240 |
|
|
|
241 |
```
|
242 |
-
|
243 |
-
|
244 |
-
βΌ
|
245 |
-
βββββββββββββββββββ
|
246 |
-
β Streamlit App β
|
247 |
-
β - File Upload β
|
248 |
-
β - Validation β
|
249 |
-
βββββββββββββββββββ
|
250 |
-
β
|
251 |
-
βΌ
|
252 |
-
βββββββββββββββββββ
|
253 |
-
β save_uploaded_ β
|
254 |
-
β file() β
|
255 |
-
β - Reset pointer β
|
256 |
-
β - Save to temp β
|
257 |
-
βββββββββββββββββββ
|
258 |
-
β
|
259 |
-
βΌ
|
260 |
-
βββββββββββββββββββ
|
261 |
-
β DocumentProcessorβ
|
262 |
-
β .process() β
|
263 |
-
β - Docling conv β
|
264 |
-
β - Export JSON β
|
265 |
-
β - Export MD β
|
266 |
-
βββββββββββββββββββ
|
267 |
-
β
|
268 |
-
βΌ
|
269 |
-
βββββββββββββββββββ
|
270 |
-
β ReasoningSectionβ
|
271 |
-
β Extractor β
|
272 |
-
β .remove_sectionsβ
|
273 |
-
β _from_json() β
|
274 |
-
βββββββββββββββββββ
|
275 |
-
β
|
276 |
-
βΌ
|
277 |
-
βββββββββββββββββββ
|
278 |
-
β AzureO1Medicationβ
|
279 |
-
β Extractor β
|
280 |
-
β .extract_medicatβ
|
281 |
-
β ion_sections() β
|
282 |
-
β - API call β
|
283 |
-
β - JSON parsing β
|
284 |
-
βββββββββββββββββββ
|
285 |
-
β
|
286 |
-
βΌ
|
287 |
-
βββββββββββββββββββ
|
288 |
-
β DocumentProcessorβ
|
289 |
-
β _export_redactedβ
|
290 |
-
β _markdown() β
|
291 |
-
β - Filter texts β
|
292 |
-
β - Reconstruct MDβ
|
293 |
-
βββββββββββββββββββ
|
294 |
-
β
|
295 |
-
βΌ
|
296 |
-
βββββββββββββββββββ
|
297 |
-
β Streamlit App β
|
298 |
-
β - Store results β
|
299 |
-
β - Update UI β
|
300 |
-
β - Show diff β
|
301 |
-
βββββββββββββββββββ
|
302 |
-
β
|
303 |
-
βΌ
|
304 |
-
βββββββββββββββββββ
|
305 |
-
β create_diff_ β
|
306 |
-
β content() β
|
307 |
-
β - Normalize textβ
|
308 |
-
β - Generate HTML β
|
309 |
-
β - Sync scrollingβ
|
310 |
-
βββββββββββββββββββ
|
311 |
```
|
312 |
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
-
|
317 |
-
-
|
318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
|
320 |
### Environment Variables
|
321 |
-
|
322 |
-
```env
|
323 |
-
AZURE_OPENAI_ENDPOINT=your_endpoint
|
324 |
-
AZURE_OPENAI_KEY=your_api_key
|
325 |
-
AZURE_OPENAI_VERSION=2024-12-01-preview
|
326 |
-
AZURE_OPENAI_DEPLOYMENT=your_deployment_name
|
327 |
-
```
|
328 |
|
329 |
-
### Installation
|
330 |
```bash
|
331 |
-
#
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
```
|
341 |
|
342 |
-
##
|
|
|
|
|
|
|
|
|
|
|
343 |
|
344 |
-
###
|
345 |
-
-
|
346 |
-
-
|
347 |
-
-
|
348 |
|
349 |
-
###
|
350 |
-
-
|
351 |
-
-
|
352 |
-
-
|
353 |
|
|
|
202 |
4. Updates document structure
|
203 |
5. Returns redacted JSON
|
204 |
|
205 |
+
## π¨ Troubleshooting
|
206 |
|
207 |
+
### Permission Error: `[Errno 13] Permission denied: '/.cache'`
|
|
|
208 |
|
209 |
+
**Problem**: When deploying to Hugging Face Spaces, you may encounter a permission error where the application tries to create cache directories in the root filesystem (`/.cache`).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
+
**Root Cause**: Hugging Face Hub and other ML libraries try to create cache directories in the root filesystem by default, but containers in Hugging Face Spaces don't have permission to write to the root directory.
|
|
|
212 |
|
213 |
+
**Solution**: This application includes comprehensive environment variable configuration to redirect all cache directories to writable locations:
|
|
|
|
|
214 |
|
215 |
+
1. **Environment Variables**: All cache directories are redirected to `/tmp/docling_temp/`
|
216 |
+
2. **Lazy Initialization**: DocumentConverter is initialized lazily to ensure environment variables are set first
|
217 |
+
3. **Startup Script**: Docker container uses a startup script that sets all necessary environment variables
|
218 |
+
4. **Test Script**: `test_permissions.py` verifies the environment setup
|
219 |
|
220 |
+
**Files Modified**:
|
221 |
+
- `src/streamlit_app.py`: Environment variables set at the very beginning
|
222 |
+
- `src/processing/document_processor.py`: Lazy initialization of DocumentConverter
|
223 |
+
- `Dockerfile`: Environment variables and startup script
|
224 |
+
- `test_permissions.py`: Environment verification script
|
225 |
|
226 |
+
**Testing**: Run the test script to verify the environment:
|
227 |
+
```bash
|
228 |
+
python test_permissions.py
|
229 |
+
```
|
230 |
|
231 |
+
**Expected Output**:
|
232 |
```
|
233 |
+
β
ALL TESTS PASSED
|
234 |
+
π All tests passed! The environment is ready for Docling.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
```
|
236 |
|
237 |
+
### Other Common Issues
|
238 |
+
|
239 |
+
#### Memory Issues
|
240 |
+
- **Problem**: Large PDF files may cause memory issues
|
241 |
+
- **Solution**: The application includes automatic cleanup of temporary files and memory management
|
242 |
+
|
243 |
+
#### Azure OpenAI Configuration
|
244 |
+
- **Problem**: Missing or incorrect Azure OpenAI credentials
|
245 |
+
- **Solution**: Ensure `.env` file contains:
|
246 |
+
```
|
247 |
+
AZURE_OPENAI_ENDPOINT=your_endpoint
|
248 |
+
AZURE_OPENAI_KEY=your_key
|
249 |
+
AZURE_OPENAI_VERSION=your_version
|
250 |
+
AZURE_OPENAI_DEPLOYMENT=your_deployment
|
251 |
+
```
|
252 |
+
|
253 |
+
#### File Upload Issues
|
254 |
+
- **Problem**: Files not uploading or processing
|
255 |
+
- **Solution**: Check file size limits and ensure PDF format is supported
|
256 |
+
|
257 |
+
## π§ Development and Deployment
|
258 |
+
|
259 |
+
### Local Development
|
260 |
+
1. Clone the repository
|
261 |
+
2. Install dependencies: `pip install -r requirements.txt`
|
262 |
+
3. Set up environment variables in `.env`
|
263 |
+
4. Run the test script: `python test_permissions.py`
|
264 |
+
5. Start the app: `streamlit run src/streamlit_app.py`
|
265 |
+
|
266 |
+
### Hugging Face Spaces Deployment
|
267 |
+
1. Push code to repository
|
268 |
+
2. Ensure `Dockerfile` is present
|
269 |
+
3. Set environment variables in Spaces settings
|
270 |
+
4. Deploy and monitor logs for any issues
|
271 |
|
272 |
### Environment Variables
|
273 |
+
The application uses these environment variables to control cache directories:
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
|
|
|
275 |
```bash
|
276 |
+
# Core temp directory
|
277 |
+
TEMP_DIR=/tmp/docling_temp
|
278 |
+
|
279 |
+
# Hugging Face Hub
|
280 |
+
HF_HOME=/tmp/docling_temp/huggingface
|
281 |
+
HF_CACHE_HOME=/tmp/docling_temp/huggingface_cache
|
282 |
+
HF_HUB_CACHE=/tmp/docling_temp/huggingface_cache
|
283 |
+
|
284 |
+
# ML Libraries
|
285 |
+
TRANSFORMERS_CACHE=/tmp/docling_temp/transformers_cache
|
286 |
+
HF_DATASETS_CACHE=/tmp/docling_temp/datasets_cache
|
287 |
+
TORCH_HOME=/tmp/docling_temp/torch
|
288 |
+
TENSORFLOW_HOME=/tmp/docling_temp/tensorflow
|
289 |
+
KERAS_HOME=/tmp/docling_temp/keras
|
290 |
+
|
291 |
+
# XDG Directories
|
292 |
+
XDG_CACHE_HOME=/tmp/docling_temp/cache
|
293 |
+
XDG_CONFIG_HOME=/tmp/docling_temp/config
|
294 |
+
XDG_DATA_HOME=/tmp/docling_temp/data
|
295 |
```
|
296 |
|
297 |
+
## π Performance and Monitoring
|
298 |
+
|
299 |
+
### Memory Management
|
300 |
+
- Automatic cleanup of temporary files
|
301 |
+
- Session state management
|
302 |
+
- File size monitoring
|
303 |
|
304 |
+
### Logging
|
305 |
+
- Comprehensive logging throughout the application
|
306 |
+
- In-memory log capture for UI display
|
307 |
+
- Error tracking and debugging information
|
308 |
|
309 |
+
### Caching
|
310 |
+
- Hugging Face model caching in temp directories
|
311 |
+
- Document processing result caching
|
312 |
+
- Session state persistence
|
313 |
|
src/processing/document_processor.py
CHANGED
@@ -4,11 +4,13 @@ import logging
|
|
4 |
import json
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Optional
|
7 |
-
|
|
|
|
|
8 |
from processing.sections import SectionExtractor
|
9 |
|
10 |
-
#
|
11 |
-
_docling_converter = DocumentConverter()
|
12 |
|
13 |
logger = logging.getLogger(__name__) # Logger for this module
|
14 |
|
@@ -27,16 +29,30 @@ class DocumentProcessor:
|
|
27 |
"""
|
28 |
Initialize with an optional SectionExtractor for removing specific sections.
|
29 |
If None, no redaction will be performed (original structure only).
|
30 |
-
The Docling DocumentConverter
|
31 |
"""
|
32 |
self.section_extractor = section_extractor
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
def process(self, file_path: str) -> DocumentResult:
|
37 |
"""Parse the document and optionally remove specified sections. Returns a DocumentResult."""
|
38 |
logger.info(f"Starting processing for file: {file_path}")
|
39 |
start_time = time.time()
|
|
|
|
|
|
|
|
|
40 |
# Convert the document using Docling
|
41 |
conv_result = self.converter.convert(file_path)
|
42 |
elapsed = time.time() - start_time
|
@@ -100,6 +116,28 @@ class DocumentProcessor:
|
|
100 |
logger.info(f"Finished processing for file: {file_path}")
|
101 |
return result
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
def _export_redacted_markdown(self, document, redacted_json):
|
104 |
"""Export redacted markdown using the redacted JSON structure."""
|
105 |
# Simply convert the redacted JSON back to markdown
|
|
|
4 |
import json
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Optional
|
7 |
+
|
8 |
+
# Don't import DocumentConverter at module level to prevent early initialization
|
9 |
+
# from docling.document_converter import DocumentConverter
|
10 |
from processing.sections import SectionExtractor
|
11 |
|
12 |
+
# Remove global converter initialization - will be done lazily
|
13 |
+
# _docling_converter = DocumentConverter()
|
14 |
|
15 |
logger = logging.getLogger(__name__) # Logger for this module
|
16 |
|
|
|
29 |
"""
|
30 |
Initialize with an optional SectionExtractor for removing specific sections.
|
31 |
If None, no redaction will be performed (original structure only).
|
32 |
+
The Docling DocumentConverter will be initialized lazily when needed.
|
33 |
"""
|
34 |
self.section_extractor = section_extractor
|
35 |
+
self._converter = None # Lazy initialization
|
36 |
+
|
37 |
+
@property
|
38 |
+
def converter(self):
|
39 |
+
"""Lazy initialization of DocumentConverter to prevent early Hugging Face Hub initialization."""
|
40 |
+
if self._converter is None:
|
41 |
+
# Import here to ensure environment variables are set first
|
42 |
+
from docling.document_converter import DocumentConverter
|
43 |
+
logger.info("Initializing Docling DocumentConverter...")
|
44 |
+
self._converter = DocumentConverter()
|
45 |
+
logger.info("Docling DocumentConverter initialized successfully")
|
46 |
+
return self._converter
|
47 |
|
48 |
def process(self, file_path: str) -> DocumentResult:
|
49 |
"""Parse the document and optionally remove specified sections. Returns a DocumentResult."""
|
50 |
logger.info(f"Starting processing for file: {file_path}")
|
51 |
start_time = time.time()
|
52 |
+
|
53 |
+
# Ensure environment variables are set before processing
|
54 |
+
self._ensure_cache_directories()
|
55 |
+
|
56 |
# Convert the document using Docling
|
57 |
conv_result = self.converter.convert(file_path)
|
58 |
elapsed = time.time() - start_time
|
|
|
116 |
logger.info(f"Finished processing for file: {file_path}")
|
117 |
return result
|
118 |
|
119 |
+
def _ensure_cache_directories(self):
|
120 |
+
"""Ensure all necessary cache directories exist before processing."""
|
121 |
+
cache_dirs = [
|
122 |
+
os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
|
123 |
+
os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
|
124 |
+
os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
|
125 |
+
os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
|
126 |
+
os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
|
127 |
+
os.environ.get('DIFFUSERS_CACHE', '/tmp/docling_temp/diffusers_cache'),
|
128 |
+
os.environ.get('ACCELERATE_CACHE', '/tmp/docling_temp/accelerate_cache'),
|
129 |
+
os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
|
130 |
+
os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
|
131 |
+
os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
|
132 |
+
]
|
133 |
+
|
134 |
+
for cache_dir in cache_dirs:
|
135 |
+
try:
|
136 |
+
os.makedirs(cache_dir, exist_ok=True)
|
137 |
+
logger.debug(f"Ensured cache directory exists: {cache_dir}")
|
138 |
+
except Exception as e:
|
139 |
+
logger.warning(f"Could not create cache directory {cache_dir}: {e}")
|
140 |
+
|
141 |
def _export_redacted_markdown(self, document, redacted_json):
|
142 |
"""Export redacted markdown using the redacted JSON structure."""
|
143 |
# Simply convert the redacted JSON back to markdown
|
src/streamlit_app.py
CHANGED
@@ -1,21 +1,9 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import logging
|
3 |
-
import os
|
4 |
-
import tempfile
|
5 |
-
import shutil
|
6 |
-
from processing.document_processor import DocumentProcessor
|
7 |
-
from processing.sections import ReasoningSectionExtractor
|
8 |
-
from utils.logging_utils import get_log_handler
|
9 |
-
from dotenv import load_dotenv
|
10 |
-
import sys
|
11 |
-
import html
|
12 |
-
import difflib
|
13 |
-
import re
|
14 |
-
import time
|
15 |
-
|
16 |
# Set environment variables IMMEDIATELY to prevent root filesystem access
|
17 |
# This must happen before any other imports or operations
|
18 |
|
|
|
|
|
|
|
19 |
# Get a writable temp directory first
|
20 |
try:
|
21 |
TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
|
@@ -44,11 +32,19 @@ os.environ.update({
|
|
44 |
'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
|
45 |
'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
|
46 |
|
47 |
-
# Hugging Face Hub configuration
|
48 |
'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
|
49 |
'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
|
|
|
50 |
'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
|
51 |
'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Other ML libraries
|
54 |
'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
|
@@ -69,13 +65,6 @@ os.environ.update({
|
|
69 |
'MODELS': os.path.join(TEMP_DIR, 'models'),
|
70 |
'DATA': os.path.join(TEMP_DIR, 'data'),
|
71 |
'CONFIG': os.path.join(TEMP_DIR, 'config'),
|
72 |
-
|
73 |
-
# Specific cache overrides
|
74 |
-
'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
|
75 |
-
'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
|
76 |
-
'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
|
77 |
-
'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
|
78 |
-
'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
|
79 |
})
|
80 |
|
81 |
# Create all necessary directories
|
@@ -125,6 +114,20 @@ for directory in directories_to_create:
|
|
125 |
except Exception as e:
|
126 |
print(f"Warning: Could not create directory {directory}: {e}")
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
# Configure logging early to avoid issues
|
129 |
logging.basicConfig(
|
130 |
level=logging.INFO,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Set environment variables IMMEDIATELY to prevent root filesystem access
|
2 |
# This must happen before any other imports or operations
|
3 |
|
4 |
+
import os
|
5 |
+
import tempfile
|
6 |
+
|
7 |
# Get a writable temp directory first
|
8 |
try:
|
9 |
TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
|
|
|
32 |
'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
|
33 |
'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
|
34 |
|
35 |
+
# Hugging Face Hub configuration - CRITICAL for preventing /.cache access
|
36 |
'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
|
37 |
'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
|
38 |
+
'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
|
39 |
'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
|
40 |
'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
|
41 |
+
'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
|
42 |
+
'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
|
43 |
+
|
44 |
+
# Additional Hugging Face specific variables
|
45 |
+
'HF_HUB_DISABLE_TELEMETRY': '1',
|
46 |
+
'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
|
47 |
+
'HF_HUB_OFFLINE': '0',
|
48 |
|
49 |
# Other ML libraries
|
50 |
'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
|
|
|
65 |
'MODELS': os.path.join(TEMP_DIR, 'models'),
|
66 |
'DATA': os.path.join(TEMP_DIR, 'data'),
|
67 |
'CONFIG': os.path.join(TEMP_DIR, 'config'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
})
|
69 |
|
70 |
# Create all necessary directories
|
|
|
114 |
except Exception as e:
|
115 |
print(f"Warning: Could not create directory {directory}: {e}")
|
116 |
|
117 |
+
# Now import the rest of the modules
|
118 |
+
import streamlit as st
|
119 |
+
import logging
|
120 |
+
import shutil
|
121 |
+
from processing.document_processor import DocumentProcessor
|
122 |
+
from processing.sections import ReasoningSectionExtractor
|
123 |
+
from utils.logging_utils import get_log_handler
|
124 |
+
from dotenv import load_dotenv
|
125 |
+
import sys
|
126 |
+
import html
|
127 |
+
import difflib
|
128 |
+
import re
|
129 |
+
import time
|
130 |
+
|
131 |
# Configure logging early to avoid issues
|
132 |
logging.basicConfig(
|
133 |
level=logging.INFO,
|
test_permissions.py
CHANGED
@@ -1,80 +1,174 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
-
Test script to
|
|
|
4 |
"""
|
5 |
|
6 |
import os
|
7 |
import tempfile
|
8 |
-
import
|
9 |
|
10 |
-
def
|
11 |
-
"""Test
|
12 |
-
print("
|
|
|
|
|
13 |
|
14 |
-
#
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
try:
|
17 |
-
temp_dir = os.path.join(tempfile.gettempdir(), "docling_test")
|
18 |
os.makedirs(temp_dir, exist_ok=True)
|
19 |
-
test_file = os.path.join(temp_dir,
|
20 |
-
with open(test_file,
|
21 |
-
f.write(
|
22 |
os.remove(test_file)
|
23 |
-
|
24 |
-
|
25 |
except Exception as e:
|
26 |
-
print(f"β
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
#
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
#
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
test_file = os.path.join(tmp_dir, "test.txt")
|
49 |
-
with open(test_file, "w") as f:
|
50 |
-
f.write("test")
|
51 |
-
os.remove(test_file)
|
52 |
-
os.rmdir(tmp_dir)
|
53 |
-
print(f"β
Success: {tmp_dir}")
|
54 |
-
except Exception as e:
|
55 |
-
print(f"β Failed: {e}")
|
56 |
|
57 |
-
#
|
58 |
-
print("\
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
value = os.environ.get(var, 'NOT_SET')
|
66 |
-
print(f" {var}: {value}")
|
67 |
|
68 |
-
|
69 |
-
print("\
|
70 |
-
|
71 |
-
|
72 |
-
print(
|
73 |
-
|
74 |
-
|
75 |
-
print(
|
76 |
-
|
77 |
-
print(f"β Failed: {e}")
|
78 |
|
79 |
if __name__ == "__main__":
|
80 |
-
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
+
Test script to verify environment variables and cache directory permissions.
|
4 |
+
This should be run before the main application to ensure everything is set up correctly.
|
5 |
"""
|
6 |
|
7 |
import os
|
8 |
import tempfile
|
9 |
+
import sys
|
10 |
|
11 |
+
def test_environment_setup():
|
12 |
+
"""Test that environment variables are set correctly."""
|
13 |
+
print("=" * 60)
|
14 |
+
print("Testing Environment Setup")
|
15 |
+
print("=" * 60)
|
16 |
|
17 |
+
# Check critical environment variables
|
18 |
+
critical_vars = [
|
19 |
+
'HF_HOME',
|
20 |
+
'HF_CACHE_HOME',
|
21 |
+
'HF_HUB_CACHE',
|
22 |
+
'TRANSFORMERS_CACHE',
|
23 |
+
'HF_DATASETS_CACHE',
|
24 |
+
'TEMP_DIR',
|
25 |
+
'HOME',
|
26 |
+
'TMPDIR'
|
27 |
+
]
|
28 |
+
|
29 |
+
all_good = True
|
30 |
+
for var in critical_vars:
|
31 |
+
value = os.environ.get(var)
|
32 |
+
if value:
|
33 |
+
print(f"β
{var}: {value}")
|
34 |
+
else:
|
35 |
+
print(f"β {var}: NOT SET")
|
36 |
+
all_good = False
|
37 |
+
|
38 |
+
return all_good
|
39 |
+
|
40 |
+
def test_cache_directories():
|
41 |
+
"""Test that cache directories can be created and accessed."""
|
42 |
+
print("\n" + "=" * 60)
|
43 |
+
print("Testing Cache Directory Access")
|
44 |
+
print("=" * 60)
|
45 |
+
|
46 |
+
cache_dirs = [
|
47 |
+
os.environ.get('HF_HOME', '/tmp/docling_temp/huggingface'),
|
48 |
+
os.environ.get('HF_CACHE_HOME', '/tmp/docling_temp/huggingface_cache'),
|
49 |
+
os.environ.get('HF_HUB_CACHE', '/tmp/docling_temp/huggingface_cache'),
|
50 |
+
os.environ.get('TRANSFORMERS_CACHE', '/tmp/docling_temp/transformers_cache'),
|
51 |
+
os.environ.get('HF_DATASETS_CACHE', '/tmp/docling_temp/datasets_cache'),
|
52 |
+
os.environ.get('TORCH_HOME', '/tmp/docling_temp/torch'),
|
53 |
+
os.environ.get('TENSORFLOW_HOME', '/tmp/docling_temp/tensorflow'),
|
54 |
+
os.environ.get('KERAS_HOME', '/tmp/docling_temp/keras'),
|
55 |
+
]
|
56 |
+
|
57 |
+
all_good = True
|
58 |
+
for cache_dir in cache_dirs:
|
59 |
+
try:
|
60 |
+
os.makedirs(cache_dir, exist_ok=True)
|
61 |
+
# Test writing a file
|
62 |
+
test_file = os.path.join(cache_dir, 'test_write.txt')
|
63 |
+
with open(test_file, 'w') as f:
|
64 |
+
f.write('test')
|
65 |
+
os.remove(test_file)
|
66 |
+
print(f"β
{cache_dir}: WRITABLE")
|
67 |
+
except Exception as e:
|
68 |
+
print(f"β {cache_dir}: ERROR - {e}")
|
69 |
+
all_good = False
|
70 |
+
|
71 |
+
return all_good
|
72 |
+
|
73 |
+
def test_root_filesystem_access():
    """Test that we cannot write to root-owned filesystem locations.

    Bug fix vs. the original: ``os.makedirs(path, exist_ok=True)`` silently
    succeeds whenever the path already exists (``/root`` exists in virtually
    every container), so the old check falsely reported the root filesystem
    as writable even for an unprivileged user. We now create without
    ``exist_ok``; if the directory pre-exists we probe writability inside it
    instead, and any directory we do manage to create is removed so the test
    leaves no artifacts behind.

    Returns:
        bool: True when every probed root location refuses writes.
    """
    print("\n" + "=" * 60)
    print("Testing Root Filesystem Access Prevention")
    print("=" * 60)

    root_paths = [
        '/.cache',
        '/root',
        '/etc/test',
        '/var/test'
    ]

    all_good = True
    for path in root_paths:
        try:
            # No exist_ok: a pre-existing directory must not count as "created".
            os.mkdir(path)
            print(f"❌ {path}: SUCCESSFULLY CREATED (SHOULD FAIL)")
            all_good = False
            try:
                os.rmdir(path)  # clean up the accidental test artifact
            except OSError:
                pass
        except FileExistsError:
            # Path already exists (e.g. /root); verify we cannot write into it.
            probe = os.path.join(path, '.docling_write_probe')
            try:
                os.mkdir(probe)
                os.rmdir(probe)
                print(f"❌ {path}: WRITABLE (SHOULD FAIL)")
                all_good = False
            except PermissionError:
                print(f"✅ {path}: PERMISSION DENIED (GOOD)")
            except Exception as e:
                print(f"⚠️ {path}: OTHER ERROR - {e}")
        except PermissionError:
            print(f"✅ {path}: PERMISSION DENIED (GOOD)")
        except Exception as e:
            print(f"⚠️ {path}: OTHER ERROR - {e}")

    return all_good
+
|
99 |
+
def test_temp_directory():
    """Check that the configured temp directory exists and accepts writes.

    Resolves the directory from ``TEMP_DIR`` (default ``/tmp/docling_temp``),
    creates it if needed, and round-trips a small scratch file.

    Returns:
        bool: True when the temp directory is writable, False otherwise.
    """
    print("\n" + "=" * 60)
    print("Testing Temp Directory Access")
    print("=" * 60)

    target = os.environ.get('TEMP_DIR', '/tmp/docling_temp')
    try:
        os.makedirs(target, exist_ok=True)
        scratch = os.path.join(target, 'test_temp.txt')
        with open(scratch, 'w') as fh:
            fh.write('temp test')
        os.remove(scratch)
    except Exception as exc:
        print(f"❌ {target}: ERROR - {exc}")
        return False
    print(f"✅ {target}: WRITABLE")
    return True
|
118 |
+
def main():
    """Run all environment/permission tests and exit with a status code.

    If ``TEMP_DIR`` is not already configured, first redirects HOME, the
    temp variables, and every ML-framework cache variable to a writable
    directory under the system temp dir — mirroring what the app's startup
    does on Hugging Face Spaces. Exits 0 when everything passes, 1 otherwise.
    """
    print("Docling Environment and Permission Test")
    print("This script tests that the environment is set up correctly for Hugging Face Spaces")

    # Only populate the environment when it hasn't been configured upstream.
    if not os.environ.get('TEMP_DIR'):
        base = os.path.join(tempfile.gettempdir(), "docling_temp")
        # Home/temp-style variables all point at the base directory itself.
        overrides = {
            var: base
            for var in ('TEMP_DIR', 'HOME', 'USERPROFILE', 'TMPDIR', 'TEMP', 'TMP')
        }
        # Framework cache variables each get a dedicated subdirectory.
        overrides.update({
            'HF_HOME': os.path.join(base, 'huggingface'),
            'HF_CACHE_HOME': os.path.join(base, 'huggingface_cache'),
            'HF_HUB_CACHE': os.path.join(base, 'huggingface_cache'),
            'TRANSFORMERS_CACHE': os.path.join(base, 'transformers_cache'),
            'HF_DATASETS_CACHE': os.path.join(base, 'datasets_cache'),
            'DIFFUSERS_CACHE': os.path.join(base, 'diffusers_cache'),
            'ACCELERATE_CACHE': os.path.join(base, 'accelerate_cache'),
            'TORCH_HOME': os.path.join(base, 'torch'),
            'TENSORFLOW_HOME': os.path.join(base, 'tensorflow'),
            'KERAS_HOME': os.path.join(base, 'keras'),
            'XDG_CACHE_HOME': os.path.join(base, 'cache'),
            'XDG_CONFIG_HOME': os.path.join(base, 'config'),
            'XDG_DATA_HOME': os.path.join(base, 'data'),
        })
        os.environ.update(overrides)

    # Run every check; the dict keys double as the summary labels below.
    results = {
        'Environment Variables': test_environment_setup(),
        'Cache Directories': test_cache_directories(),
        'Root Access Prevention': test_root_filesystem_access(),
        'Temp Directory': test_temp_directory(),
    }

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for label, passed in results.items():
        print(f"{label}: {'✅ PASS' if passed else '❌ FAIL'}")

    overall_success = all(results.values())
    print(f"\nOverall Result: {'✅ ALL TESTS PASSED' if overall_success else '❌ SOME TESTS FAILED'}")

    if not overall_success:
        print("\n⚠️ Some tests failed. Please check the environment setup.")
        sys.exit(1)
    else:
        print("\n🎉 All tests passed! The environment is ready for Docling.")
        sys.exit(0)
|
|
172 |
|
173 |
# Entry point: run the full permission/environment check when executed directly.
if __name__ == "__main__":
    main()
|