Spaces:
Sleeping
Sleeping
Commit
·
3db2fae
1
Parent(s):
5a08ed8
Implement comprehensive environment variable setup and directory management for Streamlit app
Browse files
- Added extensive environment variable configuration to prevent root filesystem access and ensure that ML libraries function properly.
- Implemented a robust directory creation process with error handling and fallback mechanisms for various library paths.
- Enhanced logging to provide detailed feedback on directory creation and environment variable settings, improving overall stability and user experience.
- src/streamlit_app.py +113 -114
src/streamlit_app.py
CHANGED
@@ -13,6 +13,118 @@ import difflib
|
|
13 |
import re
|
14 |
import time
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
# Configure logging early to avoid issues
|
17 |
logging.basicConfig(
|
18 |
level=logging.INFO,
|
@@ -29,125 +141,12 @@ AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
|
29 |
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
|
30 |
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
|
31 |
|
32 |
-
# Use system temp directory instead of local directory to avoid permission issues
|
33 |
-
TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
|
34 |
-
try:
|
35 |
-
os.makedirs(TEMP_DIR, exist_ok=True)
|
36 |
-
logging.info(f"Using temp directory: {TEMP_DIR}")
|
37 |
-
except PermissionError as e:
|
38 |
-
logging.warning(f"Permission error creating temp dir {TEMP_DIR}: {e}")
|
39 |
-
# Fallback to a subdirectory of the current working directory if temp dir fails
|
40 |
-
TEMP_DIR = os.path.join(os.getcwd(), "temp_files")
|
41 |
-
try:
|
42 |
-
os.makedirs(TEMP_DIR, exist_ok=True)
|
43 |
-
logging.info(f"Using fallback temp directory: {TEMP_DIR}")
|
44 |
-
except PermissionError as e2:
|
45 |
-
logging.warning(f"Permission error creating fallback temp dir {TEMP_DIR}: {e2}")
|
46 |
-
# Last resort: use a directory that should be writable
|
47 |
-
TEMP_DIR = "/tmp/docling_temp"
|
48 |
-
try:
|
49 |
-
os.makedirs(TEMP_DIR, exist_ok=True)
|
50 |
-
logging.info(f"Using last resort temp directory: {TEMP_DIR}")
|
51 |
-
except Exception as e3:
|
52 |
-
logging.error(f"Failed to create any temp directory: {e3}")
|
53 |
-
# Use current directory as absolute last resort
|
54 |
-
TEMP_DIR = "."
|
55 |
-
logging.warning(f"Using current directory as temp: {TEMP_DIR}")
|
56 |
-
|
57 |
-
# Configure Streamlit to use writable directories
|
58 |
-
os.environ['STREAMLIT_SERVER_FILE_WATCHER_TYPE'] = 'none'
|
59 |
-
os.environ['STREAMLIT_SERVER_HEADLESS'] = 'true'
|
60 |
-
os.environ['STREAMLIT_BROWSER_GATHER_USAGE_STATS'] = 'false'
|
61 |
-
|
62 |
-
# Additional environment variables for Hugging Face deployment
|
63 |
-
os.environ['STREAMLIT_SERVER_ENABLE_CORS'] = 'false'
|
64 |
-
os.environ['STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION'] = 'false'
|
65 |
-
|
66 |
-
# Configure EasyOCR to use writable directory for model storage
|
67 |
-
os.environ['EASYOCR_MODULE_PATH'] = os.path.join(TEMP_DIR, 'easyocr_models')
|
68 |
-
try:
|
69 |
-
os.makedirs(os.environ['EASYOCR_MODULE_PATH'], exist_ok=True)
|
70 |
-
logging.info(f"EasyOCR model directory: {os.environ['EASYOCR_MODULE_PATH']}")
|
71 |
-
except Exception as e:
|
72 |
-
logging.warning(f"Could not create EasyOCR model directory: {e}")
|
73 |
-
# Fallback to /tmp if the temp directory fails
|
74 |
-
os.environ['EASYOCR_MODULE_PATH'] = '/tmp/easyocr_models'
|
75 |
-
try:
|
76 |
-
os.makedirs(os.environ['EASYOCR_MODULE_PATH'], exist_ok=True)
|
77 |
-
logging.info(f"Using fallback EasyOCR model directory: {os.environ['EASYOCR_MODULE_PATH']}")
|
78 |
-
except Exception as e2:
|
79 |
-
logging.error(f"Failed to create EasyOCR model directory: {e2}")
|
80 |
-
# Last resort: use current directory
|
81 |
-
os.environ['EASYOCR_MODULE_PATH'] = os.path.join(os.getcwd(), 'easyocr_models')
|
82 |
-
os.makedirs(os.environ['EASYOCR_MODULE_PATH'], exist_ok=True)
|
83 |
-
logging.warning(f"Using current directory for EasyOCR models: {os.environ['EASYOCR_MODULE_PATH']}")
|
84 |
-
|
85 |
-
# Additional EasyOCR environment variables to prevent root directory access
|
86 |
-
os.environ['HOME'] = TEMP_DIR # Set HOME to temp directory
|
87 |
-
os.environ['USERPROFILE'] = TEMP_DIR # For Windows compatibility
|
88 |
-
os.environ['XDG_CACHE_HOME'] = os.path.join(TEMP_DIR, 'cache')
|
89 |
-
os.environ['XDG_CONFIG_HOME'] = os.path.join(TEMP_DIR, 'config')
|
90 |
-
os.environ['XDG_DATA_HOME'] = os.path.join(TEMP_DIR, 'data')
|
91 |
-
|
92 |
-
# Create additional directories that EasyOCR might need
|
93 |
-
for env_var in ['XDG_CACHE_HOME', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME']:
|
94 |
-
try:
|
95 |
-
os.makedirs(os.environ[env_var], exist_ok=True)
|
96 |
-
logging.info(f"Created directory for {env_var}: {os.environ[env_var]}")
|
97 |
-
except Exception as e:
|
98 |
-
logging.warning(f"Could not create directory for {env_var}: {e}")
|
99 |
-
|
100 |
-
# Configure Hugging Face Hub to use writable directories
|
101 |
-
os.environ['HF_HOME'] = os.path.join(TEMP_DIR, 'huggingface')
|
102 |
-
os.environ['HF_CACHE_HOME'] = os.path.join(TEMP_DIR, 'huggingface_cache')
|
103 |
-
os.environ['TRANSFORMERS_CACHE'] = os.path.join(TEMP_DIR, 'transformers_cache')
|
104 |
-
os.environ['HF_DATASETS_CACHE'] = os.path.join(TEMP_DIR, 'datasets_cache')
|
105 |
-
|
106 |
-
# Create Hugging Face directories
|
107 |
-
hf_dirs = ['HF_HOME', 'HF_CACHE_HOME', 'TRANSFORMERS_CACHE', 'HF_DATASETS_CACHE']
|
108 |
-
for env_var in hf_dirs:
|
109 |
-
try:
|
110 |
-
os.makedirs(os.environ[env_var], exist_ok=True)
|
111 |
-
logging.info(f"Created Hugging Face directory for {env_var}: {os.environ[env_var]}")
|
112 |
-
except Exception as e:
|
113 |
-
logging.warning(f"Could not create Hugging Face directory for {env_var}: {e}")
|
114 |
-
# Fallback to /tmp if the temp directory fails
|
115 |
-
fallback_path = os.path.join('/tmp', env_var.lower())
|
116 |
-
os.environ[env_var] = fallback_path
|
117 |
-
try:
|
118 |
-
os.makedirs(fallback_path, exist_ok=True)
|
119 |
-
logging.info(f"Using fallback Hugging Face directory for {env_var}: {fallback_path}")
|
120 |
-
except Exception as e2:
|
121 |
-
logging.error(f"Failed to create fallback Hugging Face directory for {env_var}: {e2}")
|
122 |
-
|
123 |
-
# Additional environment variables for other libraries that might access root directories
|
124 |
-
os.environ['TORCH_HOME'] = os.path.join(TEMP_DIR, 'torch')
|
125 |
-
os.environ['TENSORFLOW_HOME'] = os.path.join(TEMP_DIR, 'tensorflow')
|
126 |
-
os.environ['KERAS_HOME'] = os.path.join(TEMP_DIR, 'keras')
|
127 |
-
os.environ['MLFLOW_TRACKING_URI'] = 'file:' + os.path.join(TEMP_DIR, 'mlruns')
|
128 |
-
|
129 |
-
# Create additional library directories
|
130 |
-
lib_dirs = ['TORCH_HOME', 'TENSORFLOW_HOME', 'KERAS_HOME']
|
131 |
-
for env_var in lib_dirs:
|
132 |
-
try:
|
133 |
-
os.makedirs(os.environ[env_var], exist_ok=True)
|
134 |
-
logging.info(f"Created library directory for {env_var}: {os.environ[env_var]}")
|
135 |
-
except Exception as e:
|
136 |
-
logging.warning(f"Could not create library directory for {env_var}: {e}")
|
137 |
-
# Fallback to /tmp
|
138 |
-
fallback_path = os.path.join('/tmp', env_var.lower())
|
139 |
-
os.environ[env_var] = fallback_path
|
140 |
-
try:
|
141 |
-
os.makedirs(fallback_path, exist_ok=True)
|
142 |
-
logging.info(f"Using fallback library directory for {env_var}: {fallback_path}")
|
143 |
-
except Exception as e2:
|
144 |
-
logging.error(f"Failed to create fallback library directory for {env_var}: {e2}")
|
145 |
-
|
146 |
# Log startup information
|
147 |
logging.info("=" * 50)
|
148 |
logging.info("Docling Streamlit App Starting")
|
149 |
logging.info(f"Temp directory: {TEMP_DIR}")
|
150 |
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
|
|
|
151 |
logging.info(f"Current working directory: {os.getcwd()}")
|
152 |
logging.info(f"Python version: {sys.version}")
|
153 |
logging.info("=" * 50)
|
|
|
13 |
import re
|
14 |
import time
|
15 |
|
16 |
+
# Set environment variables IMMEDIATELY to prevent root filesystem access
|
17 |
+
# This must happen before any other imports or operations
|
18 |
+
|
19 |
+
# Get a writable temp directory first
|
20 |
+
# Choose the scratch directory used for every cache/config path below.
# Candidates are tried in order:
#   1. <system temp dir>/docling_temp
#   2. /tmp/docling_temp
#   3. the current working directory (taken as-is; not created or verified)
# tempfile.gettempdir() raises FileNotFoundError when no usable temp location
# exists and os.makedirs() reports failures as OSError subclasses, so only
# OSError is caught here; unrelated programming errors are no longer hidden.
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except OSError:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except OSError:
        TEMP_DIR = os.getcwd()
|
29 |
+
|
30 |
+
# Set all environment variables that libraries might use
|
31 |
+
# Point every library that would otherwise write under $HOME or / at TEMP_DIR.
# os.environ.update() overwrites unconditionally, so any value inherited from
# the parent process for these keys (including HOME and PYTHONPATH) is lost.
# NOTE(review): the STREAMLIT_SERVER_* values presumably only matter if
# Streamlit reads them after this point -- confirm they are not already
# consumed by the time this module executes.
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    # NOTE(review): CORS and XSRF protection are switched off here; confirm
    # this is acceptable for the deployment target.
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',

    # EasyOCR configuration (HOME/USERPROFILE are redirected as well)
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),

    # Hugging Face Hub configuration
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),

    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',

    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),

    # Generic temp/cache variables.
    # NOTE(review): PYTHONPATH is replaced, not extended -- child processes
    # lose any import path they would have inherited. Confirm this is wanted.
    'PYTHONPATH': TEMP_DIR,
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
|
80 |
+
|
81 |
+
# Create all necessary directories
|
82 |
+
# Names of the environment variables (populated earlier in this module) whose
# values are directories that should exist before the libraries are used.
# Several of them resolve to the same path; the order is kept as-is.
_DIRECTORY_ENV_VARS = (
    'EASYOCR_MODULE_PATH',
    'XDG_CACHE_HOME',
    'XDG_CONFIG_HOME',
    'XDG_DATA_HOME',
    'HF_HOME',
    'HF_CACHE_HOME',
    'TRANSFORMERS_CACHE',
    'HF_DATASETS_CACHE',
    'TORCH_HOME',
    'TENSORFLOW_HOME',
    'KERAS_HOME',
    'CACHE_DIR',
    'MODEL_CACHE_DIR',
    'CACHE',
    'MODELS',
    'DATA',
    'CONFIG',
    'HF_HUB_CACHE',
    'DIFFUSERS_CACHE',
    'ACCELERATE_CACHE',
)
directories_to_create = [os.environ[var_name] for var_name in _DIRECTORY_ENV_VARS]
|
104 |
+
|
105 |
+
# Monkey patch os.makedirs to prevent root directory access
|
106 |
+
original_makedirs = os.makedirs
|
107 |
+
|
108 |
+
def safe_makedirs(name, mode=0o777, exist_ok=False):
    """Create ``name`` like ``os.makedirs`` while keeping writes in allowed roots.

    Relative paths, and absolute paths located inside ``/tmp``, ``/app`` or
    ``TEMP_DIR``, are forwarded unchanged to the real ``os.makedirs``. Any
    other absolute path is NOT created where requested: a directory carrying
    the same final path component is created directly under ``TEMP_DIR``
    instead, and a notice is printed.

    Args:
        name: Directory to create (``str``, ``bytes`` or ``os.PathLike``).
        mode: Permission bits, forwarded to the real ``os.makedirs``.
        exist_ok: Forwarded to the real ``os.makedirs``.

    Returns:
        Whatever the real ``os.makedirs`` returns.

    Raises:
        OSError: Propagated from the real ``os.makedirs``.
    """
    # os.makedirs accepts str, bytes and PathLike objects. Decode to str only
    # for the policy check so the caller's original object is forwarded as-is
    # whenever no redirection happens.
    path = os.fsdecode(name)
    if not path.startswith('/'):
        return original_makedirs(name, mode, exist_ok)

    # Collapse "..", "." and redundant/trailing slashes so that "/tmp/../etc"
    # is judged as "/etc" and "/data/" still yields a usable basename.
    path = os.path.normpath(path)

    # TEMP_DIR is itself an allowed root: without this, directories nested
    # below it would be flattened whenever TEMP_DIR is outside /tmp and /app.
    allowed_roots = ('/tmp', '/app', os.path.normpath(TEMP_DIR))
    for root in allowed_roots:
        # Compare whole path components: "/tmpfoo" is not inside "/tmp".
        if path == root or path.startswith(root.rstrip('/') + '/'):
            return original_makedirs(name, mode, exist_ok)

    # Redirect to the sandbox, keeping only the last path component.
    safe_name = os.path.join(TEMP_DIR, os.path.basename(path))
    print(f"Redirecting root directory creation from {name} to {safe_name}")
    return original_makedirs(safe_name, mode, exist_ok)
|
118 |
+
|
119 |
+
# Apply the monkey patch
|
120 |
+
os.makedirs = safe_makedirs
|
121 |
+
|
122 |
+
# Best-effort creation of every configured directory. os.makedirs has been
# replaced by the safe_makedirs wrapper at this point, so the handler stays
# deliberately broad: one failure is reported on stdout and the remaining
# directories are still attempted.
for directory in directories_to_create:
    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as exc:
        print(f"Warning: Could not create directory {directory}: {exc}")
|
127 |
+
|
128 |
# Configure logging early to avoid issues
|
129 |
logging.basicConfig(
|
130 |
level=logging.INFO,
|
|
|
141 |
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
|
142 |
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
# Log startup information
|
145 |
logging.info("=" * 50)
|
146 |
logging.info("Docling Streamlit App Starting")
|
147 |
logging.info(f"Temp directory: {TEMP_DIR}")
|
148 |
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
|
149 |
+
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
|
150 |
logging.info(f"Current working directory: {os.getcwd()}")
|
151 |
logging.info(f"Python version: {sys.version}")
|
152 |
logging.info("=" * 50)
|