levalencia committed on
Commit
3db2fae
·
1 Parent(s): 5a08ed8

Implement comprehensive environment variable setup and directory management for Streamlit app

Browse files

- Added extensive environment variable configurations to prevent root filesystem access and ensure proper functioning of ML libraries.
- Implemented a robust directory creation process with error handling and fallback mechanisms for various library paths.
- Enhanced logging to provide detailed feedback on directory creation and environment variable settings, improving overall stability and user experience.

Files changed (1) hide show
  1. src/streamlit_app.py +113 -114
src/streamlit_app.py CHANGED
@@ -13,6 +13,118 @@ import difflib
13
  import re
14
  import time
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Configure logging early to avoid issues
17
  logging.basicConfig(
18
  level=logging.INFO,
@@ -29,125 +141,12 @@ AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
29
  AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
30
  AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
31
 
32
- # Use system temp directory instead of local directory to avoid permission issues
33
- TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
34
- try:
35
- os.makedirs(TEMP_DIR, exist_ok=True)
36
- logging.info(f"Using temp directory: {TEMP_DIR}")
37
- except PermissionError as e:
38
- logging.warning(f"Permission error creating temp dir {TEMP_DIR}: {e}")
39
- # Fallback to a subdirectory of the current working directory if temp dir fails
40
- TEMP_DIR = os.path.join(os.getcwd(), "temp_files")
41
- try:
42
- os.makedirs(TEMP_DIR, exist_ok=True)
43
- logging.info(f"Using fallback temp directory: {TEMP_DIR}")
44
- except PermissionError as e2:
45
- logging.warning(f"Permission error creating fallback temp dir {TEMP_DIR}: {e2}")
46
- # Last resort: use a directory that should be writable
47
- TEMP_DIR = "/tmp/docling_temp"
48
- try:
49
- os.makedirs(TEMP_DIR, exist_ok=True)
50
- logging.info(f"Using last resort temp directory: {TEMP_DIR}")
51
- except Exception as e3:
52
- logging.error(f"Failed to create any temp directory: {e3}")
53
- # Use current directory as absolute last resort
54
- TEMP_DIR = "."
55
- logging.warning(f"Using current directory as temp: {TEMP_DIR}")
56
-
57
- # Configure Streamlit to use writable directories
58
- os.environ['STREAMLIT_SERVER_FILE_WATCHER_TYPE'] = 'none'
59
- os.environ['STREAMLIT_SERVER_HEADLESS'] = 'true'
60
- os.environ['STREAMLIT_BROWSER_GATHER_USAGE_STATS'] = 'false'
61
-
62
- # Additional environment variables for Hugging Face deployment
63
- os.environ['STREAMLIT_SERVER_ENABLE_CORS'] = 'false'
64
- os.environ['STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION'] = 'false'
65
-
66
- # Configure EasyOCR to use writable directory for model storage
67
- os.environ['EASYOCR_MODULE_PATH'] = os.path.join(TEMP_DIR, 'easyocr_models')
68
- try:
69
- os.makedirs(os.environ['EASYOCR_MODULE_PATH'], exist_ok=True)
70
- logging.info(f"EasyOCR model directory: {os.environ['EASYOCR_MODULE_PATH']}")
71
- except Exception as e:
72
- logging.warning(f"Could not create EasyOCR model directory: {e}")
73
- # Fallback to /tmp if the temp directory fails
74
- os.environ['EASYOCR_MODULE_PATH'] = '/tmp/easyocr_models'
75
- try:
76
- os.makedirs(os.environ['EASYOCR_MODULE_PATH'], exist_ok=True)
77
- logging.info(f"Using fallback EasyOCR model directory: {os.environ['EASYOCR_MODULE_PATH']}")
78
- except Exception as e2:
79
- logging.error(f"Failed to create EasyOCR model directory: {e2}")
80
- # Last resort: use current directory
81
- os.environ['EASYOCR_MODULE_PATH'] = os.path.join(os.getcwd(), 'easyocr_models')
82
- os.makedirs(os.environ['EASYOCR_MODULE_PATH'], exist_ok=True)
83
- logging.warning(f"Using current directory for EasyOCR models: {os.environ['EASYOCR_MODULE_PATH']}")
84
-
85
- # Additional EasyOCR environment variables to prevent root directory access
86
- os.environ['HOME'] = TEMP_DIR # Set HOME to temp directory
87
- os.environ['USERPROFILE'] = TEMP_DIR # For Windows compatibility
88
- os.environ['XDG_CACHE_HOME'] = os.path.join(TEMP_DIR, 'cache')
89
- os.environ['XDG_CONFIG_HOME'] = os.path.join(TEMP_DIR, 'config')
90
- os.environ['XDG_DATA_HOME'] = os.path.join(TEMP_DIR, 'data')
91
-
92
- # Create additional directories that EasyOCR might need
93
- for env_var in ['XDG_CACHE_HOME', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME']:
94
- try:
95
- os.makedirs(os.environ[env_var], exist_ok=True)
96
- logging.info(f"Created directory for {env_var}: {os.environ[env_var]}")
97
- except Exception as e:
98
- logging.warning(f"Could not create directory for {env_var}: {e}")
99
-
100
- # Configure Hugging Face Hub to use writable directories
101
- os.environ['HF_HOME'] = os.path.join(TEMP_DIR, 'huggingface')
102
- os.environ['HF_CACHE_HOME'] = os.path.join(TEMP_DIR, 'huggingface_cache')
103
- os.environ['TRANSFORMERS_CACHE'] = os.path.join(TEMP_DIR, 'transformers_cache')
104
- os.environ['HF_DATASETS_CACHE'] = os.path.join(TEMP_DIR, 'datasets_cache')
105
-
106
- # Create Hugging Face directories
107
- hf_dirs = ['HF_HOME', 'HF_CACHE_HOME', 'TRANSFORMERS_CACHE', 'HF_DATASETS_CACHE']
108
- for env_var in hf_dirs:
109
- try:
110
- os.makedirs(os.environ[env_var], exist_ok=True)
111
- logging.info(f"Created Hugging Face directory for {env_var}: {os.environ[env_var]}")
112
- except Exception as e:
113
- logging.warning(f"Could not create Hugging Face directory for {env_var}: {e}")
114
- # Fallback to /tmp if the temp directory fails
115
- fallback_path = os.path.join('/tmp', env_var.lower())
116
- os.environ[env_var] = fallback_path
117
- try:
118
- os.makedirs(fallback_path, exist_ok=True)
119
- logging.info(f"Using fallback Hugging Face directory for {env_var}: {fallback_path}")
120
- except Exception as e2:
121
- logging.error(f"Failed to create fallback Hugging Face directory for {env_var}: {e2}")
122
-
123
- # Additional environment variables for other libraries that might access root directories
124
- os.environ['TORCH_HOME'] = os.path.join(TEMP_DIR, 'torch')
125
- os.environ['TENSORFLOW_HOME'] = os.path.join(TEMP_DIR, 'tensorflow')
126
- os.environ['KERAS_HOME'] = os.path.join(TEMP_DIR, 'keras')
127
- os.environ['MLFLOW_TRACKING_URI'] = 'file:' + os.path.join(TEMP_DIR, 'mlruns')
128
-
129
- # Create additional library directories
130
- lib_dirs = ['TORCH_HOME', 'TENSORFLOW_HOME', 'KERAS_HOME']
131
- for env_var in lib_dirs:
132
- try:
133
- os.makedirs(os.environ[env_var], exist_ok=True)
134
- logging.info(f"Created library directory for {env_var}: {os.environ[env_var]}")
135
- except Exception as e:
136
- logging.warning(f"Could not create library directory for {env_var}: {e}")
137
- # Fallback to /tmp
138
- fallback_path = os.path.join('/tmp', env_var.lower())
139
- os.environ[env_var] = fallback_path
140
- try:
141
- os.makedirs(fallback_path, exist_ok=True)
142
- logging.info(f"Using fallback library directory for {env_var}: {fallback_path}")
143
- except Exception as e2:
144
- logging.error(f"Failed to create fallback library directory for {env_var}: {e2}")
145
-
146
  # Log startup information
147
  logging.info("=" * 50)
148
  logging.info("Docling Streamlit App Starting")
149
  logging.info(f"Temp directory: {TEMP_DIR}")
150
  logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
 
151
  logging.info(f"Current working directory: {os.getcwd()}")
152
  logging.info(f"Python version: {sys.version}")
153
  logging.info("=" * 50)
 
13
  import re
14
  import time
15
 
16
+ # Set environment variables IMMEDIATELY to prevent root filesystem access
17
+ # This must happen before any other imports or operations
18
+
19
+ # Get a writable temp directory first
20
+ try:
21
+ TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
22
+ os.makedirs(TEMP_DIR, exist_ok=True)
23
+ except Exception:
24
+ try:
25
+ TEMP_DIR = "/tmp/docling_temp"
26
+ os.makedirs(TEMP_DIR, exist_ok=True)
27
+ except Exception:
28
+ TEMP_DIR = os.getcwd()
29
+
30
+ # Set all environment variables that libraries might use
31
+ os.environ.update({
32
+ # Streamlit configuration
33
+ 'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
34
+ 'STREAMLIT_SERVER_HEADLESS': 'true',
35
+ 'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
36
+ 'STREAMLIT_SERVER_ENABLE_CORS': 'false',
37
+ 'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
38
+
39
+ # EasyOCR configuration
40
+ 'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
41
+ 'HOME': TEMP_DIR,
42
+ 'USERPROFILE': TEMP_DIR,
43
+ 'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
44
+ 'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
45
+ 'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
46
+
47
+ # Hugging Face Hub configuration
48
+ 'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
49
+ 'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
50
+ 'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
51
+ 'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
52
+
53
+ # Other ML libraries
54
+ 'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
55
+ 'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
56
+ 'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
57
+ 'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
58
+
59
+ # Additional cache directories
60
+ 'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
61
+ 'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
62
+
63
+ # Additional environment variables to prevent root access
64
+ 'PYTHONPATH': TEMP_DIR,
65
+ 'TMPDIR': TEMP_DIR,
66
+ 'TEMP': TEMP_DIR,
67
+ 'TMP': TEMP_DIR,
68
+ 'CACHE': os.path.join(TEMP_DIR, 'cache'),
69
+ 'MODELS': os.path.join(TEMP_DIR, 'models'),
70
+ 'DATA': os.path.join(TEMP_DIR, 'data'),
71
+ 'CONFIG': os.path.join(TEMP_DIR, 'config'),
72
+
73
+ # Specific cache overrides
74
+ 'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
75
+ 'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
76
+ 'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
77
+ 'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
78
+ 'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
79
+ })
80
+
81
+ # Create all necessary directories
82
+ directories_to_create = [
83
+ os.environ['EASYOCR_MODULE_PATH'],
84
+ os.environ['XDG_CACHE_HOME'],
85
+ os.environ['XDG_CONFIG_HOME'],
86
+ os.environ['XDG_DATA_HOME'],
87
+ os.environ['HF_HOME'],
88
+ os.environ['HF_CACHE_HOME'],
89
+ os.environ['TRANSFORMERS_CACHE'],
90
+ os.environ['HF_DATASETS_CACHE'],
91
+ os.environ['TORCH_HOME'],
92
+ os.environ['TENSORFLOW_HOME'],
93
+ os.environ['KERAS_HOME'],
94
+ os.environ['CACHE_DIR'],
95
+ os.environ['MODEL_CACHE_DIR'],
96
+ os.environ['CACHE'],
97
+ os.environ['MODELS'],
98
+ os.environ['DATA'],
99
+ os.environ['CONFIG'],
100
+ os.environ['HF_HUB_CACHE'],
101
+ os.environ['DIFFUSERS_CACHE'],
102
+ os.environ['ACCELERATE_CACHE'],
103
+ ]
104
+
105
+ # Monkey patch os.makedirs to prevent root directory access
106
# If this module-level code is executed more than once in the same process,
# os.makedirs may already be the wrapper installed by an earlier execution.
# Recover the real function from that wrapper so wrappers never stack up.
original_makedirs = getattr(os.makedirs, "_original_makedirs", os.makedirs)


def safe_makedirs(name, mode=0o777, exist_ok=False):
    """Create ``name`` like ``os.makedirs``, redirecting disallowed absolute paths.

    A path is created unchanged when it is relative, or when it lies inside
    ``/tmp``, ``/app`` or ``TEMP_DIR``. Any other path that starts with ``/``
    is redirected to ``TEMP_DIR/<last path component>`` and a notice is
    printed.

    Args:
        name: Directory to create; ``str``, ``bytes`` or ``os.PathLike``.
        mode: Permission bits forwarded to the real ``os.makedirs``.
        exist_ok: Forwarded to the real ``os.makedirs``.

    Returns:
        Whatever the real ``os.makedirs`` returns (``None``).

    Raises:
        OSError: Propagated unchanged from the real ``os.makedirs``.
        TypeError: If ``name`` is not a valid path type.

    Note:
        Redirection keeps only the final path component, so two different
        parents with the same leaf name map to the same directory, and the
        caller still believes the originally requested path was created.
    """
    # os.fsdecode accepts str, bytes and os.PathLike, matching os.makedirs.
    raw = os.fsdecode(name)
    # Normalise so ".." segments and trailing slashes cannot defeat the checks.
    path = os.path.normpath(raw)

    def _is_within(root):
        # Compare whole path components, so "/tmpfoo" is not treated as "/tmp".
        root = os.path.normpath(os.fsdecode(root))
        return path == root or path.startswith(root.rstrip(os.sep) + os.sep)

    allowed_roots = ('/tmp', '/app', TEMP_DIR)
    if not raw.startswith('/') or any(_is_within(root) for root in allowed_roots):
        return original_makedirs(name, mode, exist_ok)

    # Redirect to temp directory
    safe_name = os.path.join(TEMP_DIR, os.path.basename(path))
    print(f"Redirecting root directory creation from {name} to {safe_name}")
    return original_makedirs(safe_name, mode, exist_ok)


# Remember the real implementation on the wrapper (read back by the getattr
# above on a later execution), then apply the monkey patch.
safe_makedirs._original_makedirs = original_makedirs
os.makedirs = safe_makedirs
121
+
122
+ for directory in directories_to_create:
123
+ try:
124
+ os.makedirs(directory, exist_ok=True)
125
+ except Exception as e:
126
+ print(f"Warning: Could not create directory {directory}: {e}")
127
+
128
  # Configure logging early to avoid issues
129
  logging.basicConfig(
130
  level=logging.INFO,
 
141
  AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
142
  AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  # Log startup information
145
  logging.info("=" * 50)
146
  logging.info("Docling Streamlit App Starting")
147
  logging.info(f"Temp directory: {TEMP_DIR}")
148
  logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
149
+ logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
150
  logging.info(f"Current working directory: {os.getcwd()}")
151
  logging.info(f"Python version: {sys.version}")
152
  logging.info("=" * 50)