|
""" |
|
Configuration settings for Data Extractor Using Gemini |
|
Optimized for Gemini-only model usage with robust directory management |
|
""" |
|
|
|
import os |
|
from pathlib import Path |
|
from dotenv import load_dotenv |
|
import logging |
|
|
|
|
|
load_dotenv() |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class Settings:
    """Configuration settings with Gemini-only model support and robust directory management.

    Every value is read from environment variables (loaded via ``load_dotenv()``
    at import time) with sensible defaults. All methods are classmethods, so the
    class is usable without instantiation.
    """

    # Gemini API key (required at runtime); obtained from
    # https://aistudio.google.com/app/apikey
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    # Gemini model identifiers used by each agent role.
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")

    # Per-model "thinking" token budgets.
    # NOTE: the env var names deliberately omit the "_MODEL" infix
    # (e.g. DATA_EXTRACTOR_THINKING_BUDGET) -- keep attribute/env pairs in sync.
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
    DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
    CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))

    # File-upload constraints.
    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
    SUPPORTED_FILE_TYPES = [
        "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
        "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx"
    ]

    # Root working directory; every other directory is derived from it.
    WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))

    TEMP_DIR = WORKING_DIR / "temp"
    INPUT_DIR = WORKING_DIR / "input"
    OUTPUT_DIR = WORKING_DIR / "output"
    CACHE_DIR = WORKING_DIR / "cache"
    LOGS_DIR = WORKING_DIR / "logs"

    # Retry/timeout policy for agent calls.
    MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
    RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
    AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))

    # Caching policy.
    ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
    CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))

    @staticmethod
    def _prepare_directory(directory: Path) -> None:
        """Create *directory* (with parents) and verify it is writable.

        Writability is probed by writing and deleting a small marker file;
        any OSError from mkdir or the probe propagates to the caller.
        """
        directory.mkdir(parents=True, exist_ok=True)
        probe = directory / ".write_test"
        probe.write_text("test")
        probe.unlink()

    @classmethod
    def initialize_directories(cls):
        """Initialize all required directories with proper permissions.

        Returns:
            list[str]: string paths of the directories created/verified.

        Raises:
            RuntimeError: if any directory cannot be created or written to.
        """
        directories = [
            cls.WORKING_DIR,
            cls.TEMP_DIR,
            cls.INPUT_DIR,
            cls.OUTPUT_DIR,
            cls.CACHE_DIR,
            cls.LOGS_DIR
        ]

        created_dirs = []
        for directory in directories:
            try:
                cls._prepare_directory(directory)
                created_dirs.append(str(directory))
                logger.debug(f"Directory initialized: {directory}")
            except Exception as e:
                logger.error(f"Failed to initialize directory {directory}: {e}")
                # Chain the original exception so the root cause stays visible.
                raise RuntimeError(f"Cannot create or write to directory {directory}: {e}") from e

        logger.info(f"Successfully initialized {len(created_dirs)} directories")
        return created_dirs

    @classmethod
    def validate_config(cls):
        """Comprehensive configuration validation with detailed error reporting.

        Errors are fatal; warnings are logged but do not fail validation.

        Returns:
            bool: True when no errors were found.

        Raises:
            ValueError: listing every error (and warning) detected.
        """
        errors = []
        warnings = []

        # API key: required; real keys are comfortably longer than 30 chars.
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
        elif len(cls.GOOGLE_API_KEY) < 30:
            warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")

        # Models: must be non-empty Gemini identifiers.
        model_settings = [
            ("DATA_EXTRACTOR_MODEL", cls.DATA_EXTRACTOR_MODEL),
            ("DATA_ARRANGER_MODEL", cls.DATA_ARRANGER_MODEL),
            ("CODE_GENERATOR_MODEL", cls.CODE_GENERATOR_MODEL),
        ]
        for name, model in model_settings:
            if not model:
                errors.append(f"{name} cannot be empty")
            elif not model.startswith("gemini-"):
                errors.append(f"{name} must be a Gemini model (starts with 'gemini-'), got: {model}")

        # Directory setup failure is reported as a config error, not raised here.
        try:
            cls.initialize_directories()
        except Exception as e:
            errors.append(f"Directory initialization failed: {e}")

        # File-size limits.
        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")

        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Thinking budgets: warn outside the 1024..8192 range.
        budgets = [
            (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
            (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
            (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET")
        ]
        for budget, name in budgets:
            if budget < 1024:
                warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
            elif budget > 8192:
                warnings.append(f"{name} ({budget}) is very high - may be unnecessary")

        # Retry-policy sanity checks.
        if cls.MAX_RETRIES < 1:
            warnings.append("MAX_RETRIES should be at least 1")
        elif cls.MAX_RETRIES > 10:
            warnings.append("MAX_RETRIES is very high - may cause long delays")

        if errors:
            error_msg = "❌ Configuration validation failed:\n"
            error_msg += "\n".join(f"  • {error}" for error in errors)

            if warnings:
                error_msg += "\n\n⚠️ Warnings:\n"
                error_msg += "\n".join(f"  • {warning}" for warning in warnings)

            raise ValueError(error_msg)

        if warnings:
            logger.warning("Configuration warnings detected:")
            for warning in warnings:
                logger.warning(f"  • {warning}")

        logger.info("✅ Configuration validation successful")
        return True

    @classmethod
    def get_session_directories(cls, session_id: str):
        """Get session-specific directory structure.

        NOTE(review): session_id is used verbatim as a path component --
        callers are assumed to pass a safe identifier (no path separators).

        Returns:
            dict: keys "base", "input", "output", "temp", "cache" mapping to
            Path objects under WORKING_DIR/<session_id>. Nothing is created.
        """
        session_base = cls.WORKING_DIR / session_id

        return {
            "base": session_base,
            "input": session_base / "input",
            "output": session_base / "output",
            "temp": session_base / "temp",
            "cache": session_base / "cache"
        }

    @classmethod
    def create_session_directories(cls, session_id: str):
        """Create and validate session-specific directories.

        Returns:
            dict: same mapping as get_session_directories().

        Raises:
            RuntimeError: if any directory cannot be created or written to.
        """
        session_dirs = cls.get_session_directories(session_id)

        created = []
        for name, directory in session_dirs.items():
            try:
                cls._prepare_directory(directory)
                created.append(str(directory))
            except Exception as e:
                logger.error(f"Failed to create session directory {name}: {e}")
                # Chain the original exception so the root cause stays visible.
                raise RuntimeError(f"Cannot create session directory {directory}: {e}") from e

        logger.info(f"Created {len(created)} session directories for {session_id}")
        return session_dirs

    @classmethod
    def cleanup_session(cls, session_id: str, keep_output: bool = True):
        """Clean up session directories with option to preserve output.

        Best-effort: removal failures are logged as warnings, never raised.

        Args:
            session_id: identifier previously used to create the session.
            keep_output: when True (default), the "output" directory survives.

        Returns:
            list[str]: string paths of the directories actually removed.
        """
        session_dirs = cls.get_session_directories(session_id)

        import shutil  # local import: only needed for teardown
        cleaned = []

        for name, directory in session_dirs.items():
            if keep_output and name == "output":
                continue

            if directory.exists():
                try:
                    shutil.rmtree(directory)
                    cleaned.append(str(directory))
                except Exception as e:
                    logger.warning(f"Could not clean {name} directory: {e}")

        logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
        return cleaned

    @classmethod
    def get_debug_info(cls):
        """Get comprehensive debug information about current configuration.

        Returns a plain dict that is safe to log: the API key value itself is
        never included, only its presence and length.
        """
        import platform
        import sys

        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "models": {
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL,
            },
            "api_keys": {
                "google_api_key_present": bool(cls.GOOGLE_API_KEY),
                "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0
            }
        }
|
|
|
|
|
|
|
# Shared module-level settings instance (all state lives on the class itself).
settings = Settings()

# Best-effort directory bootstrap at import time: a failure is logged rather
# than raised, so merely importing this module never crashes the application.
try:
    settings.initialize_directories()
except Exception as exc:
    logger.error(f"Failed to initialize settings: {exc}")
else:
    logger.debug("Settings initialized successfully")
|
|
|
|
|
|