|
import os |
|
from pathlib import Path |
|
from dotenv import load_dotenv |
|
|
|
load_dotenv() |
|
|
|
|
|
class Settings:
    """Application configuration, sourced from environment variables.

    Values are read once at class-definition time (after ``load_dotenv()``
    has populated ``os.environ`` at module import). Call
    :meth:`validate_config` at startup to fail fast on missing/invalid
    settings and to create the temp directory.
    """

    # Required credential for Google Generative AI (from Google AI Studio).
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    # Maximum accepted input file size, in megabytes.
    MAX_FILE_SIZE_MB = 50

    # File extensions (lowercase, without the leading dot) the pipeline accepts.
    SUPPORTED_FILE_TYPES = [
        "pdf",
        "txt",
        "png",
        "jpg",
        "jpeg",
        "docx",
        "xlsx",
        "csv",
        "md",
        "json",
        "xml",
        "html",
        "py",
        "js",
        "ts",
        "doc",
        "xls",
        "ppt",
        "pptx",
    ]

    # Scratch directory for intermediate files; created by validate_config().
    TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/data_extractor_temp"))
    # Container image used for sandboxed execution of generated code.
    DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "python:3.12-slim")

    # Per-agent model names; each can be overridden via its environment variable.
    COORDINATOR_MODEL = os.getenv("COORDINATOR_MODEL", "gemini-2.5-pro")
    PROMPT_ENGINEER_MODEL = os.getenv("PROMPT_ENGINEER_MODEL", "gemini-2.5-pro")
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-pro")

    # Per-agent "thinking" token budgets.
    # NOTE(review): -1 presumably means "unlimited / model default" — confirm
    # against the consumer of these values.
    COORDINATOR_MODEL_THINKING_BUDGET = 2048
    PROMPT_ENGINEER_MODEL_THINKING_BUDGET = 2048
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = -1
    DATA_ARRANGER_MODEL_THINKING_BUDGET = 3072
    CODE_GENERATOR_MODEL_THINKING_BUDGET = 3072

    @classmethod
    def validate_config(cls):
        """Validate configuration and create necessary directories.

        Collects all problems before reporting: hard failures go into
        ``errors`` and abort with ``ValueError``; soft issues go into
        ``warnings`` and are logged (or appended to the error message when
        errors are also present).

        Returns:
            True when the configuration is valid.

        Raises:
            ValueError: if any hard validation error was found; the message
                lists every error (and any warnings) found.
        """
        errors = []
        warnings = []

        # Google API key is mandatory; everything else has defaults.
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required - get it from Google AI Studio")

        # OpenAI key is optional — its absence only narrows model choice.
        openai_key = os.getenv("OPENAI_API_KEY")
        if not openai_key:
            warnings.append("OPENAI_API_KEY not set - OpenAI models will not be available")

        # Ensure the temp directory exists and is actually writable
        # (mkdir alone does not prove write permission).
        try:
            cls.TEMP_DIR.mkdir(exist_ok=True, parents=True)

            test_file = cls.TEMP_DIR / ".write_test"
            try:
                test_file.write_text("test")
                test_file.unlink()
            except Exception as e:
                errors.append(f"Cannot write to temp directory {cls.TEMP_DIR}: {e}")
        except Exception as e:
            errors.append(f"Cannot create temp directory {cls.TEMP_DIR}: {e}")

        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large")

        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Fix: previously only three of the five model settings were checked,
        # so an empty or mistyped coordinator/prompt-engineer model slipped
        # through validation. Check all five consistently.
        model_fields = [
            'COORDINATOR_MODEL',
            'PROMPT_ENGINEER_MODEL',
            'DATA_EXTRACTOR_MODEL',
            'DATA_ARRANGER_MODEL',
            'CODE_GENERATOR_MODEL',
        ]
        for field in model_fields:
            model_name = getattr(cls, field)
            if not model_name:
                errors.append(f"{field} cannot be empty")
            elif not model_name.startswith(('gemini-', 'gpt-', 'claude-')):
                warnings.append(f"{field} '{model_name}' may not be a valid model name")

        if errors:
            error_msg = "Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors)
            if warnings:
                error_msg += "\n\nWarnings:\n" + "\n".join(f"- {warning}" for warning in warnings)
            raise ValueError(error_msg)

        if warnings:
            # Imported lazily so logging setup elsewhere happens first.
            import logging
            logger = logging.getLogger(__name__)
            logger.warning("Configuration warnings:\n" + "\n".join(f"- {warning}" for warning in warnings))

        return True

    @classmethod
    def get_debug_info(cls):
        """Get debug information about current configuration.

        Returns:
            dict: a snapshot of the environment and settings suitable for
            logging/diagnostics. API keys are reported only as booleans so
            the dict is safe to log.
        """
        import platform
        import sys

        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "supported_file_types": len(cls.SUPPORTED_FILE_TYPES),
            "max_file_size_mb": cls.MAX_FILE_SIZE_MB,
            "has_google_api_key": bool(cls.GOOGLE_API_KEY),
            "has_openai_api_key": bool(os.getenv("OPENAI_API_KEY")),
            "models": {
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL
            }
        }
|
|
|
|
|
# Module-level singleton: import `settings` from this module for config access.
settings = Settings()
|
|