# Provenance: methunraj — commit 8b21729 "feat: Implement revenue data organization workflow with JSON output"
"""
Configuration settings for Data Extractor Using Gemini
Optimized for Gemini-only model usage with robust directory management
"""
import os
from pathlib import Path
from dotenv import load_dotenv
import logging
# Load environment variables
load_dotenv()
logger = logging.getLogger(__name__)
class Settings:
    """Configuration settings with Gemini-only model support and robust directory management.

    All values are read from environment variables at class-definition time,
    with sensible defaults. Directory helpers create and verify a working
    tree under WORKING_DIR, plus per-session subtrees keyed by session id.
    """

    # === GEMINI MODEL CONFIGURATION ===
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    # Gemini model specifications - using gemini-2.5-flash (supports thinking budget)
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")

    # Thinking budgets optimized for each task type
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
    DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
    CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))

    # === FILE PROCESSING CONFIGURATION ===
    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
    SUPPORTED_FILE_TYPES = [
        "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
        "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx"
    ]

    # === DIRECTORY MANAGEMENT ===
    # Centralized working directory - all operations happen within this directory
    WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))

    # Subdirectories within working directory
    TEMP_DIR = WORKING_DIR / "temp"
    INPUT_DIR = WORKING_DIR / "input"
    OUTPUT_DIR = WORKING_DIR / "output"
    CACHE_DIR = WORKING_DIR / "cache"
    LOGS_DIR = WORKING_DIR / "logs"

    # === WORKFLOW CONFIGURATION ===
    # Retry and timeout settings
    MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
    RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
    AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))

    # Cache settings
    ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
    CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))

    @classmethod
    def _init_directory(cls, directory: Path) -> None:
        """Create *directory* (with parents) and verify it is writable.

        Writes and removes a small probe file so permission problems surface
        immediately rather than later, mid-workflow. Raises OSError on failure.
        """
        directory.mkdir(parents=True, exist_ok=True)
        probe = directory / ".write_test"
        probe.write_text("test")
        probe.unlink()

    @classmethod
    def initialize_directories(cls) -> list:
        """Initialize all required directories with proper permissions.

        Returns:
            List of directory paths (as strings) that were created/verified.

        Raises:
            RuntimeError: If any directory cannot be created or written to.
        """
        directories = [
            cls.WORKING_DIR,
            cls.TEMP_DIR,
            cls.INPUT_DIR,
            cls.OUTPUT_DIR,
            cls.CACHE_DIR,
            cls.LOGS_DIR
        ]

        created_dirs = []
        for directory in directories:
            try:
                cls._init_directory(directory)
                created_dirs.append(str(directory))
                logger.debug(f"Directory initialized: {directory}")
            except Exception as e:
                logger.error(f"Failed to initialize directory {directory}: {e}")
                # Chain the original cause so the OSError details survive.
                raise RuntimeError(f"Cannot create or write to directory {directory}: {e}") from e

        logger.info(f"Successfully initialized {len(created_dirs)} directories")
        return created_dirs

    @classmethod
    def validate_config(cls) -> bool:
        """Comprehensive configuration validation with detailed error reporting.

        Collects all problems before reporting: hard failures go into
        ``errors`` (raised as one ValueError), soft issues into ``warnings``
        (logged). Returns True on success.
        """
        errors = []
        warnings = []

        # === CRITICAL VALIDATIONS ===
        # Google API Key validation
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
        elif len(cls.GOOGLE_API_KEY) < 30:
            warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")

        # Model name validation - every configured model must be a Gemini model.
        model_names = ["DATA_EXTRACTOR_MODEL", "DATA_ARRANGER_MODEL", "CODE_GENERATOR_MODEL"]
        gemini_models = [cls.DATA_EXTRACTOR_MODEL, cls.DATA_ARRANGER_MODEL, cls.CODE_GENERATOR_MODEL]
        for name, model in zip(model_names, gemini_models):
            if not model:
                errors.append(f"{name} cannot be empty")
            elif not model.startswith("gemini-"):
                errors.append(f"{name} must be a Gemini model (starts with 'gemini-'), got: {model}")

        # Directory validation
        try:
            cls.initialize_directories()
        except Exception as e:
            errors.append(f"Directory initialization failed: {e}")

        # === MODERATE VALIDATIONS ===
        # File size validation
        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")

        # Supported file types validation
        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Thinking budget validation - sanity range, not hard limits.
        budgets = [
            (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
            (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
            (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET")
        ]
        for budget, name in budgets:
            if budget < 1024:
                warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
            elif budget > 8192:
                warnings.append(f"{name} ({budget}) is very high - may be unnecessary")

        # Retry configuration validation
        if cls.MAX_RETRIES < 1:
            warnings.append("MAX_RETRIES should be at least 1")
        elif cls.MAX_RETRIES > 10:
            warnings.append("MAX_RETRIES is very high - may cause long delays")

        # === RESULT PROCESSING ===
        if errors:
            error_msg = "❌ Configuration validation failed:\n"
            error_msg += "\n".join(f"  • {error}" for error in errors)
            if warnings:
                error_msg += "\n\n⚠️ Warnings:\n"
                error_msg += "\n".join(f"  • {warning}" for warning in warnings)
            raise ValueError(error_msg)

        if warnings:
            logger.warning("Configuration warnings detected:")
            for warning in warnings:
                logger.warning(f"  • {warning}")

        logger.info("✅ Configuration validation successful")
        return True

    @classmethod
    def get_session_directories(cls, session_id: str) -> dict:
        """Get session-specific directory structure.

        Returns a dict mapping role names ("base", "input", "output",
        "temp", "cache") to Path objects under WORKING_DIR/<session_id>.
        Purely computes paths; does not touch the filesystem.
        """
        session_base = cls.WORKING_DIR / session_id
        return {
            "base": session_base,
            "input": session_base / "input",
            "output": session_base / "output",
            "temp": session_base / "temp",
            "cache": session_base / "cache"
        }

    @classmethod
    def create_session_directories(cls, session_id: str) -> dict:
        """Create and validate session-specific directories.

        Returns the same mapping as :meth:`get_session_directories`.

        Raises:
            RuntimeError: If any session directory cannot be created/written.
        """
        session_dirs = cls.get_session_directories(session_id)
        created = []

        for name, directory in session_dirs.items():
            try:
                cls._init_directory(directory)
                created.append(str(directory))
            except Exception as e:
                logger.error(f"Failed to create session directory {name}: {e}")
                raise RuntimeError(f"Cannot create session directory {directory}: {e}") from e

        logger.info(f"Created {len(created)} session directories for {session_id}")
        return session_dirs

    @classmethod
    def cleanup_session(cls, session_id: str, keep_output: bool = True) -> list:
        """Clean up session directories with option to preserve output.

        Best-effort: failures to remove a directory are logged as warnings,
        not raised. Returns the list of removed directory paths (as strings).
        """
        session_dirs = cls.get_session_directories(session_id)
        import shutil

        cleaned = []
        for name, directory in session_dirs.items():
            if keep_output and name == "output":
                continue  # preserve extraction results by default
            if directory.exists():
                try:
                    shutil.rmtree(directory)
                    cleaned.append(str(directory))
                except Exception as e:
                    logger.warning(f"Could not clean {name} directory: {e}")

        logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
        return cleaned

    @classmethod
    def get_debug_info(cls) -> dict:
        """Get comprehensive debug information about current configuration.

        Never includes the API key itself - only its presence and length.
        """
        import platform
        import sys

        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "models": {
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL,
            },
            "api_keys": {
                "google_api_key_present": bool(cls.GOOGLE_API_KEY),
                "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0
            }
        }
# Shared settings singleton used throughout the application.
settings = Settings()

# Eagerly build the working-directory tree at import time so downstream code
# can assume it exists. Failures are logged but deliberately NOT re-raised:
# importing this module must never hard-fail (e.g. in read-only environments).
try:
    settings.initialize_directories()
except Exception as exc:
    logger.error(f"Failed to initialize settings: {exc}")
else:
    logger.debug("Settings initialized successfully")