File size: 4,879 Bytes
cfeb3a6
 
 
 
 
 
 
 
90b0a17
0daea93
cfeb3a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0daea93
 
cfeb3a6
 
 
 
 
 
 
 
 
90b0a17
 
 
 
 
17e3d1d
90b0a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cfeb3a6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
from pathlib import Path
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) into the process environment.
# This must run before the Settings class body below, which reads its values
# with os.getenv() at class-definition time.
load_dotenv()


class Settings:
    """Application configuration, read from the environment at import time.

    Every value is a class attribute evaluated once, when the class body
    runs. Changes made to the environment afterwards are not picked up.
    """

    # API credentials; None when the corresponding variable is unset.
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

    # File size limit in megabytes; validate_config() requires it to be positive.
    MAX_FILE_SIZE_MB = 50

    # Supported file types, written as lowercase extensions without a dot.
    SUPPORTED_FILE_TYPES = [
        "pdf",
        "txt",
        "png",
        "jpg",
        "jpeg",
        "docx",
        "xlsx",
        "csv",
        "md",
        "json",
        "xml",
        "html",
        "py",
        "js",
        "ts",
        "doc",
        "xls",
        "ppt",
        "pptx",
    ]

    # Use /tmp for temporary files on Hugging Face Spaces (or override with TEMP_DIR env var)
    TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/data_extractor_temp"))
    DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "python:3.12-slim")

    # Model name per role; each can be overridden by an environment variable
    # of the same name.
    COORDINATOR_MODEL = os.getenv("COORDINATOR_MODEL", "gemini-2.5-pro")
    PROMPT_ENGINEER_MODEL = os.getenv("PROMPT_ENGINEER_MODEL", "gemini-2.5-pro")
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.0-flash")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.0-flash")

    # Thinking budget per role.
    # NOTE(review): -1 presumably requests a dynamic/automatic budget from the
    # Gemini API - confirm against the code that consumes these values.
    COORDINATOR_MODEL_THINKING_BUDGET = 2048
    PROMPT_ENGINEER_MODEL_THINKING_BUDGET = 2048
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = -1
    DATA_ARRANGER_MODEL_THINKING_BUDGET = 3072
    CODE_GENERATOR_MODEL_THINKING_BUDGET = 3072

    @classmethod
    def validate_config(cls):
        """Validate configuration and create necessary directories.

        Creates ``TEMP_DIR`` (including parents) when it does not exist and
        probes it with a throwaway file to confirm it is writable.

        Returns:
            True when no errors were found. Warnings alone do not fail
            validation; they are emitted through the ``logging`` module.

        Raises:
            ValueError: If any check fails. The message lists every error
                found, followed by any warnings.
        """
        errors = []
        warnings = []

        # Check required API keys
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required - get it from Google AI Studio")

        # Check for optional but recommended API keys
        if not cls.OPENAI_API_KEY:
            warnings.append("OPENAI_API_KEY not set - OpenAI models will not be available")

        # Validate and create temp directory. The broad catches are deliberate:
        # any failure here is reported as a configuration error rather than
        # aborting before the remaining checks have run.
        try:
            cls.TEMP_DIR.mkdir(exist_ok=True, parents=True)
        except Exception as e:
            errors.append(f"Cannot create temp directory {cls.TEMP_DIR}: {e}")
        else:
            # Test write permissions
            try:
                test_file = cls.TEMP_DIR / ".write_test"
                test_file.write_text("test")
                test_file.unlink()
            except Exception as e:
                errors.append(f"Cannot write to temp directory {cls.TEMP_DIR}: {e}")

        # Validate file size limits
        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large")

        # Validate supported file types
        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Validate model names for every configured role
        model_fields = (
            "COORDINATOR_MODEL",
            "PROMPT_ENGINEER_MODEL",
            "DATA_EXTRACTOR_MODEL",
            "DATA_ARRANGER_MODEL",
            "CODE_GENERATOR_MODEL",
        )
        for field in model_fields:
            model_name = getattr(cls, field)
            if not model_name:
                errors.append(f"{field} cannot be empty")
            elif not model_name.startswith(("gemini-", "gpt-", "claude-")):
                # Unknown prefix is only a warning: the name may still be valid.
                warnings.append(f"{field} '{model_name}' may not be a valid model name")

        # Return validation results
        if errors:
            error_msg = "Configuration validation failed:\n" + "\n".join(
                f"- {error}" for error in errors
            )
            if warnings:
                error_msg += "\n\nWarnings:\n" + "\n".join(
                    f"- {warning}" for warning in warnings
                )
            raise ValueError(error_msg)

        if warnings:
            import logging

            logger = logging.getLogger(__name__)
            logger.warning(
                "Configuration warnings:\n%s",
                "\n".join(f"- {warning}" for warning in warnings),
            )

        return True

    @classmethod
    def get_debug_info(cls):
        """Get debug information about current configuration.

        Returns:
            A dict describing the interpreter, platform, temp directory,
            file limits and configured models. API keys are reported only
            as booleans, never by value.
        """
        import platform
        import sys

        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "supported_file_types": len(cls.SUPPORTED_FILE_TYPES),
            "max_file_size_mb": cls.MAX_FILE_SIZE_MB,
            "has_google_api_key": bool(cls.GOOGLE_API_KEY),
            "has_openai_api_key": bool(cls.OPENAI_API_KEY),
            "models": {
                "coordinator": cls.COORDINATOR_MODEL,
                "prompt_engineer": cls.PROMPT_ENGINEER_MODEL,
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL,
            },
        }


# Module-level Settings instance. Every value it exposes is a class attribute,
# so it reads the same configuration as the Settings class itself.
settings = Settings()