File size: 3,660 Bytes
59aaeae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b1b89b
 
 
 
94e74f0
 
 
9b1b89b
 
 
94e74f0
 
9b1b89b
 
 
59aaeae
 
94e74f0
a6b797a
59aaeae
 
 
 
94e74f0
59aaeae
 
 
 
94e74f0
59aaeae
 
94e74f0
59aaeae
c04ffe5
 
88d3e04
 
 
 
c04ffe5
 
88d3e04
59aaeae
 
75ead00
59aaeae
75ead00
 
 
59aaeae
75ead00
59aaeae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# config.py
"""
Configuration file for Mistral OCR processing.
Contains API key and other settings.
"""
import os
import logging
from dotenv import load_dotenv

# Module-level logger registered under the fixed name "config".
# Only the logger object is obtained here; this module sets no handlers or
# levels, so where messages end up depends on logging setup done elsewhere.
logger = logging.getLogger("config")

# Load environment variables from .env file if it exists.
# This runs at import time, before the os.environ lookups further down.
load_dotenv()

# Mistral API key handling - prioritizing Hugging Face environment.
# The variables below are checked in order and the first one holding a
# non-blank value wins:
# 1. HF_API_KEY environment variable (Hugging Face standard)
# 2. HUGGING_FACE_API_KEY environment variable (alternative name)
# 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
# 4. MISTRAL_API_KEY environment variable (fallback)
# 5. Empty string (will show warning in app)
_API_KEY_ENV_VARS = (
    "HF_API_KEY",
    "HUGGING_FACE_API_KEY",
    "HF_MISTRAL_API_KEY",
    "MISTRAL_API_KEY",
)

# Blank or whitespace-only values are skipped rather than accepted, so a
# placeholder such as HF_API_KEY="" cannot mask a real key stored under a
# lower-priority name. filter(None, ...) drops the empty strings.
MISTRAL_API_KEY = next(
    filter(None, (os.environ.get(name, "").strip() for name in _API_KEY_ENV_VARS)),
    "",
)

if not MISTRAL_API_KEY:
    logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")

# Test-mode switch. Per the original note, test mode allows operation without
# a valid API key; leave this False to use actual API calls with Mistral API.
# The value is a hard-coded constant: nothing in this module reads it from the
# environment, so changing it means editing this file.
TEST_MODE: bool = False

# Model settings with fallbacks.
# Each name can be overridden through its environment variable. The value is
# stripped, and a variable that is unset, empty or whitespace-only falls back
# to the default, so a blank entry never produces an empty model name.
OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "").strip() or "mistral-ocr-latest"
# Updated from ministral-8b-latest
TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "").strip() or "mistral-small-latest"
# faster model that supports vision
VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "").strip() or "mistral-small-latest"

# Helpers shared by the settings dictionaries below so that every
# environment override is parsed the same way.
_TRUTHY_VALUES = frozenset({"true", "1", "yes"})


def _env_bool(name, default):
    """Return the boolean value of environment variable *name*.

    An unset variable yields *default*. A set variable is True only when,
    after trimming whitespace and lower-casing, it equals "true", "1" or
    "yes"; every other value, including an empty string, is False.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUTHY_VALUES


def _env_number(name, default, cast):
    """Return environment variable *name* converted with *cast*.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.
        cast: Conversion callable, ``int`` or ``float``.

    Raises:
        ValueError: If the variable holds a non-blank value that *cast*
            rejects. The message names the variable so the bad setting is
            easy to locate.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        # An empty entry such as `TARGET_DPI=` means "not configured";
        # use the default instead of failing on int("") / float("").
        return default
    try:
        return cast(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be a valid {cast.__name__}, got {raw!r}"
        ) from err


# Image preprocessing settings optimized for historical documents
# These can be customized from environment variables
IMAGE_PREPROCESSING = {
    "enhance_contrast": _env_number("ENHANCE_CONTRAST", 3.5, float),  # Increased contrast for better text recognition
    "sharpen": _env_bool("SHARPEN", True),
    "denoise": _env_bool("DENOISE", True),
    "max_size_mb": _env_number("MAX_IMAGE_SIZE_MB", 200.0, float),  # Increased size limit for better quality
    "target_dpi": _env_number("TARGET_DPI", 300, int),  # Target DPI for scaling
    "compression_quality": _env_number("COMPRESSION_QUALITY", 100, int),  # Higher quality for better OCR results
    # Enhanced settings for handwritten documents
    "handwritten": {
        "block_size": _env_number("HANDWRITTEN_BLOCK_SIZE", 21, int),  # Larger block size for adaptive thresholding
        "constant": _env_number("HANDWRITTEN_CONSTANT", 5, int),  # Lower constant for adaptive thresholding
        "use_dilation": _env_bool("HANDWRITTEN_DILATION", True),  # Connect broken strokes
        "dilation_iterations": _env_number("HANDWRITTEN_DILATION_ITERATIONS", 2, int),  # More iterations for better stroke connection
        "dilation_kernel_size": _env_number("HANDWRITTEN_DILATION_KERNEL_SIZE", 3, int),  # Larger kernel for dilation
    },
}

# OCR settings optimized for single-page performance
OCR_SETTINGS = {
    "timeout_ms": _env_number("OCR_TIMEOUT_MS", 45000, int),  # Shorter timeout for single pages (45 seconds)
    "max_retries": _env_number("OCR_MAX_RETRIES", 2, int),  # Fewer retries to avoid rate-limiting
    "retry_delay": _env_number("OCR_RETRY_DELAY", 1, int),  # Shorter initial retry delay for faster execution
    "include_image_base64": _env_bool("INCLUDE_IMAGE_BASE64", True),
    "thread_count": _env_number("OCR_THREAD_COUNT", 2, int),  # Lower thread count to prevent API rate limiting
}