# model.py - Optimized version
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from functools import lru_cache
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
import logging

logger = logging.getLogger(__name__)

# Global variables to store loaded model
_tokenizer = None
_model = None
_model_loading = False
_model_loaded = False


@lru_cache(maxsize=1)
def get_model_config():
    """Cache model configuration"""
    return {
        "model_id": "deepseek-ai/deepseek-coder-1.3b-instruct",
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
        # Add these optimizations
        "low_cpu_mem_usage": True,
        "use_cache": True,
    }


def load_model_sync():
    """Synchronous model loading with optimizations"""
    global _tokenizer, _model, _model_loaded

    if _model_loaded:
        return _tokenizer, _model

    config = get_model_config()
    model_id = config["model_id"]

    logger.info(f"🔧 Loading model {model_id}...")

    try:
        # Set cache directory to avoid re-downloading
        cache_dir = os.environ.get("TRANSFORMERS_CACHE", "./model_cache")
        os.makedirs(cache_dir, exist_ok=True)

        # Load tokenizer first (faster)
        logger.info("📝 Loading tokenizer...")
        _tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=config["trust_remote_code"],
            cache_dir=cache_dir,
            use_fast=True,  # Use fast tokenizer if available
        )

        # Load model with optimizations
        logger.info("🧠 Loading model...")
        _model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=config["trust_remote_code"],
            torch_dtype=config["torch_dtype"],
            device_map=config["device_map"],
            low_cpu_mem_usage=config["low_cpu_mem_usage"],
            cache_dir=cache_dir,
            offload_folder="offload",
            offload_state_dict=True,
        )

        # Set to evaluation mode
        _model.eval()

        _model_loaded = True
        logger.info("✅ Model loaded successfully!")

        return _tokenizer, _model

    except Exception as e:
        logger.error(f"❌ Failed to load model: {e}")
        raise


async def load_model_async():
    """Asynchronous model loading"""
    global _model_loading

    if _model_loaded:
        return _tokenizer, _model

    if _model_loading:
        # Wait for ongoing loading to complete
        while _model_loading and not _model_loaded:
            await asyncio.sleep(0.1)
        return _tokenizer, _model

    _model_loading = True

    try:
        # Run model loading in a thread pool to avoid blocking the event loop
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=1) as executor:
            tokenizer, model = await loop.run_in_executor(
                executor, load_model_sync
            )
        return tokenizer, model
    finally:
        _model_loading = False


def get_model():
    """Get the loaded model (for synchronous access)"""
    if not _model_loaded:
        return load_model_sync()
    return _tokenizer, _model


def is_model_loaded():
    """Check if model is loaded"""
    return _model_loaded


def get_model_info():
    """Get model information without loading"""
    config = get_model_config()
    return {
        "model_id": config["model_id"],
        "loaded": _model_loaded,
        "loading": _model_loading,
    }
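

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the original module. It shows
# how a caller might load the model via load_model_async() and run a short
# generation. The _demo entry point, sample prompt, and generation settings
# (max_new_tokens) are assumptions added for illustration only.
# ---------------------------------------------------------------------------
async def _demo():
    """Load the model off the event loop, then generate a short completion."""
    tokenizer, model = await load_model_async()

    # Tokenize a sample prompt and move the tensors to the model's device.
    inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)

    # Short greedy continuation; no gradients are needed at inference time.
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)

    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))


if __name__ == "__main__":
    asyncio.run(_demo())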