from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch


class ModelLoader:
    def __init__(self, model_name, hugging_face_token):
        self.model_name = model_name

        # Configure 4-bit NF4 quantization with bfloat16 compute, allowing
        # FP32 CPU offload for layers that do not fit on the GPU
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_enable_fp32_cpu_offload=True,
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hugging_face_token,
        )

        # Load model with memory optimizations; device_map="auto" splits the
        # weights between GPU and CPU within the max_memory budgets below
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=self.bnb_config,
            device_map="auto",
            low_cpu_mem_usage=True,
            max_memory={
                0: "4GiB",  # GPU budget; CUDA devices are keyed by integer index
                "cpu": "12GiB",
            },
            token=hugging_face_token,
        )
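
A minimal usage sketch of the class above: the model ID and token string are placeholders, not values from the original (substitute a checkpoint you have access to), and the generation parameters are illustrative only.

if __name__ == "__main__":
    # "meta-llama/Llama-2-7b-hf" and "hf_..." are hypothetical placeholders;
    # use your own model ID and Hugging Face access token.
    loader = ModelLoader("meta-llama/Llama-2-7b-hf", hugging_face_token="hf_...")

    # Tokenize a prompt and move the input tensors to the model's primary device
    inputs = loader.tokenizer(
        "The capital of France is", return_tensors="pt"
    ).to(loader.model.device)

    # Generate a short completion and decode it back to text
    output_ids = loader.model.generate(**inputs, max_new_tokens=50)
    print(loader.tokenizer.decode(output_ids[0], skip_special_tokens=True))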