George-API committed
Commit a69e2f2 · verified · 1 Parent(s): 6704e73

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +42 -46
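For context, the commit message says the file was pushed with the `huggingface_hub` client. Below is a minimal sketch of that kind of upload; the repo id and token handling are placeholders, not details taken from this page.

```python
from huggingface_hub import HfApi

# Hypothetical example: repo_id is a placeholder, not the actual target repo.
api = HfApi()  # assumes a token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="run_cloud_training.py",
    path_in_repo="run_cloud_training.py",
    repo_id="George-API/example-repo",
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)
```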
run_cloud_training.py CHANGED
@@ -403,7 +403,8 @@ def remove_training_marker():
 
 def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
     """
-    Load the model with appropriate attention settings based on hardware capability
+    Load the model directly with HuggingFace, bypassing Unsloth optimizations
+    to avoid memory-efficient attention issues
     """
     logger.info(f"Loading model: {model_name}")
 
@@ -421,51 +422,39 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     )
 
     # Force eager implementation to avoid BMGHK format issues
-    attn_implementation = "eager"  # Use eager implementation to avoid BMGHK format issues
+    attn_implementation = "eager"
     logger.info(f"Forcing eager attention implementation to avoid BMGHK format issues")
 
-    # Try loading with unsloth
-    try:
-        logger.info("Loading model with unsloth optimizations")
-        model, tokenizer = FastLanguageModel.from_pretrained(
-            model_name=model_name,
-            max_seq_length=max_seq_length,
-            dtype=dtype,
-            quantization_config=bnb_config,
-            attn_implementation=attn_implementation
-        )
-        logger.info("Model loaded successfully with unsloth")
-
-        # Explicitly set attention implementation in model config
-        if hasattr(model, 'config'):
-            model.config.attn_implementation = attn_implementation
-            logger.info(f"Explicitly set model config attention implementation to {attn_implementation}")
-
-        return model, tokenizer
-
-    except Exception as e:
-        logger.warning(f"Unsloth loading failed: {e}")
-        logger.info("Falling back to standard Hugging Face loading...")
-
-        # Fallback to standard HF loading
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-
-        # Set attention implementation in config
-        config.attn_implementation = attn_implementation
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            config=config,
-            device_map="auto",
-            torch_dtype=dtype or torch.float16,
-            quantization_config=bnb_config,
-            trust_remote_code=True,
-            attn_implementation=attn_implementation
-        )
-        logger.info("Model loaded successfully with standard HF loading")
-        return model, tokenizer
+    # Skip Unsloth and use standard HuggingFace loading
+    logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
+
+    # Load with standard HuggingFace
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    # Set attention implementation in config
+    config.attn_implementation = attn_implementation
+
+    # Disable any custom attention mechanisms
+    if hasattr(config, "use_flash_attention"):
+        config.use_flash_attention = False
+    if hasattr(config, "use_memory_efficient_attention"):
+        config.use_memory_efficient_attention = False
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        config=config,
+        device_map="auto",
+        torch_dtype=dtype or torch.float16,
+        quantization_config=bnb_config,
+        trust_remote_code=True,
+        attn_implementation=attn_implementation,
+        use_flash_attention=False,
+        use_memory_efficient_attention=False
+    )
+    logger.info("Model loaded successfully with standard HF loading")
+    return model, tokenizer
 
 def train(config_path, dataset_name, output_dir):
     """Main training function - RESEARCH TRAINING PHASE ONLY"""
@@ -536,7 +525,10 @@ def train(config_path, dataset_name, output_dir):
 
     # Initialize model
     logger.info("Initializing model (preserving 4-bit quantization)")
-    max_seq_length = training_config.get("max_seq_length", 2048)
+
+    # Reduce max sequence length to avoid memory issues
+    max_seq_length = min(training_config.get("max_seq_length", 2048), 1024)
+    logger.info(f"Using reduced max sequence length: {max_seq_length} to avoid memory issues")
 
     # Create LoRA config directly
     logger.info("Creating LoRA configuration")
@@ -593,10 +585,14 @@ def train(config_path, dataset_name, output_dir):
         logger.warning("No reporting backends available - training metrics won't be logged")
 
     # Set up training arguments with correct parameters
+    # REDUCE BATCH SIZE to avoid memory issues with attention
+    per_device_train_batch_size = 1  # Reduced from default of 2
+    logger.info(f"Using reduced batch size: {per_device_train_batch_size} to avoid memory issues")
+
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),
-        "per_device_train_batch_size": training_config.get("per_device_train_batch_size", 2),
+        "per_device_train_batch_size": per_device_train_batch_size,
         "gradient_accumulation_steps": training_config.get("gradient_accumulation_steps", 4),
         "learning_rate": training_config.get("learning_rate", 2e-5),
         "lr_scheduler_type": training_config.get("lr_scheduler_type", "cosine"),