Commit 65829fc (verified) · committed by George-API · 1 Parent(s): 4d8bc74

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py +57 -193
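The commit message indicates the file was pushed programmatically. For reference, a minimal sketch of such an upload with the huggingface_hub client (the repo ID below is a placeholder, not the actual target repository):

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",   # local file to push
        path_in_repo="run_cloud_training.py",      # destination path inside the repo
        repo_id="your-username/your-repo",         # placeholder: replace with the real repo ID
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )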
run_cloud_training.py CHANGED
@@ -401,147 +401,62 @@ def remove_training_marker():
         os.remove("TRAINING_ACTIVE")
         logger.info("Removed training active marker")

-def load_model_safely(model_name, max_seq_length, dtype=None):
+def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
     """
-    Load the model in a safe way that works with Qwen models
-    by trying different loading strategies.
+    Load the model with appropriate attention settings based on hardware capability
     """
-    global flash_attention_available
+    logger.info(f"Loading model: {model_name}")

-    # Force disable flash attention and xformers
-    flash_attention_available = False
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-    os.environ["XFORMERS_DISABLED"] = "1"
+    # Create BitsAndBytesConfig for 4-bit quantization
+    from transformers import BitsAndBytesConfig
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True
+    )

-    # Patch transformers attention implementation
-    try:
-        # Try to patch transformers attention implementation to avoid xformers
-        import transformers.models.llama.modeling_llama as llama_modeling
-
-        # Store original attention implementation
-        if not hasattr(llama_modeling, '_original_forward'):
-            # Only patch if not already patched
-            logger.info("Patching LLaMA attention implementation to avoid xformers")
-
-            # Store original implementation
-            if hasattr(llama_modeling.LlamaAttention, 'forward'):
-                llama_modeling._original_forward = llama_modeling.LlamaAttention.forward
-
-            # Define a new forward method that doesn't use xformers
-            def safe_attention_forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False):
-                logger.info("Using safe attention implementation (no xformers)")
-
-                # Force use_flash_attention to False
-                self._attn_implementation = "eager"
-                if hasattr(self, 'use_flash_attention'):
-                    self.use_flash_attention = False
-                if hasattr(self, 'use_flash_attention_2'):
-                    self.use_flash_attention_2 = False
-
-                # Call original implementation with flash attention disabled
-                return llama_modeling._original_forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
-
-            # Replace the forward method
-            llama_modeling.LlamaAttention.forward = safe_attention_forward
-            logger.info("Successfully patched LLaMA attention implementation")
-    except Exception as e:
-        logger.warning(f"Failed to patch attention implementation: {e}")
-        logger.info("Will try to proceed with standard loading")
+    # Determine appropriate attention implementation
+    attn_implementation = "sdpa"  # Default to PyTorch's scaled dot product attention
+
+    if use_flash_attention and flash_attention_available:
+        logger.info("Using Flash Attention for faster training")
+        attn_implementation = "flash_attention_2"
+    else:
+        logger.info("Using standard attention mechanism (sdpa)")

+    # Try loading with unsloth
     try:
-        logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
-
-        # Create BitsAndBytesConfig for 4-bit quantization
-        from transformers import BitsAndBytesConfig
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True
+        logger.info("Loading model with unsloth optimizations")
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name=model_name,
+            max_seq_length=max_seq_length,
+            dtype=dtype,
+            quantization_config=bnb_config,
+            attn_implementation=attn_implementation
         )
+        logger.info("Model loaded successfully with unsloth")
+        return model, tokenizer

-        # First try loading with unsloth but without flash attention
-        try:
-            logger.info("Loading model with unsloth optimizations")
-            # Don't pass any flash attention parameters to unsloth
-            model, tokenizer = FastLanguageModel.from_pretrained(
-                model_name=model_name,
-                max_seq_length=max_seq_length,
-                dtype=dtype,
-                quantization_config=bnb_config,
-                attn_implementation="eager"  # Force eager attention
-            )
-            logger.info("Model loaded successfully with unsloth")
-
-            # Explicitly disable flash attention in model config
-            if hasattr(model, 'config'):
-                if hasattr(model.config, 'attn_implementation'):
-                    model.config.attn_implementation = "eager"
-
-            return model, tokenizer
-
-        except Exception as e:
-            logger.warning(f"Unsloth loading failed: {e}")
-            logger.info("Falling back to standard Hugging Face loading...")
-
-            # We'll try with HF loading
-            attn_params = {
-                "attn_implementation": "eager"  # Always use eager
-            }
-
-            # Approach 1: Using attn_implementation parameter (newer method)
-            try:
-                logger.info(f"Trying HF loading with attention parameters: {attn_params}")
-                config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-
-                # Disable flash attention in config
-                if hasattr(config, 'attn_implementation'):
-                    config.attn_implementation = "eager"
-
-                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-                # The proper way to set attention implementation in newer transformers
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    config=config,
-                    device_map="auto",
-                    torch_dtype=dtype or torch.float16,
-                    quantization_config=bnb_config,
-                    trust_remote_code=True,
-                    **attn_params
-                )
-                logger.info(f"Model loaded successfully with HF using attention parameters: {attn_params}")
-                return model, tokenizer
-
-            except Exception as e:
-                logger.warning(f"HF loading with attn_implementation failed: {e}")
-                logger.info("Trying fallback method...")
-
-                # Approach 2: Complete fallback with minimal parameters
-                config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-
-                # Disable flash attention in config
-                if hasattr(config, 'attn_implementation'):
-                    config.attn_implementation = "eager"
-
-                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-                # Most basic loading without any attention parameters
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    config=config,
-                    device_map="auto",
-                    torch_dtype=dtype or torch.float16,
-                    quantization_config=bnb_config,
-                    trust_remote_code=True,
-                    attn_implementation="eager"
-                )
-                logger.info("Model loaded successfully with basic HF loading")
-                return model, tokenizer
-
     except Exception as e:
-        logger.error(f"All model loading attempts failed: {e}")
-        raise
+        logger.warning(f"Unsloth loading failed: {e}")
+        logger.info("Falling back to standard Hugging Face loading...")
+
+        # Fallback to standard HF loading
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            config=config,
+            device_map="auto",
+            torch_dtype=dtype or torch.float16,
+            quantization_config=bnb_config,
+            trust_remote_code=True,
+            attn_implementation=attn_implementation
+        )
+        logger.info("Model loaded successfully with standard HF loading")
+        return model, tokenizer

 def train(config_path, dataset_name, output_dir):
     """Main training function - RESEARCH TRAINING PHASE ONLY"""
@@ -556,50 +471,6 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})

-    # Force disable flash attention and xformers
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-    os.environ["XFORMERS_DISABLED"] = "1"
-    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-
-    # Monkey patch torch.nn.functional to disable memory_efficient_attention
-    try:
-        import torch.nn.functional as F
-        if hasattr(F, 'scaled_dot_product_attention'):
-            logger.info("Monkey patching torch.nn.functional.scaled_dot_product_attention")
-            original_sdpa = F.scaled_dot_product_attention
-
-            def safe_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
-                # Force disable memory efficient attention
-                logger.info("Using safe scaled_dot_product_attention (no xformers)")
-                return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
-
-            F.scaled_dot_product_attention = safe_sdpa
-    except Exception as e:
-        logger.warning(f"Failed to patch scaled_dot_product_attention: {e}")
-
-    # Completely remove xformers from sys.modules if it's loaded
-    for module_name in list(sys.modules.keys()):
-        if 'xformers' in module_name:
-            logger.info(f"Removing {module_name} from sys.modules")
-            del sys.modules[module_name]
-
-    # Update flash attention setting to always use eager
-    global flash_attention_available
-    flash_attention_available = False
-    logger.info("Flash Attention has been DISABLED globally")
-
-    # Update hardware config to ensure eager attention
-    hardware_config["attn_implementation"] = "eager"
-
-    # Verify this is training phase only
-    training_phase_only = dataset_config.get("training_phase_only", True)
-    if not training_phase_only:
-        logger.warning("This script is meant for research training phase only")
-        logger.warning("Setting training_phase_only=True")
-
-    # Verify dataset is pre-tokenized
-    logger.info("IMPORTANT: Using pre-tokenized dataset - No tokenization will be performed")
-
     # Set the output directory
     output_dir = output_dir or training_config.get("output_dir", "fine_tuned_model")
     os.makedirs(output_dir, exist_ok=True)
@@ -628,8 +499,8 @@ def train(config_path, dataset_name, output_dir):
     )
     tokenizer.pad_token = tokenizer.eos_token

-    # Initialize model with unsloth
-    logger.info("Initializing model with unsloth (preserving 4-bit quantization)")
+    # Initialize model
+    logger.info("Initializing model (preserving 4-bit quantization)")
     max_seq_length = training_config.get("max_seq_length", 2048)

     # Create LoRA config directly
@@ -642,29 +513,21 @@ def train(config_path, dataset_name, output_dir):
         target_modules=lora_config.get("target_modules", ["q_proj", "k_proj", "v_proj", "o_proj"])
     )

+    # Determine if we should use flash attention
+    use_flash_attention = hardware_config.get("use_flash_attention", False)
+
     # Initialize model with our safe loading function
-    logger.info("Loading pre-quantized model safely")
+    logger.info("Loading pre-quantized model")
     dtype = torch.float16 if hardware_config.get("fp16", True) else None
-
-    # Force eager attention implementation
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-    logger.info("Flash attention has been DISABLED globally via environment variable")
-
-    # Update hardware config to ensure eager attention
-    hardware_config["attn_implementation"] = "eager"
-
-    model, tokenizer = load_model_safely(model_name, max_seq_length, dtype)
+    model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention)

     # Disable generation capabilities for research training
     logger.info("Disabling generation capabilities - Research training only")
     model.config.is_decoder = False
     model.config.task_specific_params = None

-    # Try different approaches to apply LoRA
+    # Apply LoRA to model
     logger.info("Applying LoRA to model")
-
-    # Skip unsloth's method and go directly to PEFT
-    logger.info("Using standard PEFT method to apply LoRA")
     from peft import get_peft_model
     model = get_peft_model(model, lora_config_obj)
     logger.info("Successfully applied LoRA with standard PEFT")
@@ -692,7 +555,6 @@ def train(config_path, dataset_name, output_dir):
         logger.warning("No reporting backends available - training metrics won't be logged")

     # Set up training arguments with correct parameters
-    # Extract only the valid parameters from hardware_config
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),
@@ -764,6 +626,8 @@ if __name__ == "__main__":
                         help="Dataset name or path")
     parser.add_argument("--output_dir", type=str, default=None,
                         help="Output directory for the fine-tuned model")
+    parser.add_argument("--use_flash_attention", action="store_true",
+                        help="Use Flash Attention if available")

     args = parser.parse_args()
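The new load_model_safely() consults a module-level flash_attention_available flag that is defined outside the hunks shown here. A minimal sketch of how such a flag is commonly computed, assuming the flash-attn package and an Ampere-or-newer GPU are the criteria:

    import importlib.util
    import torch

    # Assumed detection logic (not part of this diff): flash_attention_2 requires the
    # flash-attn package and a GPU with compute capability >= 8.0 (Ampere or newer).
    flash_attention_available = (
        importlib.util.find_spec("flash_attn") is not None
        and torch.cuda.is_available()
        and torch.cuda.get_device_capability(0)[0] >= 8
    )

With a check like this in place, the loader only upgrades from "sdpa" to "flash_attention_2" when the kernel can actually run; the new --use_flash_attention CLI flag presumably feeds hardware_config["use_flash_attention"], though that wiring is outside the hunks shown.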