Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py CHANGED (+54 -13)
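The commit message says the file was pushed with huggingface_hub. For context, a minimal sketch of such an upload (the repo id and token handling are placeholders, not taken from this commit):

from huggingface_hub import HfApi

api = HfApi()  # picks up a cached login or the HF_TOKEN environment variable
api.upload_file(
    path_or_fileobj="run_cloud_training.py",  # local script to push
    path_in_repo="run_cloud_training.py",     # destination path inside the repo
    repo_id="your-username/your-space",       # placeholder Space id
    repo_type="space",
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)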
@@ -407,6 +407,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention
     """
     logger.info(f"Loading model: {model_name}")
 
+    # Explicitly disable xformers and flash attention in environment
+    os.environ["XFORMERS_DISABLED"] = "1"
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
@@ -416,14 +420,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention
         bnb_4bit_use_double_quant=True
     )
 
-    #
-    attn_implementation = "
-
-    if use_flash_attention and flash_attention_available:
-        logger.info("Using Flash Attention for faster training")
-        attn_implementation = "flash_attention_2"
-    else:
-        logger.info("Using standard attention mechanism (sdpa)")
+    # Force eager implementation to avoid BMGHK format issues
+    attn_implementation = "eager"  # Use eager implementation to avoid BMGHK format issues
+    logger.info(f"Forcing eager attention implementation to avoid BMGHK format issues")
 
     # Try loading with unsloth
     try:
@@ -436,6 +435,12 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention
             attn_implementation=attn_implementation
         )
         logger.info("Model loaded successfully with unsloth")
+
+        # Explicitly set attention implementation in model config
+        if hasattr(model, 'config'):
+            model.config.attn_implementation = attn_implementation
+            logger.info(f"Explicitly set model config attention implementation to {attn_implementation}")
+
         return model, tokenizer
 
     except Exception as e:
@@ -444,6 +449,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention
 
         # Fallback to standard HF loading
         config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+        # Set attention implementation in config
+        config.attn_implementation = attn_implementation
+
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
         model = AutoModelForCausalLM.from_pretrained(
@@ -464,6 +473,32 @@ def train(config_path, dataset_name, output_dir):
     load_dotenv()
     config = load_config(config_path)
 
+    # Explicitly disable xformers and flash attention in environment
+    os.environ["XFORMERS_DISABLED"] = "1"
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+
+    # Try to unload xformers if it's loaded
+    if 'xformers' in sys.modules:
+        logger.info("Removing xformers from sys.modules")
+        del sys.modules['xformers']
+
+    # Patch torch.nn.functional to avoid memory_efficient_attention
+    try:
+        import torch.nn.functional as F
+        if hasattr(F, 'scaled_dot_product_attention'):
+            logger.info("Patching torch.nn.functional.scaled_dot_product_attention")
+            original_sdpa = F.scaled_dot_product_attention
+
+            def safe_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
+                # Force disable memory efficient attention
+                logger.info("Using safe scaled_dot_product_attention (no xformers)")
+                return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
+
+            F.scaled_dot_product_attention = safe_sdpa
+    except Exception as e:
+        logger.warning(f"Failed to patch scaled_dot_product_attention: {e}")
+
     # Extract configs
     model_config = config.get("model_config", {})
     training_config = config.get("training_config", {})
@@ -513,11 +548,11 @@ def train(config_path, dataset_name, output_dir):
         target_modules=lora_config.get("target_modules", ["q_proj", "k_proj", "v_proj", "o_proj"])
     )
 
-    #
-    use_flash_attention =
+    # Force eager attention implementation
+    use_flash_attention = False  # Override to force eager implementation
 
     # Initialize model with our safe loading function
-    logger.info("Loading pre-quantized model")
+    logger.info("Loading pre-quantized model with eager attention")
     dtype = torch.float16 if hardware_config.get("fp16", True) else None
     model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention)
 
@@ -531,7 +566,10 @@ def train(config_path, dataset_name, output_dir):
         from peft import get_peft_model
         model = get_peft_model(model, lora_config_obj)
         logger.info("Successfully applied LoRA with standard PEFT")
-
+
+        # Explicitly set attention implementation in model config again after PEFT
+        model.config.attn_implementation = "eager"
+
     # No need to format the dataset - it's already pre-tokenized
     logger.info("Using dataset with flexible tokenization handling")
    logger.info("Will use pre-tokenized data if available, or tokenize strings as fallback")
@@ -627,10 +665,13 @@ if __name__ == "__main__":
     parser.add_argument("--output_dir", type=str, default=None,
                         help="Output directory for the fine-tuned model")
     parser.add_argument("--use_flash_attention", action="store_true",
-                        help="Use Flash Attention if available")
+                        help="Use Flash Attention if available (NOT RECOMMENDED)")
 
     args = parser.parse_args()
 
+    # Override flash attention setting to force eager implementation
+    args.use_flash_attention = False
+
    # Run training - Research phase only
     try:
        output_path = train(args.config, args.dataset, args.output_dir)