George-API committed
Commit 69ba4cd · verified · 1 Parent(s): 6bbd6b2

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +45 -7
run_cloud_training.py CHANGED
@@ -56,6 +56,16 @@ except Exception as e:
     logger.warning(f"Failed to install flash-attention: {e}")
     logger.info("Continuing without flash-attention")
 
+# Check if flash attention was successfully installed
+flash_attention_available = False
+try:
+    import flash_attn
+    flash_attention_available = True
+    logger.info(f"Flash Attention will be used (version: {flash_attn.__version__})")
+    # We'll handle flash attention configuration during model loading
+except ImportError:
+    logger.info("Flash Attention not available, will use standard attention mechanism")
+
 # Check if tensorboard is available
 try:
     import tensorboard
@@ -298,6 +308,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     Load the model in a safe way that works with Qwen models
     by trying different loading strategies.
     """
+    global flash_attention_available
+
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
 
@@ -328,14 +340,30 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
         logger.info("Falling back to standard Hugging Face loading...")
 
         # We'll try two approaches with HF loading
+        attn_params = {}
+
+        # If flash attention is available, try to use it
+        if flash_attention_available:
+            logger.info("Flash Attention is available - setting appropriate parameters")
+            # For newer models that support attn_implementation parameter
+            attn_params = {"attn_implementation": "eager"}  # Default to eager for compatibility
+
+            # Try to use flash attention if available
+            try:
+                # Try importing flash attention to confirm it's available
+                import flash_attn
+                logger.info(f"Using Flash Attention version {flash_attn.__version__}")
+                attn_params = {"attn_implementation": "flash_attention_2"}
+            except Exception as flash_error:
+                logger.warning(f"Flash Attention import failed: {flash_error}")
 
         # Approach 1: Using attn_implementation parameter (newer method)
         try:
-            logger.info("Trying HF loading with attn_implementation parameter")
+            logger.info(f"Trying HF loading with attention parameters: {attn_params}")
             config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-            # The proper way to disable flash attention in newer transformers
+            # The proper way to set attention implementation in newer transformers
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 config=config,
@@ -343,9 +371,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 torch_dtype=dtype or torch.float16,
                 quantization_config=bnb_config,
                 trust_remote_code=True,
-                attn_implementation="eager"  # Use eager instead of flash_attention_2
+                **attn_params
             )
-            logger.info("Model loaded successfully with HF using attn_implementation='eager'")
+            logger.info(f"Model loaded successfully with HF using attention parameters: {attn_params}")
             return model, tokenizer
 
         except Exception as e:
@@ -385,9 +413,19 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
-    # Override flash attention setting to disable it
-    hardware_config["use_flash_attention"] = False
-    logger.info("Flash attention has been DISABLED due to GPU compatibility issues")
+    # Update flash attention setting based on availability
+    global flash_attention_available
+    if flash_attention_available:
+        logger.info("Flash Attention is available - updating configuration")
+        # If flash attention is available, set attn_implementation to flash_attention_2
+        hardware_config["attn_implementation"] = "flash_attention_2"
+    else:
+        logger.info("Flash Attention not available - setting to eager attention")
+        hardware_config["attn_implementation"] = "eager"
+
+    # Override flash attention setting to disable it if there are compatibility issues
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    logger.info("Flash attention has been DISABLED globally via environment variable")
 
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)
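
The hunks above boil down to one decision: probe for the optional flash-attn package once, then pick an attn_implementation based on the result. A minimal sketch of that probe follows; the helper name pick_attn_implementation is hypothetical (the script itself stores the result in a module-level flash_attention_available flag instead of returning a string).

import logging

logger = logging.getLogger(__name__)

def pick_attn_implementation() -> str:
    """Return "flash_attention_2" if the flash-attn package imports cleanly,
    otherwise fall back to the "eager" attention implementation."""
    try:
        import flash_attn  # present only if the optional wheel installed successfully
        logger.info(f"Flash Attention available (version {flash_attn.__version__})")
        return "flash_attention_2"
    except ImportError:
        logger.info("Flash Attention not available, using eager attention")
        return "eager"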
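In load_model_safely the chosen setting is splatted into AutoModelForCausalLM.from_pretrained via **attn_params. A trimmed sketch of that fallback path, assuming a transformers release recent enough to accept the attn_implementation keyword; the script's bitsandbytes quantization_config is omitted for brevity and the model name is a placeholder.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_with_attention_fallback(model_name: str):
    # Tokenizer loading does not depend on the attention backend.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    for impl in ("flash_attention_2", "eager"):
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                trust_remote_code=True,
                attn_implementation=impl,  # retried with "eager" if flash-attn kernels are unusable
            )
            return model, tokenizer
        except Exception as exc:
            print(f"Loading with attn_implementation={impl!r} failed: {exc}")
    raise RuntimeError(f"Could not load {model_name} with any attention implementation")

If the flash_attention_2 attempt fails (for example, the installed flash-attn wheel does not match the GPU), the loop retries with eager, which mirrors the two-approach fallback in the commit.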