Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  +26 -5
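As the commit message notes, the script was pushed with the huggingface_hub client. For context, a minimal sketch of that kind of upload; the repo id below is a placeholder rather than this Space's actual id, and the call assumes a token is available from a cached login or the HF_TOKEN environment variable:

# Hypothetical upload sketch; repo_id is a placeholder, not this Space.
from huggingface_hub import HfApi

api = HfApi()  # authenticates from a cached login or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="run_cloud_training.py",  # local script to push
    path_in_repo="run_cloud_training.py",     # destination path in the repo
    repo_id="<user>/<space-name>",            # placeholder Space id
    repo_type="space",
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)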
run_cloud_training.py CHANGED

@@ -16,11 +16,14 @@ from dotenv import load_dotenv
 import torch
 from datasets import load_dataset
 import transformers
-from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
+from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, AutoConfig
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
+# Disable flash attention globally
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+
 # Check if tensorboard is available
 try:
     import tensorboard
@@ -263,13 +266,16 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     # First try the standard unsloth loading
     try:
         # Try loading with unsloth but without the problematic parameter
+        logger.info("Loading model with flash attention DISABLED")
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
             load_in_4bit=True,  # This should work for already quantized models
+            use_flash_attention=False,  # Explicitly disable flash attention
+            attn_implementation="eager"  # Use eager implementation instead
         )
-        logger.info("Model loaded successfully with unsloth with 4-bit quantization")
+        logger.info("Model loaded successfully with unsloth with 4-bit quantization and flash attention disabled")
         return model, tokenizer
 
     except TypeError as e:
@@ -283,6 +289,7 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
+            use_flash_attention=False,  # Explicitly disable flash attention
         )
         logger.info("Model loaded successfully with unsloth using alternative method")
         return model, tokenizer
@@ -295,14 +302,22 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
         logger.warning(f"Unsloth loading failed: {e}")
         logger.info("Falling back to standard Hugging Face loading...")
 
+        # Disable flash attention in transformers config
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        if hasattr(config, "use_flash_attention"):
+            config.use_flash_attention = False
+            logger.info("Disabled flash attention in model config")
+
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
+            config=config,
             device_map="auto",
             torch_dtype=dtype or torch.float16,
             load_in_4bit=True,
+            attn_implementation="eager"  # Use eager implementation instead of flash attention
         )
-        logger.info("Model loaded successfully with standard HF loading")
+        logger.info("Model loaded successfully with standard HF loading and flash attention disabled")
         return model, tokenizer
 
 def train(config_path, dataset_name, output_dir):
@@ -318,6 +333,10 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
+    # Override flash attention setting to disable it
+    hardware_config["use_flash_attention"] = False
+    logger.info("Flash attention has been DISABLED due to GPU compatibility issues")
+
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)
     if not training_phase_only:
@@ -404,7 +423,7 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    # Set up training arguments
+    # Set up training arguments with flash attention disabled
    training_args = TrainingArguments(
         output_dir=output_dir,
         num_train_epochs=training_config.get("num_train_epochs", 3),
@@ -425,7 +444,9 @@ def train(config_path, dataset_name, output_dir):
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         # Important: Don't remove columns that don't match model's forward method
-        remove_unused_columns=False
+        remove_unused_columns=False,
+        # Disable flash attention
+        attn_implementation="eager"
     )
 
     # Create trainer with pre-tokenized collator
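For context, a minimal standalone sketch of the loading pattern the fallback branch converges on: requesting the eager attention implementation at model-load time instead of flash attention. The model id is a placeholder, and the 4-bit settings are expressed through BitsAndBytesConfig here as a stand-in for the bare load_in_4bit flag used in the script; attn_implementation is consumed by from_pretrained when the model is instantiated.

# Hedged sketch, not this Space's actual code: load a causal LM with the
# eager attention path and 4-bit quantization. "<org>/<model>" is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "<org>/<model>"  # placeholder model id

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    attn_implementation="eager",  # use the eager attention kernels, not flash attention
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,                     # 4-bit weights via bitsandbytes
        bnb_4bit_compute_dtype=torch.float16,  # compute in fp16
    ),
)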