Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py  CHANGED  (+118 -80)
@@ -24,6 +24,21 @@ from unsloth import FastLanguageModel
 # Disable flash attention globally
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
+# Try to install flash-attention (for systems that support it)
+try:
+    import subprocess
+    import sys
+
+    logger = logging.getLogger(__name__)
+    logger.info("Attempting to install flash-attention...")
+
+    # Install flash-attention
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"])
+    logger.info("Successfully installed flash-attention")
+except Exception as e:
+    logger.warning(f"Failed to install flash-attention: {e}")
+    logger.info("Continuing without flash-attention")
+
 # Check if tensorboard is available
 try:
     import tensorboard
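A note on the hunk above: the flash-attn install runs unconditionally at import time, so CPU-only Spaces pay for a pip build they can never use. Below is a minimal sketch of a guarded variant; the helper name and the CUDA check are assumptions for illustration, not part of the commit.

```python
# Sketch only: gate the optional flash-attn install on a CUDA device being present.
# maybe_install_flash_attn() is a hypothetical helper, not code from this commit.
import logging
import subprocess
import sys

import torch

logger = logging.getLogger(__name__)


def maybe_install_flash_attn() -> bool:
    """Return True if flash-attn was installed, False if skipped or the build failed."""
    if not torch.cuda.is_available():
        logger.info("No CUDA device detected - skipping flash-attn install")
        return False
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"]
        )
        logger.info("flash-attn installed successfully")
        return True
    except Exception as exc:  # the build commonly fails on unsupported GPUs or toolchains
        logger.warning("flash-attn install failed: %s", exc)
        return False
```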
@@ -76,20 +91,25 @@ def load_and_prepare_dataset(dataset_name, config):
     # Get the dataset config
     dataset_config = config.get("dataset_config", {})
     sort_field = dataset_config.get("sort_by_field", "prompt_number")
-    sort_direction = dataset_config.get("sort_direction", "ascending")
 
-    logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
-    if sort_direction == "ascending":
-        dataset = dataset.sort(sort_field)
-    else:
-        dataset = dataset.sort(sort_field, reverse=True)
+    # Always sort in ascending order by prompt_number
+    logger.info(f"Sorting dataset by {sort_field} in ascending order")
+    dataset = dataset.sort(sort_field)
 
+    # Verify sorting
+    if len(dataset) > 1:
+        first_prompt = dataset[0].get(sort_field, None)
+        last_prompt = dataset[-1].get(sort_field, None)
+        logger.info(f"Dataset sorted: first {sort_field}={first_prompt}, last {sort_field}={last_prompt}")
+
+        # Additional verification of a few samples
+        sample_indices = [0, len(dataset)//2, len(dataset)-1]
+        sample_prompts = [dataset[i].get(sort_field, None) for i in sample_indices]
+        logger.info(f"Sample prompt numbers: {sample_prompts}")
+
+        # Verify order is ascending
+        if not all(sample_prompts[i] <= sample_prompts[i+1] for i in range(len(sample_prompts)-1)):
+            logger.warning("Dataset may not be properly sorted! Please check the ordering.")
 
     # Print dataset structure for debugging
     logger.info(f"Dataset loaded with {len(dataset)} entries")
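The sort-and-verify logic added above relies on datasets.Dataset.sort() ordering rows ascending by default. A self-contained sketch with invented toy data shows the same pattern in isolation:

```python
# Toy illustration of the sort-and-verify pattern; the data is made up for the example.
from datasets import Dataset

toy = Dataset.from_dict({
    "prompt_number": [3, 1, 2],
    "text": ["third", "first", "second"],
})

toy = toy.sort("prompt_number")          # ascending by default
numbers = toy["prompt_number"]

assert numbers == sorted(numbers)        # -> [1, 2, 3]
print(toy[0]["text"], toy[-1]["text"])   # prints: first third
```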
@@ -263,62 +283,77 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     """
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
+
+        # Create BitsAndBytesConfig for 4-bit quantization
+        from transformers import BitsAndBytesConfig
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True
+        )
+
+        # First try loading with unsloth but without flash attention
         try:
+            logger.info("Loading model with unsloth optimizations")
+            # Don't pass any flash attention parameters to unsloth
             model, tokenizer = FastLanguageModel.from_pretrained(
                 model_name=model_name,
                 max_seq_length=max_seq_length,
                 dtype=dtype,
-                use_flash_attention=False, # Explicitly disable flash attention
-                attn_implementation="eager" # Use eager implementation instead
+                quantization_config=bnb_config
             )
+            logger.info("Model loaded successfully with unsloth")
             return model, tokenizer
 
+        except Exception as e:
+            logger.warning(f"Unsloth loading failed: {e}")
+            logger.info("Falling back to standard Hugging Face loading...")
+
+            # We'll try two approaches with HF loading
+
+            # Approach 1: Using attn_implementation parameter (newer method)
+            try:
+                logger.info("Trying HF loading with attn_implementation parameter")
+                config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
+                # The proper way to disable flash attention in newer transformers
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    config=config,
+                    device_map="auto",
+                    torch_dtype=dtype or torch.float16,
+                    quantization_config=bnb_config,
+                    trust_remote_code=True,
+                    attn_implementation="eager" # Use eager instead of flash_attention_2
                 )
+                logger.info("Model loaded successfully with HF using attn_implementation='eager'")
+                return model, tokenizer
+
+            except Exception as e:
+                logger.warning(f"HF loading with attn_implementation failed: {e}")
+                logger.info("Trying fallback method...")
+
+                # Approach 2: Complete fallback with minimal parameters
+                config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+                tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+                # Most basic loading without any attention parameters
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    config=config,
+                    device_map="auto",
+                    torch_dtype=dtype or torch.float16,
+                    quantization_config=bnb_config,
+                    trust_remote_code=True
+                )
+                logger.info("Model loaded successfully with basic HF loading")
                 return model, tokenizer
-            else:
-                # Re-raise if it's a different type error
-                raise
 
     except Exception as e:
-        logger.info("Falling back to standard Hugging Face loading...")
-
-        # Disable flash attention in transformers config
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-        if hasattr(config, "use_flash_attention"):
-            config.use_flash_attention = False
-            logger.info("Disabled flash attention in model config")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            config=config,
-            device_map="auto",
-            torch_dtype=dtype or torch.float16,
-            load_in_4bit=True,
-            attn_implementation="eager" # Use eager implementation instead of flash attention
-        )
-        logger.info("Model loaded successfully with standard HF loading and flash attention disabled")
-        return model, tokenizer
+        logger.error(f"All model loading attempts failed: {e}")
+        raise
 
 def train(config_path, dataset_name, output_dir):
     """Main training function - RESEARCH TRAINING PHASE ONLY"""
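The hunk above replaces the single fallback with a three-stage cascade: unsloth with a shared 4-bit BitsAndBytesConfig, then standard Hugging Face loading with attn_implementation="eager", then a minimal Hugging Face load. The same control flow can be expressed as a loop over loader callables so the logging and the final re-raise live in one place; this is a sketch of the pattern with placeholder loader names, not code from the commit.

```python
# Sketch: generic "try loaders in order" helper; the names below are hypothetical.
import logging

logger = logging.getLogger(__name__)


def load_with_fallbacks(loaders):
    """loaders: list of (name, zero-arg callable), each returning (model, tokenizer)."""
    last_exc = None
    for name, loader in loaders:
        try:
            logger.info("Trying loader: %s", name)
            return loader()
        except Exception as exc:
            logger.warning("Loader %s failed: %s", name, exc)
            last_exc = exc
    logger.error("All model loading attempts failed")
    raise last_exc if last_exc else RuntimeError("No loaders were provided")


# Usage sketch (the three callables are assumed to wrap the loading paths above):
# model, tokenizer = load_with_fallbacks([
#     ("unsloth", load_with_unsloth),
#     ("hf-eager", load_with_hf_eager),
#     ("hf-basic", load_with_hf_basic),
# ])
```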
@@ -423,31 +458,34 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
+    # Set up training arguments with correct parameters
+    # Extract only the valid parameters from hardware_config
+    training_args_dict = {
+        "output_dir": output_dir,
+        "num_train_epochs": training_config.get("num_train_epochs", 3),
+        "per_device_train_batch_size": training_config.get("per_device_train_batch_size", 2),
+        "gradient_accumulation_steps": training_config.get("gradient_accumulation_steps", 4),
+        "learning_rate": training_config.get("learning_rate", 2e-5),
+        "lr_scheduler_type": training_config.get("lr_scheduler_type", "cosine"),
+        "warmup_ratio": training_config.get("warmup_ratio", 0.03),
+        "weight_decay": training_config.get("weight_decay", 0.01),
+        "optim": training_config.get("optim", "adamw_torch"),
+        "logging_steps": training_config.get("logging_steps", 10),
+        "save_steps": training_config.get("save_steps", 200),
+        "save_total_limit": training_config.get("save_total_limit", 3),
+        "fp16": hardware_config.get("fp16", True),
+        "bf16": hardware_config.get("bf16", False),
+        "max_grad_norm": training_config.get("max_grad_norm", 0.3),
+        "report_to": reports,
+        "logging_first_step": training_config.get("logging_first_step", True),
+        "disable_tqdm": training_config.get("disable_tqdm", False),
+        "remove_unused_columns": False,
+        "shuffle_buffer_size": 1,
+        "seed": 42
+    }
+
+    # Create TrainingArguments with validated parameters
+    training_args = TrainingArguments(**training_args_dict)
 
     # Create trainer with pre-tokenized collator
     trainer = Trainer(
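One caveat about the dictionary above: TrainingArguments(**training_args_dict) raises a TypeError for any key the installed transformers release does not define, and "shuffle_buffer_size" does not appear to be a standard TrainingArguments field. A defensive sketch that filters unknown keys before constructing the arguments follows; the helper is hypothetical, not part of the commit.

```python
# Sketch: drop keys the installed transformers version does not accept before
# constructing TrainingArguments (e.g. "shuffle_buffer_size").
import inspect
import logging

from transformers import TrainingArguments

logger = logging.getLogger(__name__)


def build_training_args(candidate: dict) -> TrainingArguments:
    accepted = set(inspect.signature(TrainingArguments.__init__).parameters)
    filtered = {k: v for k, v in candidate.items() if k in accepted}
    dropped = sorted(set(candidate) - set(filtered))
    if dropped:
        logger.warning("Dropping unsupported TrainingArguments keys: %s", dropped)
    return TrainingArguments(**filtered)
```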