Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py (CHANGED): +75 -27
```diff
@@ -21,6 +21,14 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
+# Check if tensorboard is available
+try:
+    import tensorboard
+    TENSORBOARD_AVAILABLE = True
+except ImportError:
+    TENSORBOARD_AVAILABLE = False
+    print("Tensorboard not available. Will skip tensorboard logging.")
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
```
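The new block guards the tensorboard import so training still runs when the package is absent. As a hedged illustration (not part of this commit), the same availability probe can be written with `importlib.util.find_spec`, which checks for the package without importing it:

```python
# Illustrative sketch only (not part of this commit): the same availability
# check via importlib, probing for the package without importing it.
import importlib.util

TENSORBOARD_AVAILABLE = importlib.util.find_spec("tensorboard") is not None
if not TENSORBOARD_AVAILABLE:
    print("Tensorboard not available. Will skip tensorboard logging.")
```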
```diff
@@ -80,7 +88,17 @@ def load_and_prepare_dataset(dataset_name, config):
         logger.info(f"Shuffling dataset with seed {shuffle_seed}")
         dataset = dataset.shuffle(seed=shuffle_seed)
         
+        # Print dataset structure for debugging
         logger.info(f"Dataset loaded with {len(dataset)} entries")
+        logger.info(f"Dataset columns: {dataset.column_names}")
+        
+        # Print a sample entry to understand structure
+        if len(dataset) > 0:
+            sample = dataset[0]
+            logger.info(f"Sample entry structure: {list(sample.keys())}")
+            if 'conversations' in sample:
+                logger.info(f"Sample conversations structure: {sample['conversations'][:1]}")
+        
         return dataset
         
     except Exception as e:
```
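The added logging only inspects the dataset; it does not modify it. For intuition, here is a hedged, self-contained sketch of what those log lines would surface on a toy conversations-style dataset (the field names are illustrative assumptions, not taken from the actual dataset):

```python
# Toy conversations-style dataset (field names are illustrative only), showing
# the kind of structure the new debug logging would report.
from datasets import Dataset

toy = Dataset.from_list([
    {"conversations": [{"role": "user", "content": "Hi"},
                       {"role": "assistant", "content": "Hello!"}]},
])
print(toy.column_names)             # ['conversations']
print(list(toy[0].keys()))          # ['conversations']
print(toy[0]["conversations"][:1])  # first turn only
```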
```diff
@@ -102,18 +120,47 @@ class PreTokenizedCollator(DataCollatorMixin):
         self.pad_token_id = pad_token_id
     
     def __call__(self, features):
+        # Print a sample feature to understand structure
+        if len(features) > 0:
+            logger.info(f"Sample feature keys: {list(features[0].keys())}")
+        
+        # Extract input_ids from conversations if needed
+        processed_features = []
+        for feature in features:
+            # If input_ids is not directly available, try to extract from conversations
+            if 'input_ids' not in feature and 'conversations' in feature:
+                # Extract from conversations based on your dataset structure
+                # This is a placeholder - adjust based on actual structure
+                conversations = feature['conversations']
+                if isinstance(conversations, list) and len(conversations) > 0:
+                    # Assuming input_ids might be in the content field
+                    if 'content' in conversations[0]:
+                        feature['input_ids'] = conversations[0]['content']
+                    # Or it might be the conversation itself
+                    elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
+                        feature['input_ids'] = conversations[0]['input_ids']
+            
+            processed_features.append(feature)
+        
+        # If we still don't have input_ids, log an error
+        if len(processed_features) > 0 and 'input_ids' not in processed_features[0]:
+            logger.error(f"Could not find input_ids in features. Available keys: {list(processed_features[0].keys())}")
+            if 'conversations' in processed_features[0]:
+                logger.error(f"Conversations structure: {processed_features[0]['conversations'][:1]}")
+            raise ValueError("Could not find input_ids in dataset. Please check dataset structure.")
+        
         # Determine max length in this batch
-        batch_max_len = max(len(x["input_ids"]) for x in features)
+        batch_max_len = max(len(x["input_ids"]) for x in processed_features)
         
         # Initialize batch tensors
         batch = {
-            "input_ids": torch.ones((len(features), batch_max_len), dtype=torch.long) * self.pad_token_id,
-            "attention_mask": torch.zeros((len(features), batch_max_len), dtype=torch.long),
-            "labels": torch.ones((len(features), batch_max_len), dtype=torch.long) * -100  # -100 is ignored in loss
+            "input_ids": torch.ones((len(processed_features), batch_max_len), dtype=torch.long) * self.pad_token_id,
+            "attention_mask": torch.zeros((len(processed_features), batch_max_len), dtype=torch.long),
+            "labels": torch.ones((len(processed_features), batch_max_len), dtype=torch.long) * -100  # -100 is ignored in loss
         }
         
         # Fill batch tensors
-        for i, feature in enumerate(features):
+        for i, feature in enumerate(processed_features):
             input_ids = feature["input_ids"]
             seq_len = len(input_ids)
```
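The collator now derives `input_ids` from a `conversations` field when the batch is not already flattened, then pads to the longest sequence in the batch. Below is a hedged, simplified stand-in for that padding step (not the class from this file; the per-sequence fill shown is an assumption, since the hunk cuts off after `seq_len`), useful for checking the expected tensor shapes:

```python
# Simplified stand-in for the collator's padding step (illustrative only;
# assumes pad_token_id=0 and that labels mirror input_ids for real tokens).
import torch

def pad_batch(features, pad_token_id=0):
    batch_max_len = max(len(x["input_ids"]) for x in features)
    input_ids = torch.full((len(features), batch_max_len), pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((len(features), batch_max_len), dtype=torch.long)
    labels = torch.full((len(features), batch_max_len), -100, dtype=torch.long)  # -100 is ignored by the loss
    for i, feature in enumerate(features):
        ids = torch.tensor(feature["input_ids"], dtype=torch.long)
        input_ids[i, : len(ids)] = ids
        attention_mask[i, : len(ids)] = 1
        labels[i, : len(ids)] = ids
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

batch = pad_batch([{"input_ids": [5, 6, 7]}, {"input_ids": [8, 9]}])
print(batch["input_ids"].shape)  # torch.Size([2, 3])
```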
```diff
@@ -274,36 +321,35 @@ def train(config_path, dataset_name, output_dir):
     dtype = torch.float16 if hardware_config.get("fp16", True) else None
     model, tokenizer = load_model_safely(model_name, max_seq_length, dtype)
     
-    #
+    # Try different approaches to apply LoRA
     logger.info("Applying LoRA to model")
-    try:
-        logger.info("Attempting to apply LoRA with unsloth API")
-        model = FastLanguageModel.get_peft_model(
-            model,
-            lora_config=lora_config_obj,  # Pass lora_config directly instead of peft_config
-            tokenizer=tokenizer,
-            use_gradient_checkpointing=hardware_config.get("gradient_checkpointing", True)
-        )
-    except Exception as e:
-        logger.warning(f"Error applying LoRA with unsloth: {e}")
-        logger.info("Falling back to standard PEFT method")
-        
-        # Try with standard PEFT approach if unsloth fails
-        from peft import get_peft_model
-        model = get_peft_model(model, lora_config_obj)
-        logger.info("Successfully applied LoRA with standard PEFT")
     
+    # Skip unsloth's method and go directly to PEFT
+    logger.info("Using standard PEFT method to apply LoRA")
+    from peft import get_peft_model
+    model = get_peft_model(model, lora_config_obj)
+    logger.info("Successfully applied LoRA with standard PEFT")
+    
     # No need to format the dataset - it's already pre-tokenized
     logger.info("Using pre-tokenized dataset - skipping tokenization step")
     training_dataset = dataset
     
-    # Configure
-    reports = [
+    # Configure reporting backends with fallbacks
+    reports = []
+    if TENSORBOARD_AVAILABLE:
+        reports.append("tensorboard")
+        logger.info("Tensorboard available and enabled for reporting")
+    else:
+        logger.warning("Tensorboard not available - metrics won't be logged to tensorboard")
+    
     if os.getenv("WANDB_API_KEY"):
         reports.append("wandb")
         logger.info("Wandb API key found, enabling wandb reporting")
-
-
+    
+    # Default to "none" if no reporting backends are available
+    if not reports:
+        reports = ["none"]
+        logger.warning("No reporting backends available - training metrics won't be logged")
 
     # Set up training arguments
     training_args = TrainingArguments(
```
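This hunk replaces the unsloth `FastLanguageModel.get_peft_model` attempt with plain PEFT. A minimal hedged sketch of that path, using a tiny public model and placeholder LoRA hyperparameters (none of these values come from this repo's config):

```python
# Hedged sketch of the plain-PEFT path; the tiny model and the LoRA
# hyperparameters are placeholders, not values from this project's config.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")  # small public model, for illustration
lora_config_obj = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["c_attn"],  # GPT-2-style attention projection
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_config_obj)
model.print_trainable_parameters()  # only the LoRA adapters should be trainable
```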
```diff
@@ -324,7 +370,9 @@ def train(config_path, dataset_name, output_dir):
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
-        disable_tqdm=training_config.get("disable_tqdm", False)
+        disable_tqdm=training_config.get("disable_tqdm", False),
+        # Important: Don't remove columns that don't match model's forward method
+        remove_unused_columns=False
     )
     
     # Create trainer with pre-tokenized collator
```
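Since `report_to` can now end up as `["none"]` and the pre-tokenized columns must survive `Trainer`'s column pruning, here is a hedged sketch of the `TrainingArguments` knobs this hunk touches, with placeholder values:

```python
# Placeholder values throughout; only report_to, disable_tqdm and
# remove_unused_columns mirror what this hunk configures.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./outputs",          # placeholder path
    per_device_train_batch_size=2,
    report_to=["none"],              # fallback when neither tensorboard nor wandb is available
    disable_tqdm=False,
    remove_unused_columns=False,     # keep pre-tokenized columns Trainer would otherwise drop
)
```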