George-API committed
Commit 41f3c3b · verified · 1 Parent(s): 4915b65

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  run_cloud_training.py  +75 -27
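
The commit message says the script was pushed with the huggingface_hub client. As a rough illustration only (the target repo id and local path are not shown in this commit, so the values below are placeholders), such an upload is typically done with huggingface_hub's upload_file:

    # Hypothetical reconstruction of the upload step; repo_id and paths are placeholders.
    from huggingface_hub import upload_file

    upload_file(
        path_or_fileobj="run_cloud_training.py",   # local file being pushed
        path_in_repo="run_cloud_training.py",      # destination path inside the repo
        repo_id="George-API/<repo-name>",          # placeholder - actual repo id not shown
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )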
run_cloud_training.py CHANGED
@@ -21,6 +21,14 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel

+# Check if tensorboard is available
+try:
+    import tensorboard
+    TENSORBOARD_AVAILABLE = True
+except ImportError:
+    TENSORBOARD_AVAILABLE = False
+    print("Tensorboard not available. Will skip tensorboard logging.")
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -80,7 +88,17 @@ def load_and_prepare_dataset(dataset_name, config):
         logger.info(f"Shuffling dataset with seed {shuffle_seed}")
         dataset = dataset.shuffle(seed=shuffle_seed)

+        # Print dataset structure for debugging
         logger.info(f"Dataset loaded with {len(dataset)} entries")
+        logger.info(f"Dataset columns: {dataset.column_names}")
+
+        # Print a sample entry to understand structure
+        if len(dataset) > 0:
+            sample = dataset[0]
+            logger.info(f"Sample entry structure: {list(sample.keys())}")
+            if 'conversations' in sample:
+                logger.info(f"Sample conversations structure: {sample['conversations'][:1]}")
+
         return dataset

     except Exception as e:
@@ -102,18 +120,47 @@ class PreTokenizedCollator(DataCollatorMixin):
         self.pad_token_id = pad_token_id

     def __call__(self, features):
+        # Print a sample feature to understand structure
+        if len(features) > 0:
+            logger.info(f"Sample feature keys: {list(features[0].keys())}")
+
+        # Extract input_ids from conversations if needed
+        processed_features = []
+        for feature in features:
+            # If input_ids is not directly available, try to extract from conversations
+            if 'input_ids' not in feature and 'conversations' in feature:
+                # Extract from conversations based on your dataset structure
+                # This is a placeholder - adjust based on actual structure
+                conversations = feature['conversations']
+                if isinstance(conversations, list) and len(conversations) > 0:
+                    # Assuming input_ids might be in the content field
+                    if 'content' in conversations[0]:
+                        feature['input_ids'] = conversations[0]['content']
+                    # Or it might be the conversation itself
+                    elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
+                        feature['input_ids'] = conversations[0]['input_ids']
+
+            processed_features.append(feature)
+
+        # If we still don't have input_ids, log an error
+        if len(processed_features) > 0 and 'input_ids' not in processed_features[0]:
+            logger.error(f"Could not find input_ids in features. Available keys: {list(processed_features[0].keys())}")
+            if 'conversations' in processed_features[0]:
+                logger.error(f"Conversations structure: {processed_features[0]['conversations'][:1]}")
+            raise ValueError("Could not find input_ids in dataset. Please check dataset structure.")
+
         # Determine max length in this batch
-        batch_max_len = max(len(x["input_ids"]) for x in features)
+        batch_max_len = max(len(x["input_ids"]) for x in processed_features)

         # Initialize batch tensors
         batch = {
-            "input_ids": torch.ones((len(features), batch_max_len), dtype=torch.long) * self.pad_token_id,
-            "attention_mask": torch.zeros((len(features), batch_max_len), dtype=torch.long),
-            "labels": torch.ones((len(features), batch_max_len), dtype=torch.long) * -100  # -100 is ignored in loss
+            "input_ids": torch.ones((len(processed_features), batch_max_len), dtype=torch.long) * self.pad_token_id,
+            "attention_mask": torch.zeros((len(processed_features), batch_max_len), dtype=torch.long),
+            "labels": torch.ones((len(processed_features), batch_max_len), dtype=torch.long) * -100  # -100 is ignored in loss
         }

         # Fill batch tensors
-        for i, feature in enumerate(features):
+        for i, feature in enumerate(processed_features):
             input_ids = feature["input_ids"]
             seq_len = len(input_ids)

@@ -274,36 +321,35 @@ def train(config_path, dataset_name, output_dir):
     dtype = torch.float16 if hardware_config.get("fp16", True) else None
     model, tokenizer = load_model_safely(model_name, max_seq_length, dtype)

-    # Apply LoRA - correctly passing lora_config_obj directly
+    # Try different approaches to apply LoRA
     logger.info("Applying LoRA to model")
-    try:
-        logger.info("Attempting to apply LoRA with unsloth API")
-        model = FastLanguageModel.get_peft_model(
-            model,
-            lora_config=lora_config_obj,  # Pass lora_config directly instead of peft_config
-            tokenizer=tokenizer,
-            use_gradient_checkpointing=hardware_config.get("gradient_checkpointing", True)
-        )
-    except Exception as e:
-        logger.warning(f"Error applying LoRA with unsloth: {e}")
-        logger.info("Falling back to standard PEFT method")
-
-        # Try with standard PEFT approach if unsloth fails
-        from peft import get_peft_model
-        model = get_peft_model(model, lora_config_obj)
-        logger.info("Successfully applied LoRA with standard PEFT")

+    # Skip unsloth's method and go directly to PEFT
+    logger.info("Using standard PEFT method to apply LoRA")
+    from peft import get_peft_model
+    model = get_peft_model(model, lora_config_obj)
+    logger.info("Successfully applied LoRA with standard PEFT")
+
     # No need to format the dataset - it's already pre-tokenized
     logger.info("Using pre-tokenized dataset - skipping tokenization step")
     training_dataset = dataset

-    # Configure wandb if API key is available
-    reports = ["tensorboard"]
+    # Configure reporting backends with fallbacks
+    reports = []
+    if TENSORBOARD_AVAILABLE:
+        reports.append("tensorboard")
+        logger.info("Tensorboard available and enabled for reporting")
+    else:
+        logger.warning("Tensorboard not available - metrics won't be logged to tensorboard")
+
     if os.getenv("WANDB_API_KEY"):
         reports.append("wandb")
         logger.info("Wandb API key found, enabling wandb reporting")
-    else:
-        logger.info("No Wandb API key found, using tensorboard only")
+
+    # Default to "none" if no reporting backends are available
+    if not reports:
+        reports = ["none"]
+        logger.warning("No reporting backends available - training metrics won't be logged")

     # Set up training arguments
     training_args = TrainingArguments(
@@ -324,7 +370,9 @@ def train(config_path, dataset_name, output_dir):
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
-        disable_tqdm=training_config.get("disable_tqdm", False)
+        disable_tqdm=training_config.get("disable_tqdm", False),
+        # Important: Don't remove columns that don't match model's forward method
+        remove_unused_columns=False
     )

     # Create trainer with pre-tokenized collator
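
The last hunk ends on the context line just before the trainer is built, which falls outside this diff. As a minimal sketch only (assuming transformers' Trainer plus the model, tokenizer, training_dataset, training_args, and PreTokenizedCollator names visible in the diff; the file's actual constructor call is not shown here), a pre-tokenized collator is typically wired in like this:

    # Sketch under the assumptions above - not the file's literal code.
    from transformers import Trainer

    trainer = Trainer(
        model=model,                      # LoRA-wrapped model from get_peft_model
        args=training_args,               # TrainingArguments built above (report_to=reports)
        train_dataset=training_dataset,   # already tokenized, so no map/tokenize step
        data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id),
    )
    trainer.train()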