George-API committed (verified)
Commit 6b2c2bc · Parent(s): 00a06ef

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py  +50 -16
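The commit message notes that the script was pushed with huggingface_hub rather than through the web UI. For context, a minimal sketch of such an upload using the library's HfApi.upload_file; the repo_id below is a placeholder, not a value taken from this commit:

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",   # local file to upload
        path_in_repo="run_cloud_training.py",      # destination path inside the repo
        repo_id="your-username/your-repo",         # placeholder repo id
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )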
run_cloud_training.py CHANGED
@@ -135,7 +135,7 @@ def load_and_prepare_dataset(dataset_name, config):
     """
     Load and prepare the dataset for fine-tuning.
     Sort entries by prompt_number as required.
-    NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
+    Handles both pre-tokenized and string content.
     """
     # Use the default dataset path if no specific path is provided
     if dataset_name == "phi4-cognitive-dataset":
@@ -182,9 +182,35 @@ def load_and_prepare_dataset(dataset_name, config):
         if len(dataset) > 0:
             sample = dataset[0]
             logger.info(f"Sample entry structure: {list(sample.keys())}")
-            if 'conversations' in sample:
+
+            # Check if dataset is pre-tokenized or contains string content
+            is_pre_tokenized = False
+
+            if 'input_ids' in sample and isinstance(sample['input_ids'], list) and all(isinstance(x, int) for x in sample['input_ids']):
+                logger.info("Dataset appears to be pre-tokenized with input_ids field")
+                is_pre_tokenized = True
+            elif 'conversations' in sample:
                 logger.info(f"Sample conversations structure: {sample['conversations'][:1]}")

+                # Check if conversations contain pre-tokenized data
+                if isinstance(sample['conversations'], list) and len(sample['conversations']) > 0:
+                    conv = sample['conversations'][0]
+                    if isinstance(conv, dict) and 'input_ids' in conv and isinstance(conv['input_ids'], list):
+                        logger.info("Dataset appears to be pre-tokenized in conversations.input_ids")
+                        is_pre_tokenized = True
+                    elif isinstance(conv, dict) and 'content' in conv:
+                        content = conv['content']
+                        if isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            logger.info("Dataset appears to be pre-tokenized in conversations.content")
+                            is_pre_tokenized = True
+                        else:
+                            logger.info("Dataset appears to contain string content that will need tokenization")
+
+            if is_pre_tokenized:
+                logger.info("Using pre-tokenized dataset - tokenizer will only be used as fallback")
+            else:
+                logger.info("Dataset contains string content - tokenizer will be used")
+
         return dataset

     except Exception as e:
@@ -208,12 +234,12 @@ def tokenize_string(text, tokenizer):
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
     """
-    Data collator for pre-tokenized datasets.
-    Expects input_ids and labels already tokenized.
+    Data collator that can handle both pre-tokenized datasets and string content.
+    Will tokenize strings if necessary, but logs warnings.
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
-        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for debugging only
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for fallback tokenization

     def __call__(self, features):
         # Print a sample feature to understand structure
@@ -251,26 +277,33 @@ class PreTokenizedCollator(DataCollatorMixin):
                     feature['input_ids'] = conversations

                 # Case 3: If conversations is a list of dicts with 'content' field
-                # This should be avoided for pre-tokenized datasets
                 elif isinstance(conversations[0], dict) and 'content' in conversations[0]:
                     content = conversations[0]['content']

                     # If content is already a list of integers, use it directly
                     if isinstance(content, list) and all(isinstance(x, int) for x in content):
                         feature['input_ids'] = content
-                    # AVOID TOKENIZATION: Log warning if content is a string
-                    elif isinstance(content, str):
-                        logger.warning("Found string content in pre-tokenized dataset. This should not happen.")
-                        logger.warning("Skipping this example to avoid tokenization.")
+                    # If content is a string, tokenize it with a warning
+                    elif isinstance(content, str) and self.tokenizer:
+                        logger.warning("Found string content in dataset. Tokenizing as fallback.")
+                        feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
+                    else:
+                        logger.warning(f"Unexpected content format: {type(content)}")
                         continue
+
+                # Case 4: If conversations is a list of strings
+                elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
+                    # Join all strings and tokenize
+                    logger.warning("Found string conversations in dataset. Tokenizing as fallback.")
+                    full_text = " ".join(conversations)
+                    feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)

             # Ensure input_ids is a list of integers
             if 'input_ids' in feature:
-                # AVOID TOKENIZATION: Skip string input_ids
-                if isinstance(feature['input_ids'], str):
-                    logger.warning("Found string input_ids in pre-tokenized dataset. This should not happen.")
-                    logger.warning("Skipping this example to avoid tokenization.")
-                    continue
+                # If input_ids is a string, tokenize it
+                if isinstance(feature['input_ids'], str) and self.tokenizer:
+                    logger.warning("Found string input_ids in dataset. Tokenizing as fallback.")
+                    feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
                 # If input_ids is not a list, convert it
                 elif not isinstance(feature['input_ids'], list):
                     try:
@@ -569,7 +602,8 @@ def train(config_path, dataset_name, output_dir):
     logger.info("Successfully applied LoRA with standard PEFT")

     # No need to format the dataset - it's already pre-tokenized
-    logger.info("Using pre-tokenized dataset - skipping tokenization step")
+    logger.info("Using dataset with flexible tokenization handling")
+    logger.info("Will use pre-tokenized data if available, or tokenize strings as fallback")
     training_dataset = dataset

     # Configure reporting backends with fallbacks
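To make the detection logic above concrete, here is a purely illustrative sketch of the two record shapes it distinguishes; the field values are invented, not taken from phi4-cognitive-dataset:

    # Hypothetical pre-tokenized record: integer ids are used directly, no tokenizer needed
    pre_tokenized_record = {
        "prompt_number": 1,
        "conversations": [{"role": "user", "input_ids": [101, 7592, 2088, 102]}],
    }

    # Hypothetical string-content record: triggers the collator's tokenizer fallback
    string_record = {
        "prompt_number": 2,
        "conversations": [{"role": "user", "content": "Explain the training setup."}],
    }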
 
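Finally, a minimal sketch of how the updated PreTokenizedCollator is typically handed to a Trainer; the checkpoint name and TrainingArguments values are placeholders, and the actual wiring inside train() is not shown in this diff:

    from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")   # placeholder checkpoint
    model = AutoModelForCausalLM.from_pretrained("microsoft/phi-4")

    collator = PreTokenizedCollator(
        pad_token_id=tokenizer.pad_token_id or 0,  # pad id used when batching
        tokenizer=tokenizer,                       # kept for the string-content fallback path
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir="output", per_device_train_batch_size=1),
        train_dataset=training_dataset,            # the dataset prepared by load_and_prepare_dataset
        data_collator=collator,                    # pads and, if needed, tokenizes string content
    )
    trainer.train()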