Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+50 -16)
@@ -135,7 +135,7 @@ def load_and_prepare_dataset(dataset_name, config):
     """
     Load and prepare the dataset for fine-tuning.
     Sort entries by prompt_number as required.
-
+    Handles both pre-tokenized and string content.
     """
     # Use the default dataset path if no specific path is provided
     if dataset_name == "phi4-cognitive-dataset":
@@ -182,9 +182,35 @@ def load_and_prepare_dataset(dataset_name, config):
         if len(dataset) > 0:
             sample = dataset[0]
             logger.info(f"Sample entry structure: {list(sample.keys())}")
-
+
+            # Check if dataset is pre-tokenized or contains string content
+            is_pre_tokenized = False
+
+            if 'input_ids' in sample and isinstance(sample['input_ids'], list) and all(isinstance(x, int) for x in sample['input_ids']):
+                logger.info("Dataset appears to be pre-tokenized with input_ids field")
+                is_pre_tokenized = True
+            elif 'conversations' in sample:
                 logger.info(f"Sample conversations structure: {sample['conversations'][:1]}")

+                # Check if conversations contain pre-tokenized data
+                if isinstance(sample['conversations'], list) and len(sample['conversations']) > 0:
+                    conv = sample['conversations'][0]
+                    if isinstance(conv, dict) and 'input_ids' in conv and isinstance(conv['input_ids'], list):
+                        logger.info("Dataset appears to be pre-tokenized in conversations.input_ids")
+                        is_pre_tokenized = True
+                    elif isinstance(conv, dict) and 'content' in conv:
+                        content = conv['content']
+                        if isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            logger.info("Dataset appears to be pre-tokenized in conversations.content")
+                            is_pre_tokenized = True
+                        else:
+                            logger.info("Dataset appears to contain string content that will need tokenization")
+
+            if is_pre_tokenized:
+                logger.info("Using pre-tokenized dataset - tokenizer will only be used as fallback")
+            else:
+                logger.info("Dataset contains string content - tokenizer will be used")
+
         return dataset

     except Exception as e:
@@ -208,12 +234,12 @@ def tokenize_string(text, tokenizer):
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
     """
-    Data collator
-
+    Data collator that can handle both pre-tokenized datasets and string content.
+    Will tokenize strings if necessary, but logs warnings.
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
-        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for fallback tokenization

     def __call__(self, features):
         # Print a sample feature to understand structure
@@ -251,26 +277,33 @@ class PreTokenizedCollator(DataCollatorMixin):
                    feature['input_ids'] = conversations

                # Case 3: If conversations is a list of dicts with 'content' field
-               # This should be avoided for pre-tokenized datasets
                elif isinstance(conversations[0], dict) and 'content' in conversations[0]:
                    content = conversations[0]['content']

                    # If content is already a list of integers, use it directly
                    if isinstance(content, list) and all(isinstance(x, int) for x in content):
                        feature['input_ids'] = content
-                   #
-                   elif isinstance(content, str):
-                       logger.warning("Found string content in
-
+                   # If content is a string, tokenize it with a warning
+                   elif isinstance(content, str) and self.tokenizer:
+                       logger.warning("Found string content in dataset. Tokenizing as fallback.")
+                       feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
+                   else:
+                       logger.warning(f"Unexpected content format: {type(content)}")
                        continue
+
+               # Case 4: If conversations is a list of strings
+               elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
+                   # Join all strings and tokenize
+                   logger.warning("Found string conversations in dataset. Tokenizing as fallback.")
+                   full_text = " ".join(conversations)
+                   feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)

            # Ensure input_ids is a list of integers
            if 'input_ids' in feature:
-               #
-               if isinstance(feature['input_ids'], str):
-                   logger.warning("Found string input_ids in
-
-                   continue
+               # If input_ids is a string, tokenize it
+               if isinstance(feature['input_ids'], str) and self.tokenizer:
+                   logger.warning("Found string input_ids in dataset. Tokenizing as fallback.")
+                   feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
                # If input_ids is not a list, convert it
                elif not isinstance(feature['input_ids'], list):
                    try:
@@ -569,7 +602,8 @@ def train(config_path, dataset_name, output_dir):
     logger.info("Successfully applied LoRA with standard PEFT")

     # No need to format the dataset - it's already pre-tokenized
-    logger.info("Using
+    logger.info("Using dataset with flexible tokenization handling")
+    logger.info("Will use pre-tokenized data if available, or tokenize strings as fallback")
     training_dataset = dataset

     # Configure reporting backends with fallbacks
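For reference, a minimal sketch of the two entry shapes that the detection logic added to load_and_prepare_dataset distinguishes. The field values below are hypothetical examples for illustration, not taken from phi4-cognitive-dataset:

# Hypothetical entry shapes (illustrative only).
pre_tokenized_entry = {
    "prompt_number": 1,
    "conversations": [
        {"input_ids": [101, 2023, 2003, 102]},  # already token ids -> is_pre_tokenized = True
    ],
}
string_entry = {
    "prompt_number": 2,
    "conversations": [
        {"content": "Explain what LoRA fine-tuning does."},  # raw text -> tokenizer fallback is used
    ],
}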
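A minimal sketch of how the updated PreTokenizedCollator might be wired into a Hugging Face Trainer. The surrounding objects (model, tokenizer, dataset) and the TrainingArguments values are assumptions for illustration; the actual setup lives elsewhere in run_cloud_training.py:

from transformers import Trainer, TrainingArguments

# Assumed to exist already: `model`, `tokenizer`, and the prepared `dataset`.
collator = PreTokenizedCollator(
    pad_token_id=tokenizer.pad_token_id,  # pad id taken from the tokenizer
    tokenizer=tokenizer,                  # kept only for fallback tokenization of string content
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="output", per_device_train_batch_size=1),
    train_dataset=dataset,
    data_collator=collator,  # pads pre-tokenized input_ids, tokenizes strings as a fallback
)
trainer.train()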
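The fallback path relies on tokenizer.encode returning a plain list of token ids. A small usage example, assuming any Hugging Face tokenizer (the model id below is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
ids = tokenizer.encode("some raw string content", add_special_tokens=False)
print(type(ids), ids[:5])  # a plain list of ints, matching the pre-tokenized input_ids format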