George-API committed (verified)
Commit 6b2c2bc · Parent(s): 00a06ef

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py  +50 -16
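The commit message notes that the script was pushed with huggingface_hub rather than through the web UI. For context, a minimal sketch of such an upload using the library's HfApi.upload_file; the repo_id below is a placeholder, not a value taken from this commit:

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",   # local file to upload
        path_in_repo="run_cloud_training.py",      # destination path inside the repo
        repo_id="your-username/your-repo",         # placeholder repo id
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )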
run_cloud_training.py CHANGED
@@ -135,7 +135,7 @@ def load_and_prepare_dataset(dataset_name, config):
     """
     Load and prepare the dataset for fine-tuning.
     Sort entries by prompt_number as required.
-    NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
+    Handles both pre-tokenized and string content.
     """
     # Use the default dataset path if no specific path is provided
     if dataset_name == "phi4-cognitive-dataset":
@@ -182,9 +182,35 @@ def load_and_prepare_dataset(dataset_name, config):
         if len(dataset) > 0:
             sample = dataset[0]
             logger.info(f"Sample entry structure: {list(sample.keys())}")
-            if 'conversations' in sample:
+
+            # Check if dataset is pre-tokenized or contains string content
+            is_pre_tokenized = False
+
+            if 'input_ids' in sample and isinstance(sample['input_ids'], list) and all(isinstance(x, int) for x in sample['input_ids']):
+                logger.info("Dataset appears to be pre-tokenized with input_ids field")
+                is_pre_tokenized = True
+            elif 'conversations' in sample:
                 logger.info(f"Sample conversations structure: {sample['conversations'][:1]}")

+                # Check if conversations contain pre-tokenized data
+                if isinstance(sample['conversations'], list) and len(sample['conversations']) > 0:
+                    conv = sample['conversations'][0]
+                    if isinstance(conv, dict) and 'input_ids' in conv and isinstance(conv['input_ids'], list):
+                        logger.info("Dataset appears to be pre-tokenized in conversations.input_ids")
+                        is_pre_tokenized = True
+                    elif isinstance(conv, dict) and 'content' in conv:
+                        content = conv['content']
+                        if isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            logger.info("Dataset appears to be pre-tokenized in conversations.content")
+                            is_pre_tokenized = True
+                        else:
+                            logger.info("Dataset appears to contain string content that will need tokenization")
+
+            if is_pre_tokenized:
+                logger.info("Using pre-tokenized dataset - tokenizer will only be used as fallback")
+            else:
+                logger.info("Dataset contains string content - tokenizer will be used")
+
         return dataset

     except Exception as e:
@@ -208,12 +234,12 @@ def tokenize_string(text, tokenizer):
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
     """
-    Data collator for pre-tokenized datasets.
-    Expects input_ids and labels already tokenized.
+    Data collator that can handle both pre-tokenized datasets and string content.
+    Will tokenize strings if necessary, but logs warnings.
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
-        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for debugging only
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for fallback tokenization

     def __call__(self, features):
         # Print a sample feature to understand structure
@@ -251,26 +277,33 @@ class PreTokenizedCollator(DataCollatorMixin):
                     feature['input_ids'] = conversations

                 # Case 3: If conversations is a list of dicts with 'content' field
-                # This should be avoided for pre-tokenized datasets
                 elif isinstance(conversations[0], dict) and 'content' in conversations[0]:
                     content = conversations[0]['content']

                     # If content is already a list of integers, use it directly
                     if isinstance(content, list) and all(isinstance(x, int) for x in content):
                         feature['input_ids'] = content
-                    # AVOID TOKENIZATION: Log warning if content is a string
-                    elif isinstance(content, str):
-                        logger.warning("Found string content in pre-tokenized dataset. This should not happen.")
-                        logger.warning("Skipping this example to avoid tokenization.")
+                    # If content is a string, tokenize it with a warning
+                    elif isinstance(content, str) and self.tokenizer:
+                        logger.warning("Found string content in dataset. Tokenizing as fallback.")
+                        feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
+                    else:
+                        logger.warning(f"Unexpected content format: {type(content)}")
                         continue
+
+                # Case 4: If conversations is a list of strings
+                elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
+                    # Join all strings and tokenize
+                    logger.warning("Found string conversations in dataset. Tokenizing as fallback.")
+                    full_text = " ".join(conversations)
+                    feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)

             # Ensure input_ids is a list of integers
             if 'input_ids' in feature:
-                # AVOID TOKENIZATION: Skip string input_ids
-                if isinstance(feature['input_ids'], str):
-                    logger.warning("Found string input_ids in pre-tokenized dataset. This should not happen.")
-                    logger.warning("Skipping this example to avoid tokenization.")
-                    continue
+                # If input_ids is a string, tokenize it
+                if isinstance(feature['input_ids'], str) and self.tokenizer:
+                    logger.warning("Found string input_ids in dataset. Tokenizing as fallback.")
+                    feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
                 # If input_ids is not a list, convert it
                 elif not isinstance(feature['input_ids'], list):
                     try:
@@ -569,7 +602,8 @@ def train(config_path, dataset_name, output_dir):
     logger.info("Successfully applied LoRA with standard PEFT")

     # No need to format the dataset - it's already pre-tokenized
-    logger.info("Using pre-tokenized dataset - skipping tokenization step")
+    logger.info("Using dataset with flexible tokenization handling")
+    logger.info("Will use pre-tokenized data if available, or tokenize strings as fallback")
     training_dataset = dataset

     # Configure reporting backends with fallbacks
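To make the detection logic above concrete, here is a purely illustrative sketch of the two record shapes it distinguishes; the field values are invented, not taken from phi4-cognitive-dataset:

    # Hypothetical pre-tokenized record: integer ids are used directly, no tokenizer needed
    pre_tokenized_record = {
        "prompt_number": 1,
        "conversations": [{"role": "user", "input_ids": [101, 7592, 2088, 102]}],
    }

    # Hypothetical string-content record: triggers the collator's tokenizer fallback
    string_record = {
        "prompt_number": 2,
        "conversations": [{"role": "user", "content": "Explain the training setup."}],
    }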
 
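Finally, a minimal sketch of how the updated PreTokenizedCollator is typically handed to a Trainer; the checkpoint name and TrainingArguments values are placeholders, and the actual wiring inside train() is not shown in this diff:

    from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")   # placeholder checkpoint
    model = AutoModelForCausalLM.from_pretrained("microsoft/phi-4")

    collator = PreTokenizedCollator(
        pad_token_id=tokenizer.pad_token_id or 0,  # pad id used when batching
        tokenizer=tokenizer,                       # kept for the string-content fallback path
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir="output", per_device_train_batch_size=1),
        train_dataset=training_dataset,            # the dataset prepared by load_and_prepare_dataset
        data_collator=collator,                    # pads and, if needed, tokenizes string content
    )
    trainer.train()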