George-API committed on
Commit
60950b2
·
verified ·
1 Parent(s): 41f3c3b

Upload run_cloud_training.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_cloud_training.py +60 -7
run_cloud_training.py CHANGED
@@ -110,14 +110,24 @@ def load_and_prepare_dataset(dataset_name, config):
110
  print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
111
  raise
112
 
 
 
 
 
 
 
 
 
 
113
  # Data collator for pre-tokenized dataset
114
  class PreTokenizedCollator(DataCollatorMixin):
115
  """
116
  Data collator for pre-tokenized datasets.
117
  Expects input_ids and labels already tokenized.
118
  """
119
- def __init__(self, pad_token_id=0):
120
  self.pad_token_id = pad_token_id
 
121
 
122
  def __call__(self, features):
123
  # Print a sample feature to understand structure
@@ -130,15 +140,58 @@ class PreTokenizedCollator(DataCollatorMixin):
130
  # If input_ids is not directly available, try to extract from conversations
131
  if 'input_ids' not in feature and 'conversations' in feature:
132
  # Extract from conversations based on your dataset structure
133
- # This is a placeholder - adjust based on actual structure
134
  conversations = feature['conversations']
 
 
 
135
  if isinstance(conversations, list) and len(conversations) > 0:
136
- # Assuming input_ids might be in the content field
137
- if 'content' in conversations[0]:
138
- feature['input_ids'] = conversations[0]['content']
139
- # Or it might be the conversation itself
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
141
  feature['input_ids'] = conversations[0]['input_ids']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  processed_features.append(feature)
144
 
@@ -380,7 +433,7 @@ def train(config_path, dataset_name, output_dir):
380
  model=model,
381
  args=training_args,
382
  train_dataset=training_dataset,
383
- data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id),
384
  )
385
 
386
  # Start training
 
110
  print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
111
  raise
112
 
113
def tokenize_string(text, tokenizer):
    """Encode *text* into a list of token ids using *tokenizer*.

    Falsy input (empty string or None) short-circuits to an empty list so
    callers never pass nothing to the tokenizer. Special tokens are not added.
    """
    if text:
        return tokenizer.encode(text, add_special_tokens=False)
    return []
121
+
122
  # Data collator for pre-tokenized dataset
123
  class PreTokenizedCollator(DataCollatorMixin):
124
  """
125
  Data collator for pre-tokenized datasets.
126
  Expects input_ids and labels already tokenized.
127
  """
128
+ def __init__(self, pad_token_id=0, tokenizer=None):
129
  self.pad_token_id = pad_token_id
130
+ self.tokenizer = tokenizer # Keep a reference to the tokenizer for string conversion
131
 
132
  def __call__(self, features):
133
  # Print a sample feature to understand structure
 
140
  # If input_ids is not directly available, try to extract from conversations
141
  if 'input_ids' not in feature and 'conversations' in feature:
142
  # Extract from conversations based on your dataset structure
 
143
  conversations = feature['conversations']
144
+
145
+ # Debug the conversations structure
146
+ logger.info(f"Conversations type: {type(conversations)}")
147
  if isinstance(conversations, list) and len(conversations) > 0:
148
+ logger.info(f"First conversation type: {type(conversations[0])}")
149
+ logger.info(f"First conversation: {conversations[0]}")
150
+
151
+ # Try different approaches to extract input_ids
152
+ if isinstance(conversations, list) and len(conversations) > 0:
153
+ # Case 1: If conversations is a list of dicts with 'content' field
154
+ if isinstance(conversations[0], dict) and 'content' in conversations[0]:
155
+ content = conversations[0]['content']
156
+ logger.info(f"Found content field: {type(content)}")
157
+
158
+ # If content is a string, tokenize it
159
+ if isinstance(content, str) and self.tokenizer:
160
+ logger.info(f"Tokenizing string content: {content[:50]}...")
161
+ feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
162
+ # If content is already a list of integers, use it directly
163
+ elif isinstance(content, list) and all(isinstance(x, int) for x in content):
164
+ feature['input_ids'] = content
165
+ # If content is already tokenized in some other format
166
+ else:
167
+ logger.warning(f"Unexpected content format: {type(content)}")
168
+
169
+ # Case 2: If conversations is a list of dicts with 'input_ids' field
170
  elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
171
  feature['input_ids'] = conversations[0]['input_ids']
172
+
173
+ # Case 3: If conversations itself contains the input_ids
174
+ elif all(isinstance(x, int) for x in conversations):
175
+ feature['input_ids'] = conversations
176
+
177
+ # Case 4: If conversations is a list of strings
178
+ elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
179
+ # Join all strings and tokenize
180
+ full_text = " ".join(conversations)
181
+ feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)
182
+
183
+ # Ensure input_ids is a list of integers
184
+ if 'input_ids' in feature:
185
+ # If input_ids is a string, tokenize it
186
+ if isinstance(feature['input_ids'], str) and self.tokenizer:
187
+ logger.info(f"Converting string input_ids to tokens: {feature['input_ids'][:50]}...")
188
+ feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
189
+ # If input_ids is not a list, convert it
190
+ elif not isinstance(feature['input_ids'], list):
191
+ try:
192
+ feature['input_ids'] = list(feature['input_ids'])
193
+ except:
194
+ logger.error(f"Could not convert input_ids to list: {type(feature['input_ids'])}")
195
 
196
  processed_features.append(feature)
197
 
 
433
  model=model,
434
  args=training_args,
435
  train_dataset=training_dataset,
436
+ data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id, tokenizer=tokenizer),
437
  )
438
 
439
  # Start training