Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py  CHANGED  +60 -7
@@ -110,14 +110,24 @@ def load_and_prepare_dataset(dataset_name, config):
         print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
         raise
 
+def tokenize_string(text, tokenizer):
+    """Tokenize a string using the provided tokenizer"""
+    if not text:
+        return []
+
+    # Tokenize the text
+    tokens = tokenizer.encode(text, add_special_tokens=False)
+    return tokens
+
 # Data collator for pre-tokenized dataset
 class PreTokenizedCollator(DataCollatorMixin):
     """
     Data collator for pre-tokenized datasets.
     Expects input_ids and labels already tokenized.
     """
-    def __init__(self, pad_token_id=0):
+    def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for string conversion
 
     def __call__(self, features):
         # Print a sample feature to understand structure
@@ -130,15 +140,58 @@ class PreTokenizedCollator(DataCollatorMixin):
             # If input_ids is not directly available, try to extract from conversations
             if 'input_ids' not in feature and 'conversations' in feature:
                 # Extract from conversations based on your dataset structure
-                # This is a placeholder - adjust based on actual structure
                 conversations = feature['conversations']
+
+                # Debug the conversations structure
+                logger.info(f"Conversations type: {type(conversations)}")
                 if isinstance(conversations, list) and len(conversations) > 0:
-
-
-
-
+                    logger.info(f"First conversation type: {type(conversations[0])}")
+                    logger.info(f"First conversation: {conversations[0]}")
+
+                # Try different approaches to extract input_ids
+                if isinstance(conversations, list) and len(conversations) > 0:
+                    # Case 1: If conversations is a list of dicts with 'content' field
+                    if isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                        content = conversations[0]['content']
+                        logger.info(f"Found content field: {type(content)}")
+
+                        # If content is a string, tokenize it
+                        if isinstance(content, str) and self.tokenizer:
+                            logger.info(f"Tokenizing string content: {content[:50]}...")
+                            feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
+                        # If content is already a list of integers, use it directly
+                        elif isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            feature['input_ids'] = content
+                        # If content is already tokenized in some other format
+                        else:
+                            logger.warning(f"Unexpected content format: {type(content)}")
+
+                    # Case 2: If conversations is a list of dicts with 'input_ids' field
                     elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
                         feature['input_ids'] = conversations[0]['input_ids']
+
+                    # Case 3: If conversations itself contains the input_ids
+                    elif all(isinstance(x, int) for x in conversations):
+                        feature['input_ids'] = conversations
+
+                    # Case 4: If conversations is a list of strings
+                    elif all(isinstance(x, str) for x in conversations) and self.tokenizer:
+                        # Join all strings and tokenize
+                        full_text = " ".join(conversations)
+                        feature['input_ids'] = self.tokenizer.encode(full_text, add_special_tokens=False)
+
+            # Ensure input_ids is a list of integers
+            if 'input_ids' in feature:
+                # If input_ids is a string, tokenize it
+                if isinstance(feature['input_ids'], str) and self.tokenizer:
+                    logger.info(f"Converting string input_ids to tokens: {feature['input_ids'][:50]}...")
+                    feature['input_ids'] = self.tokenizer.encode(feature['input_ids'], add_special_tokens=False)
+                # If input_ids is not a list, convert it
+                elif not isinstance(feature['input_ids'], list):
+                    try:
+                        feature['input_ids'] = list(feature['input_ids'])
+                    except:
+                        logger.error(f"Could not convert input_ids to list: {type(feature['input_ids'])}")
 
             processed_features.append(feature)
 
@@ -380,7 +433,7 @@ def train(config_path, dataset_name, output_dir):
         model=model,
         args=training_args,
         train_dataset=training_dataset,
-        data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id),
+        data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id, tokenizer=tokenizer),
     )
 
     # Start training
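
For readers who want to sanity-check the new extraction paths without running the full training job, the branch logic this commit adds to PreTokenizedCollator.__call__ can be exercised in isolation. The sketch below re-implements just that case analysis against a toy tokenizer; FakeTokenizer and extract_input_ids are illustrative names that do not exist in run_cloud_training.py, and the real collator goes on to pad and batch the extracted ids (that part of __call__ is outside this diff).

# Minimal, self-contained sketch of the extraction logic added in this commit.
# FakeTokenizer and extract_input_ids are hypothetical stand-ins, not names from the script.

class FakeTokenizer:
    """Toy stand-in for a Hugging Face tokenizer: one fake id per whitespace-separated word."""
    def encode(self, text, add_special_tokens=False):
        return [len(word) for word in text.split()]

def extract_input_ids(feature, tokenizer=None):
    """Mirror of the commit's case analysis for features that lack 'input_ids'."""
    if 'input_ids' not in feature and 'conversations' in feature:
        conversations = feature['conversations']
        if isinstance(conversations, list) and len(conversations) > 0:
            first = conversations[0]
            # Case 1: list of dicts carrying a 'content' field (string or token ids)
            if isinstance(first, dict) and 'content' in first:
                content = first['content']
                if isinstance(content, str) and tokenizer:
                    feature['input_ids'] = tokenizer.encode(content, add_special_tokens=False)
                elif isinstance(content, list) and all(isinstance(x, int) for x in content):
                    feature['input_ids'] = content
            # Case 2: list of dicts that already carry 'input_ids'
            elif isinstance(first, dict) and 'input_ids' in first:
                feature['input_ids'] = first['input_ids']
            # Case 3: conversations is itself a flat list of token ids
            elif all(isinstance(x, int) for x in conversations):
                feature['input_ids'] = conversations
            # Case 4: list of strings, joined and tokenized
            elif all(isinstance(x, str) for x in conversations) and tokenizer:
                feature['input_ids'] = tokenizer.encode(" ".join(conversations),
                                                        add_special_tokens=False)
    return feature

if __name__ == "__main__":
    tok = FakeTokenizer()
    print(extract_input_ids({'conversations': [{'content': "hello there world"}]}, tok))   # Case 1
    print(extract_input_ids({'conversations': [101, 2023, 2003, 102]}, tok))               # Case 3
    print(extract_input_ids({'conversations': ["user: hi", "assistant: hello"]}, tok))     # Case 4

Running the sketch prints the extracted input_ids for a 'content' dict, a flat list of token ids, and a list of strings, mirroring Cases 1, 3 and 4 in the diff above.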