Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+95 -67)
@@ -21,12 +21,14 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
+# Disable all attention optimizations that might cause issues
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["XFORMERS_DISABLED"] = "1"
+
 # Configure PyTorch memory allocator for better memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
-# Disable flash attention globally
-os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-
 # Configure logging first
 logging.basicConfig(
     level=logging.INFO,
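Switches like these are generally read when the corresponding library initializes, so setting them before the heavy imports is the safe ordering. A minimal standalone sketch of that ordering (illustration only, not part of the commit):

import os

# Export the switches first; the values mirror what run_cloud_training.py sets.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"   # synchronous kernel launches, clearer CUDA errors
os.environ["XFORMERS_DISABLED"] = "1"

# ...and only then import the libraries that read them.
import torch

print("CUDA available:", torch.cuda.is_available())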
@@ -211,7 +213,7 @@ class PreTokenizedCollator(DataCollatorMixin):
     """
     def __init__(self, pad_token_id=0, tokenizer=None):
         self.pad_token_id = pad_token_id
-        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for
+        self.tokenizer = tokenizer  # Keep a reference to the tokenizer for debugging only
 
     def __call__(self, features):
         # Print a sample feature to understand structure
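The padding step of PreTokenizedCollator is outside the hunks shown here. As a rough sketch of what that step usually looks like for pre-tokenized causal-LM batches (PadOnlyCollator is a hypothetical stand-in, not the class from this file):

from dataclasses import dataclass
import torch

@dataclass
class PadOnlyCollator:
    """Hypothetical stand-in for the padding step of a pre-tokenized collator."""
    pad_token_id: int = 0

    def __call__(self, features):
        # features: list of dicts that already contain integer input_ids
        max_len = max(len(f["input_ids"]) for f in features)
        input_ids, attention_mask = [], []
        for f in features:
            ids = list(f["input_ids"])
            pad = [self.pad_token_id] * (max_len - len(ids))
            input_ids.append(ids + pad)
            attention_mask.append([1] * len(ids) + [0] * len(pad))
        batch = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        }
        batch["labels"] = batch["input_ids"].clone()  # causal-LM labels mirror the inputs
        return batch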
@@ -221,66 +223,73 @@ class PreTokenizedCollator(DataCollatorMixin):
         # Extract input_ids from conversations if needed
         processed_features = []
         for feature in features:
+            # If input_ids is directly available, use it without tokenization
+            if 'input_ids' in feature and isinstance(feature['input_ids'], list):
+                # Already tokenized, no processing needed
+                processed_features.append(feature)
+                continue
+
             # If input_ids is not directly available, try to extract from conversations
             if 'input_ids' not in feature and 'conversations' in feature:
                 # Extract from conversations based on your dataset structure
                 conversations = feature['conversations']
 
-                # Debug the conversations structure
-
-
-
-
+                # Debug the conversations structure (only for first batch)
+                if len(processed_features) == 0:
+                    logger.info(f"Conversations type: {type(conversations)}")
+                    if isinstance(conversations, list) and len(conversations) > 0:
+                        logger.info(f"First conversation type: {type(conversations[0])}")
 
                 # Try different approaches to extract input_ids
                 if isinstance(conversations, list) and len(conversations) > 0:
-                    # Case 1: If conversations is a list of dicts with 'content' field
-                    if isinstance(conversations[0], dict) and 'content' in conversations[0]:
-                        content = conversations[0]['content']
-                        logger.info(f"Found content field: {type(content)}")
-
-                        # If content is a string, tokenize it
-                        if isinstance(content, str) and self.tokenizer:
-                            logger.info(f"Tokenizing string content: {content[:50]}...")
-                            feature['input_ids'] = self.tokenizer.encode(content, add_special_tokens=False)
-                        # If content is already a list of integers, use it directly
-                        elif isinstance(content, list) and all(isinstance(x, int) for x in content):
-                            feature['input_ids'] = content
-                        # If content is already tokenized in some other format
-                        else:
-                            logger.warning(f"Unexpected content format: {type(content)}")
-
-                    # Case 2: If conversations is a list of dicts with 'input_ids' field
-                    elif isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
+                    # Case 1: If conversations is a list of dicts with 'input_ids' field (pre-tokenized)
+                    if isinstance(conversations[0], dict) and 'input_ids' in conversations[0]:
                         feature['input_ids'] = conversations[0]['input_ids']
 
-                    # Case
+                    # Case 2: If conversations itself contains the input_ids (pre-tokenized)
                     elif all(isinstance(x, int) for x in conversations):
                         feature['input_ids'] = conversations
 
-                    # Case
-
-
-
-
+                    # Case 3: If conversations is a list of dicts with 'content' field
+                    # This should be avoided for pre-tokenized datasets
+                    elif isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                        content = conversations[0]['content']
+
+                        # If content is already a list of integers, use it directly
+                        if isinstance(content, list) and all(isinstance(x, int) for x in content):
+                            feature['input_ids'] = content
+                        # AVOID TOKENIZATION: Log warning if content is a string
+                        elif isinstance(content, str):
+                            logger.warning("Found string content in pre-tokenized dataset. This should not happen.")
+                            logger.warning("Skipping this example to avoid tokenization.")
+                            continue
 
             # Ensure input_ids is a list of integers
             if 'input_ids' in feature:
-                #
-                if isinstance(feature['input_ids'], str)
-                    logger.
-
+                # AVOID TOKENIZATION: Skip string input_ids
+                if isinstance(feature['input_ids'], str):
+                    logger.warning("Found string input_ids in pre-tokenized dataset. This should not happen.")
+                    logger.warning("Skipping this example to avoid tokenization.")
+                    continue
                 # If input_ids is not a list, convert it
                 elif not isinstance(feature['input_ids'], list):
                     try:
                         feature['input_ids'] = list(feature['input_ids'])
                     except:
                        logger.error(f"Could not convert input_ids to list: {type(feature['input_ids'])}")
+                        continue
+            else:
+                logger.warning("No input_ids found in this example. Skipping.")
+                continue
 
             processed_features.append(feature)
 
         # If we still don't have input_ids, log an error
-        if len(processed_features)
+        if len(processed_features) == 0:
+            logger.error("No valid examples found in batch. Check dataset format.")
+            raise ValueError("No valid examples found. Please check dataset structure.")
+
+        if 'input_ids' not in processed_features[0]:
            logger.error(f"Could not find input_ids in features. Available keys: {list(processed_features[0].keys())}")
            if 'conversations' in processed_features[0]:
                logger.error(f"Conversations structure: {processed_features[0]['conversations'][:1]}")
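For orientation, an illustration (not from the commit) of the pre-tokenized feature layouts the rewritten __call__ accepts; the token id values are arbitrary and a PreTokenizedCollator instance is assumed to be importable:

features = [
    {"input_ids": [101, 2009, 2003, 102]},                 # fast path: already flat
    {"conversations": [{"input_ids": [101, 2023, 102]}]},  # Case 1
    {"conversations": [101, 2045, 2003, 102]},             # Case 2
    {"conversations": [{"content": [101, 2054, 102]}]},    # Case 3 with integer content
]
collator = PreTokenizedCollator(pad_token_id=0)
batch = collator(features)  # string content anywhere in these slots is now skipped with a warning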
@@ -344,6 +353,11 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     """
     global flash_attention_available
 
+    # Force disable flash attention and xformers
+    flash_attention_available = False
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    os.environ["XFORMERS_DISABLED"] = "1"
+
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
 
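The flash_attention_available global itself is defined outside the shown hunks; presumably it is initialized near the top of the file with an import probe along these lines (an assumption, not visible in this diff):

try:
    import flash_attn  # noqa: F401
    flash_attention_available = True
except ImportError:
    flash_attention_available = False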
@@ -364,37 +378,42 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
-            quantization_config=bnb_config
+            quantization_config=bnb_config,
+            attn_implementation="eager"  # Force eager attention
         )
         logger.info("Model loaded successfully with unsloth")
+
+        # Explicitly disable flash attention in model config
+        if hasattr(model, 'config'):
+            if hasattr(model.config, 'attn_implementation'):
+                model.config.attn_implementation = "eager"
+            if hasattr(model.config, 'use_flash_attention'):
+                model.config.use_flash_attention = False
+            if hasattr(model.config, 'use_flash_attention_2'):
+                model.config.use_flash_attention_2 = False
+
         return model, tokenizer
 
     except Exception as e:
         logger.warning(f"Unsloth loading failed: {e}")
         logger.info("Falling back to standard Hugging Face loading...")
 
-        # We'll try
-        attn_params = {}
-
-        # If flash attention is available, try to use it
-        if flash_attention_available:
-            logger.info("Flash Attention is available - setting appropriate parameters")
-            # For newer models that support attn_implementation parameter
-            attn_params = {"attn_implementation": "eager"}  # Default to eager for compatibility
-
-            # Try to use flash attention if available
-            try:
-                # Try importing flash attention to confirm it's available
-                import flash_attn
-                logger.info(f"Using Flash Attention version {flash_attn.__version__}")
-                attn_params = {"attn_implementation": "flash_attention_2"}
-            except Exception as flash_error:
-                logger.warning(f"Flash Attention import failed: {flash_error}")
+        # We'll try with HF loading
+        attn_params = {"attn_implementation": "eager"}  # Always use eager
 
         # Approach 1: Using attn_implementation parameter (newer method)
         try:
             logger.info(f"Trying HF loading with attention parameters: {attn_params}")
             config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+            # Disable flash attention in config
+            if hasattr(config, 'attn_implementation'):
+                config.attn_implementation = "eager"
+            if hasattr(config, 'use_flash_attention'):
+                config.use_flash_attention = False
+            if hasattr(config, 'use_flash_attention_2'):
+                config.use_flash_attention_2 = False
+
             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
             # The proper way to set attention implementation in newer transformers
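The bnb_config passed to the unsloth call above is defined outside the shown hunks. For 4-bit QLoRA-style loading it would typically be a BitsAndBytesConfig like the following (an assumed shape, not the file's actual definition):

import torch
from transformers import BitsAndBytesConfig

# Assumed shape of bnb_config; its real definition is not part of this diff.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)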
@@ -416,6 +435,15 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
 
             # Approach 2: Complete fallback with minimal parameters
             config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+            # Disable flash attention in config
+            if hasattr(config, 'attn_implementation'):
+                config.attn_implementation = "eager"
+            if hasattr(config, 'use_flash_attention'):
+                config.use_flash_attention = False
+            if hasattr(config, 'use_flash_attention_2'):
+                config.use_flash_attention_2 = False
+
             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
             # Most basic loading without any attention parameters
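The "most basic loading" call itself is also outside the shown hunks; a hedged sketch of what such a minimal fallback usually reduces to, reusing the model_name, config and dtype already in scope inside load_model_safely:

from transformers import AutoModelForCausalLM

# Assumed shape of the minimal fallback load, not the file's exact call.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    trust_remote_code=True,
    torch_dtype=dtype,
    device_map="auto",
)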
@@ -447,19 +475,19 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
-    #
+    # Force disable flash attention and xformers
+    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    os.environ["XFORMERS_DISABLED"] = "1"
+    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+
+    # Update flash attention setting to always use eager
     global flash_attention_available
-
-
-        # If flash attention is available, set attn_implementation to flash_attention_2
-        hardware_config["attn_implementation"] = "flash_attention_2"
-    else:
-        logger.info("Flash Attention not available - setting to eager attention")
-        hardware_config["attn_implementation"] = "eager"
+    flash_attention_available = False
+    logger.info("Flash Attention has been DISABLED globally")
 
-    #
-
-
+    # Update hardware config to ensure eager attention
+    hardware_config["attn_implementation"] = "eager"
+    hardware_config["use_flash_attention"] = False
 
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)
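Taken together, train() patches its configuration dictionaries before any model is loaded. The config sections visible in this diff imply a structure roughly like the following sketch; only the keys that appear in the diff are grounded, the rest are placeholders:

config = {
    "hardware_config": {
        "attn_implementation": "eager",   # forced by this commit
        "use_flash_attention": False,     # forced by this commit
    },
    "lora_config": {},                    # LoRA hyperparameters (contents not shown in the diff)
    "dataset_config": {
        "training_phase_only": True,
    },
}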