Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+30 -9)
@@ -412,6 +412,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
+    # Configure PyTorch memory allocator for better memory management with multiple GPUs
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
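The hunk ends just as the BitsAndBytesConfig call opens, so its arguments are not visible here. A minimal 4-bit setup, with assumed (not confirmed) values, looks roughly like this:

import torch
from transformers import BitsAndBytesConfig

# Assumed 4-bit settings; the actual arguments used in run_cloud_training.py are not shown in this hunk.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize weights to 4 bits at load time
    bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the actual matmuls
)

Whatever the real values are, the resulting bnb_config is passed to from_pretrained via quantization_config in a later hunk.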
@@ -428,6 +431,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     # Skip Unsloth and use standard HuggingFace loading
     logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
 
+    # Check available GPUs
+    gpu_count = torch.cuda.device_count()
+    logger.info(f"Found {gpu_count} GPU(s) available")
+
     # Load with standard HuggingFace
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
 
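Since 4-bit bitsandbytes loading needs a CUDA device, a guard next to this check can fail fast on CPU-only machines. This is not part of the commit, just a sketch of the idea:

import torch

gpu_count = torch.cuda.device_count()
if gpu_count == 0:
    # bitsandbytes 4-bit kernels require CUDA; stop with a clear error instead of
    # failing later inside from_pretrained.
    raise RuntimeError("No CUDA GPUs detected; 4-bit quantized loading requires at least one GPU")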
@@ -442,10 +449,14 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
+    # Use auto device mapping for multi-GPU setup
+    device_map = "auto" if gpu_count > 1 else "auto"
+    logger.info(f"Using device_map={device_map} for model distribution")
+
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         config=config,
-        device_map=
+        device_map=device_map,
         torch_dtype=dtype or torch.float16,
         quantization_config=bnb_config,
         trust_remote_code=True,
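Both branches of the device_map expression above evaluate to "auto", so the model is always sharded automatically by accelerate regardless of gpu_count. If per-GPU memory needs to be capped explicitly, from_pretrained also accepts a max_memory mapping; the sketch below uses placeholder values and a placeholder model id, not the ones from this script:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_name = "org/some-4bit-model"  # placeholder; the script passes its own model_name argument
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Illustrative per-GPU caps; adjust to the actual hardware. The "cpu" entry leaves offload headroom.
max_memory = {i: "20GiB" for i in range(torch.cuda.device_count())}
max_memory["cpu"] = "48GiB"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",               # let accelerate place layers across the visible GPUs
    max_memory=max_memory,           # optional per-device memory budget
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)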
@@ -465,6 +476,9 @@ def train(config_path, dataset_name, output_dir):
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
+    # Configure PyTorch memory allocator for better memory management with multiple GPUs
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
     # Try to unload xformers if it's loaded
     if 'xformers' in sys.modules:
         logger.info("Removing xformers from sys.modules")
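Two details here are timing-sensitive: PYTORCH_CUDA_ALLOC_CONF is read when the CUDA caching allocator initializes, so it should be set before any CUDA tensor is created, and deleting an entry from sys.modules only affects subsequent imports. A standalone sketch of the same pattern:

import os
import sys

# Set allocator behaviour before any CUDA allocation happens in this process.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Drop xformers and its submodules so later imports cannot re-enable its
# memory-efficient attention kernels.
for name in [m for m in list(sys.modules) if m == "xformers" or m.startswith("xformers.")]:
    del sys.modules[name]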
@@ -510,6 +524,12 @@ def train(config_path, dataset_name, output_dir):
     logger.info(f"Output directory: {output_dir}")
     logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
 
+    # Check GPU availability
+    gpu_count = torch.cuda.device_count()
+    logger.info(f"Found {gpu_count} GPU(s) available")
+    for i in range(gpu_count):
+        logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+
     # Load and prepare the dataset
     dataset = load_and_prepare_dataset(dataset_name, config)
 
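Beyond the device names, it can help to log how much memory each GPU has free before the sharded 4-bit load begins. This is not in the commit; a possible addition:

import logging
import torch

logger = logging.getLogger(__name__)

for i in range(torch.cuda.device_count()):
    free, total = torch.cuda.mem_get_info(i)  # bytes free / total on device i
    logger.info(
        f"GPU {i}: {torch.cuda.get_device_name(i)} - "
        f"{free / 1024**3:.1f} GiB free of {total / 1024**3:.1f} GiB"
    )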
@@ -524,9 +544,9 @@ def train(config_path, dataset_name, output_dir):
     # Initialize model
     logger.info("Initializing model (preserving 4-bit quantization)")
 
-    #
-    max_seq_length =
-    logger.info(f"Using
+    # Use full sequence length of 2048 as required for pre-tokenized dataset
+    max_seq_length = training_config.get("max_seq_length", 2048)
+    logger.info(f"Using sequence length: {max_seq_length} as required for pre-tokenized dataset")
 
     # Create LoRA config directly
     logger.info("Creating LoRA configuration")
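training_config appears to be the dict parsed from the training config file, and .get falls back to 2048 when the key is absent. Since the dataset is pre-tokenized to that length, a sanity check against the tokenizer can catch silent truncation; a sketch assuming tokenizer, training_config and logger are already in scope:

max_seq_length = training_config.get("max_seq_length", 2048)

# Warn if the tokenizer advertises a shorter context than the pre-tokenized samples expect.
if getattr(tokenizer, "model_max_length", None) and tokenizer.model_max_length < max_seq_length:
    logger.warning(
        f"Tokenizer model_max_length={tokenizer.model_max_length} is smaller than "
        f"max_seq_length={max_seq_length}; pre-tokenized samples may be truncated"
    )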
@@ -582,10 +602,10 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    #
-    #
-    per_device_train_batch_size =
-    logger.info(f"Using
+    # Optimize batch size for multi-GPU setup
+    # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
+    per_device_train_batch_size = 4 if gpu_count >= 4 else 2
+    logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
     training_args_dict = {
         "output_dir": output_dir,
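The effective (global) batch size is per-device batch size × number of GPUs × gradient accumulation steps, which is exactly what the log line above computes inline; with 4 GPUs and the default of 4 accumulation steps that is 4 × 4 × 4 = 64 samples per optimizer step. A small helper makes the formula explicit:

def effective_batch_size(per_device: int, num_gpus: int, grad_accum_steps: int) -> int:
    """Number of samples that contribute to a single optimizer step."""
    return per_device * max(num_gpus, 1) * grad_accum_steps

# The values chosen above for a 4-GPU node with the default of 4 accumulation steps:
assert effective_batch_size(4, 4, 4) == 64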
@@ -607,7 +627,8 @@ def train(config_path, dataset_name, output_dir):
         "logging_first_step": training_config.get("logging_first_step", True),
         "disable_tqdm": training_config.get("disable_tqdm", False),
         "remove_unused_columns": False,
-        "seed": 42
+        "seed": 42,
+        "dataloader_num_workers": 4  # Use multiple workers for data loading
     }
 
     # Create TrainingArguments with validated parameters
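Both "seed" and "dataloader_num_workers" are standard TrainingArguments fields, so unpacking the dict keeps working. A minimal sketch of the mapping, with placeholder values rather than the script's full dict:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",          # placeholder; the script uses its own output_dir
    remove_unused_columns=False,  # keep the extra columns of the pre-tokenized dataset
    seed=42,                      # reproducible shuffling and initialization
    dataloader_num_workers=4,     # parallel workers feeding the GPUs
)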