George-API committed
Commit 862c3c6 · verified · 1 parent: c58ed8b

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py +30 -9
run_cloud_training.py CHANGED
@@ -412,6 +412,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
+    # Configure PyTorch memory allocator for better memory management with multiple GPUs
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
@@ -428,6 +431,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
     # Skip Unsloth and use standard HuggingFace loading
     logger.info("Bypassing Unsloth optimizations to avoid memory-efficient attention issues")
 
+    # Check available GPUs
+    gpu_count = torch.cuda.device_count()
+    logger.info(f"Found {gpu_count} GPU(s) available")
+
     # Load with standard HuggingFace
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
 
@@ -442,10 +449,14 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
+    # Use auto device mapping for multi-GPU setup
+    device_map = "auto" if gpu_count > 1 else "auto"
+    logger.info(f"Using device_map={device_map} for model distribution")
+
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         config=config,
-        device_map="auto",
+        device_map=device_map,
         torch_dtype=dtype or torch.float16,
         quantization_config=bnb_config,
         trust_remote_code=True,
@@ -465,6 +476,9 @@ def train(config_path, dataset_name, output_dir):
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
+    # Configure PyTorch memory allocator for better memory management with multiple GPUs
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
     # Try to unload xformers if it's loaded
     if 'xformers' in sys.modules:
         logger.info("Removing xformers from sys.modules")
@@ -510,6 +524,12 @@ def train(config_path, dataset_name, output_dir):
     logger.info(f"Output directory: {output_dir}")
     logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
 
+    # Check GPU availability
+    gpu_count = torch.cuda.device_count()
+    logger.info(f"Found {gpu_count} GPU(s) available")
+    for i in range(gpu_count):
+        logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+
     # Load and prepare the dataset
     dataset = load_and_prepare_dataset(dataset_name, config)
 
@@ -524,9 +544,9 @@ def train(config_path, dataset_name, output_dir):
     # Initialize model
     logger.info("Initializing model (preserving 4-bit quantization)")
 
-    # Reduce max sequence length to avoid memory issues
-    max_seq_length = min(training_config.get("max_seq_length", 2048), 1024)
-    logger.info(f"Using reduced max sequence length: {max_seq_length} to avoid memory issues")
+    # Use full sequence length of 2048 as required for pre-tokenized dataset
+    max_seq_length = training_config.get("max_seq_length", 2048)
+    logger.info(f"Using sequence length: {max_seq_length} as required for pre-tokenized dataset")
 
     # Create LoRA config directly
     logger.info("Creating LoRA configuration")
@@ -582,10 +602,10 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    # Set up training arguments with correct parameters
-    # REDUCE BATCH SIZE to avoid memory issues with attention
-    per_device_train_batch_size = 1  # Reduced from default of 2
-    logger.info(f"Using reduced batch size: {per_device_train_batch_size} to avoid memory issues")
+    # Optimize batch size for multi-GPU setup
+    # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
+    per_device_train_batch_size = 4 if gpu_count >= 4 else 2
+    logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
     training_args_dict = {
         "output_dir": output_dir,
@@ -607,7 +627,8 @@ def train(config_path, dataset_name, output_dir):
        "logging_first_step": training_config.get("logging_first_step", True),
        "disable_tqdm": training_config.get("disable_tqdm", False),
        "remove_unused_columns": False,
-       "seed": 42
+       "seed": 42,
+       "dataloader_num_workers": 4  # Use multiple workers for data loading
     }
 
     # Create TrainingArguments with validated parameters
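For context, a minimal standalone sketch of the loading pattern these hunks converge on: set the allocator option before any CUDA work, then load the already-quantized model with a BitsAndBytesConfig and device_map="auto" so accelerate can shard it across the visible GPUs. The model name and the BitsAndBytesConfig fields below are placeholders/common defaults for illustration, not values taken from this repo's config.

import os
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Must be set before the first CUDA allocation for the allocator option to take effect
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

model_name = "some-org/some-4bit-model"  # placeholder, not the repo's actual model

# Hypothetical 4-bit settings; the file's own BitsAndBytesConfig fields are not shown in this diff
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

gpu_count = torch.cuda.device_count()
device_map = "auto"  # lets accelerate place layers across all visible GPUs
print(f"Found {gpu_count} GPU(s); using device_map={device_map}")

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    device_map=device_map,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)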
 
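As a quick check of the effective-batch-size arithmetic in the new log line, assuming the 4x L4 setup mentioned in the comment and the gradient_accumulation_steps default of 4 used by the .get() call:

# Effective batch size = per-device batch * number of GPUs * gradient accumulation steps
per_device_train_batch_size = 4   # chosen when gpu_count >= 4
gpu_count = 4                     # assumption from the "4x L4 GPUs" comment
gradient_accumulation_steps = 4   # the .get() default used in the log line

effective_batch_size = per_device_train_batch_size * gpu_count * gradient_accumulation_steps
print(effective_batch_size)  # 64 sequences per optimizer step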