Twelve2five committed
Commit f38c379 · verified · 1 Parent(s): 139f757

Update app.py

Files changed (1)
  1. app.py +72 -84
app.py CHANGED
@@ -643,56 +643,16 @@ def train_model(
 
     data_collator = seq2seq_causal_collator
 
-    # --- Define Training Arguments and Initialize Trainer ---
-    progress(0.65, desc="Setting up training configuration...")
-
-    # Output directories
-    OUTPUT_TRAINING_DIR = "./llama3-8b-rvq-qlora-finetuned-run"
-    LOGGING_DIR = "./llama3-8b-rvq-qlora-logs-run"
-
-    # Training parameters - adjusted for 4x T4 GPUs
-    NUM_EPOCHS = int(epochs)
-    BATCH_SIZE_PER_DEVICE = int(batch_size) # Smaller per-device batch size to avoid OOM
-    GRAD_ACCUMULATION_STEPS = int(grad_accum_steps)
-    LEARNING_RATE = float(learning_rate)
-    WEIGHT_DECAY = 0.01
-    WARMUP_RATIO = 0.03
-    LR_SCHEDULER = "cosine"
-    OPTIMIZER = "paged_adamw_8bit"
-
-    # Calculate total steps and warmup steps
-    # Total batch size is now batch_size × num_gpus × grad_accum_steps
-    total_train_batch_size = BATCH_SIZE_PER_DEVICE * n_gpus * GRAD_ACCUMULATION_STEPS
-    num_training_steps = math.ceil((len(train_dataset) * NUM_EPOCHS) / total_train_batch_size)
-    num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
-
-    # Logging/Saving frequency
-    steps_per_epoch = math.ceil(len(train_dataset) / total_train_batch_size)
-    LOGGING_STEPS = max(10, steps_per_epoch // 15)
-    SAVE_STEPS = max(50, steps_per_epoch // 10)
-
-    log.append(f"Dataset size: {len(train_dataset)}")
-    log.append(f"Number of GPUs: {n_gpus}")
-    log.append(f"Batch size per device: {BATCH_SIZE_PER_DEVICE}")
-    log.append(f"Gradient Accumulation steps: {GRAD_ACCUMULATION_STEPS}")
-    log.append(f"Total train batch size (effective): {total_train_batch_size}")
-    log.append(f"Total optimization steps: {num_training_steps}")
-    log.append(f"Warmup steps: {num_warmup_steps}")
-
-    # --- Create DeepSpeed configuration file ---
-    progress(0.7, desc="Creating DeepSpeed configuration...")
-    # DeepSpeed ZeRO-3 config optimized for T4 GPUs
+    # --- DeepSpeed Configuration ---
+    # Create DeepSpeed config file directly in Python instead of loading from a file
+    progress(0.15, desc="Setting up DeepSpeed configuration...")
+
     ds_config = {
         "fp16": {
-            "enabled": "auto",
-            "loss_scale": 0,
-            "loss_scale_window": 1000,
-            "initial_scale_power": 16,
-            "hysteresis": 2,
-            "min_loss_scale": 1
+            "enabled": False
         },
         "bf16": {
-            "enabled": "auto"
+            "enabled": True
         },
         "zero_optimization": {
             "stage": 3,
@@ -708,60 +668,88 @@ def train_model(
             "contiguous_gradients": True,
             "reduce_bucket_size": "auto",
             "stage3_prefetch_bucket_size": "auto",
-            "stage3_param_persistence_threshold": "auto",
-            "gather_16bit_weights_on_model_save": True,
-            "stage3_max_live_parameters": 1e9,
-            "stage3_max_reuse_distance": 1e9
+            "stage3_param_persistence_threshold": "auto"
         },
-        "gradient_accumulation_steps": GRAD_ACCUMULATION_STEPS,
-        "gradient_clipping": "auto",
-        "steps_per_print": 10,
-        "train_batch_size": "auto",
-        "train_micro_batch_size_per_gpu": "auto",
-        "wall_clock_breakdown": False
+        "gradient_accumulation_steps": grad_accum_steps,
+        "train_micro_batch_size_per_gpu": batch_size,
+        "gradient_clipping": 1.0,
+        "steps_per_print": 10
     }
 
+    # Save the config to a file
     with open("ds_config.json", "w") as f:
         json.dump(ds_config, f, indent=4)
 
-    # Configure for multi-GPU training using DeepSpeed
+    log.append("DeepSpeed configuration created successfully")
+
+    # --- Training Arguments ---
     progress(0.75, desc="Setting up training arguments...")
+    output_dir = f"./results_{model_repo_name}"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Create training arguments without DeepSpeed first
     training_args = TrainingArguments(
-        output_dir=OUTPUT_TRAINING_DIR,
-        num_train_epochs=NUM_EPOCHS,
-        per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
-        gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
-        optim=OPTIMIZER,
-        logging_dir=LOGGING_DIR,
-        logging_strategy="steps",
-        logging_steps=LOGGING_STEPS,
-        save_strategy="steps",
-        save_steps=SAVE_STEPS,
-        save_total_limit=2,
-        learning_rate=LEARNING_RATE,
-        weight_decay=WEIGHT_DECAY,
-        warmup_steps=num_warmup_steps,
-        lr_scheduler_type=LR_SCHEDULER,
+        output_dir=output_dir,
+        num_train_epochs=float(epochs),
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=grad_accum_steps,
+        learning_rate=learning_rate,
+        weight_decay=0.01,
+        logging_dir=f"{output_dir}/logs",
+        logging_steps=10,
+        save_steps=100,
+        save_total_limit=3,
+        remove_unused_columns=False,
+        push_to_hub=False,
+        disable_tqdm=False,
+        warmup_ratio=0.03,
+        lr_scheduler_type="cosine",
         report_to="tensorboard",
         bf16=True if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else False,
         gradient_checkpointing=True,
         gradient_checkpointing_kwargs={'use_reentrant': False},
 
-        # Multi-GPU specific settings
-        deepspeed="ds_config.json",
+        # For multi-GPU - use a different approach for DeepSpeed
         ddp_find_unused_parameters=False,
     )
 
-    # --- Initialize Trainer ---
-    progress(0.8, desc="Initializing trainer...")
-    trainer = Trainer(
-        model=model_to_train,
-        args=training_args,
-        train_dataset=train_dataset,
-        data_collator=data_collator,
-    )
-
-    log.append("Trainer initialized with DeepSpeed for multi-GPU training.")
+    # Now initialize DeepSpeed separately
+    if n_gpus > 1:
+        log.append("Setting up DeepSpeed for multi-GPU training")
+        try:
+            import deepspeed
+            from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
+            # Modify the trainer to use DeepSpeed
+            trainer_kwargs = {
+                "model": model_to_train,
+                "args": training_args,
+                "train_dataset": train_dataset,
+                "data_collator": data_collator,
+                "deepspeed": ds_config, # Pass the config as a dict
+            }
+
+            trainer = Trainer(**trainer_kwargs)
+            log.append("Trainer initialized with DeepSpeed for multi-GPU training")
+        except Exception as e:
+            log.append(f"Warning: Could not initialize DeepSpeed: {e}")
+            # Fallback to standard distributed training
+            trainer = Trainer(
+                model=model_to_train,
+                args=training_args,
+                train_dataset=train_dataset,
+                data_collator=data_collator,
+            )
+            log.append("Falling back to standard distributed training")
+    else:
+        # Single GPU setup
+        trainer = Trainer(
+            model=model_to_train,
+            args=training_args,
+            train_dataset=train_dataset,
+            data_collator=data_collator,
+        )
+        log.append("Trainer initialized for single GPU training")
 
     # --- Start Training ---
     # Clear cache before starting
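
Note: the added code builds the ZeRO-3 config as a Python dict and passes it to Trainer via a "deepspeed" keyword; in the transformers integration the DeepSpeed config is normally consumed through TrainingArguments (as the removed code did with a JSON path), and TrainingArguments also accepts the dict form directly. Below is a minimal, hypothetical sketch of that wiring, not the app's code: it assumes transformers and deepspeed are installed, and the output directory and batch values are illustrative placeholders.

    # Sketch only: hand a DeepSpeed ZeRO-3 dict to TrainingArguments,
    # which is where transformers' Trainer integration reads it.
    from transformers import TrainingArguments

    ds_config_sketch = {
        "bf16": {"enabled": "auto"},  # "auto" defers to the bf16 flag on TrainingArguments
        "zero_optimization": {
            "stage": 3,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
        },
        # keep these consistent with the TrainingArguments values below
        "gradient_accumulation_steps": 4,
        "train_micro_batch_size_per_gpu": 1,
        "gradient_clipping": 1.0,
        "steps_per_print": 10,
    }

    training_args_sketch = TrainingArguments(
        output_dir="./results_sketch",      # hypothetical output directory
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        deepspeed=ds_config_sketch,         # a dict is accepted as well as a path to a JSON file
    )

Passing the config here (rather than to Trainer) lets the existing single-GPU and fallback branches stay unchanged, since DeepSpeed only activates when the script is launched with a distributed launcher.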