Update app.py
app.py (CHANGED)
@@ -643,56 +643,16 @@ def train_model(
 
     data_collator = seq2seq_causal_collator
 
-    # ---
-
-
-
-    OUTPUT_TRAINING_DIR = "./llama3-8b-rvq-qlora-finetuned-run"
-    LOGGING_DIR = "./llama3-8b-rvq-qlora-logs-run"
-
-    # Training parameters - adjusted for 4x T4 GPUs
-    NUM_EPOCHS = int(epochs)
-    BATCH_SIZE_PER_DEVICE = int(batch_size) # Smaller per-device batch size to avoid OOM
-    GRAD_ACCUMULATION_STEPS = int(grad_accum_steps)
-    LEARNING_RATE = float(learning_rate)
-    WEIGHT_DECAY = 0.01
-    WARMUP_RATIO = 0.03
-    LR_SCHEDULER = "cosine"
-    OPTIMIZER = "paged_adamw_8bit"
-
-    # Calculate total steps and warmup steps
-    # Total batch size is now batch_size × num_gpus × grad_accum_steps
-    total_train_batch_size = BATCH_SIZE_PER_DEVICE * n_gpus * GRAD_ACCUMULATION_STEPS
-    num_training_steps = math.ceil((len(train_dataset) * NUM_EPOCHS) / total_train_batch_size)
-    num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
-
-    # Logging/Saving frequency
-    steps_per_epoch = math.ceil(len(train_dataset) / total_train_batch_size)
-    LOGGING_STEPS = max(10, steps_per_epoch // 15)
-    SAVE_STEPS = max(50, steps_per_epoch // 10)
-
-    log.append(f"Dataset size: {len(train_dataset)}")
-    log.append(f"Number of GPUs: {n_gpus}")
-    log.append(f"Batch size per device: {BATCH_SIZE_PER_DEVICE}")
-    log.append(f"Gradient Accumulation steps: {GRAD_ACCUMULATION_STEPS}")
-    log.append(f"Total train batch size (effective): {total_train_batch_size}")
-    log.append(f"Total optimization steps: {num_training_steps}")
-    log.append(f"Warmup steps: {num_warmup_steps}")
-
-    # --- Create DeepSpeed configuration file ---
-    progress(0.7, desc="Creating DeepSpeed configuration...")
-    # DeepSpeed ZeRO-3 config optimized for T4 GPUs
+    # --- DeepSpeed Configuration ---
+    # Create DeepSpeed config file directly in Python instead of loading from a file
+    progress(0.15, desc="Setting up DeepSpeed configuration...")
+
     ds_config = {
         "fp16": {
-            "enabled":
-            "loss_scale": 0,
-            "loss_scale_window": 1000,
-            "initial_scale_power": 16,
-            "hysteresis": 2,
-            "min_loss_scale": 1
+            "enabled": False
         },
         "bf16": {
-            "enabled":
+            "enabled": True
         },
         "zero_optimization": {
             "stage": 3,
@@ -708,60 +668,88 @@ def train_model(
             "contiguous_gradients": True,
             "reduce_bucket_size": "auto",
             "stage3_prefetch_bucket_size": "auto",
-            "stage3_param_persistence_threshold": "auto",
-            "gather_16bit_weights_on_model_save": True,
-            "stage3_max_live_parameters": 1e9,
-            "stage3_max_reuse_distance": 1e9
+            "stage3_param_persistence_threshold": "auto"
         },
-        "gradient_accumulation_steps":
-        "
-        "
-        "
-        "train_micro_batch_size_per_gpu": "auto",
-        "wall_clock_breakdown": False
+        "gradient_accumulation_steps": grad_accum_steps,
+        "train_micro_batch_size_per_gpu": batch_size,
+        "gradient_clipping": 1.0,
+        "steps_per_print": 10
     }
 
+    # Save the config to a file
     with open("ds_config.json", "w") as f:
         json.dump(ds_config, f, indent=4)
 
-
+    log.append("DeepSpeed configuration created successfully")
+
+    # --- Training Arguments ---
     progress(0.75, desc="Setting up training arguments...")
+    output_dir = f"./results_{model_repo_name}"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Create training arguments without DeepSpeed first
     training_args = TrainingArguments(
-        output_dir=
-        num_train_epochs=
-        per_device_train_batch_size=
-        gradient_accumulation_steps=
-
-
-
-        logging_steps=
-
-
-
-
-
-
-        lr_scheduler_type=
+        output_dir=output_dir,
+        num_train_epochs=float(epochs),
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=grad_accum_steps,
+        learning_rate=learning_rate,
+        weight_decay=0.01,
+        logging_dir=f"{output_dir}/logs",
+        logging_steps=10,
+        save_steps=100,
+        save_total_limit=3,
+        remove_unused_columns=False,
+        push_to_hub=False,
+        disable_tqdm=False,
+        warmup_ratio=0.03,
+        lr_scheduler_type="cosine",
         report_to="tensorboard",
         bf16=True if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else False,
         gradient_checkpointing=True,
         gradient_checkpointing_kwargs={'use_reentrant': False},
 
-        #
-        deepspeed="ds_config.json",
+        # For multi-GPU - use a different approach for DeepSpeed
         ddp_find_unused_parameters=False,
     )
 
-    #
-
-
-
-
-
-
-
-
-
+    # Now initialize DeepSpeed separately
+    if n_gpus > 1:
+        log.append("Setting up DeepSpeed for multi-GPU training")
+        try:
+            import deepspeed
+            from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
+            # Modify the trainer to use DeepSpeed
+            trainer_kwargs = {
+                "model": model_to_train,
+                "args": training_args,
+                "train_dataset": train_dataset,
+                "data_collator": data_collator,
+                "deepspeed": ds_config, # Pass the config as a dict
+            }
+
+            trainer = Trainer(**trainer_kwargs)
+            log.append("Trainer initialized with DeepSpeed for multi-GPU training")
+        except Exception as e:
+            log.append(f"Warning: Could not initialize DeepSpeed: {e}")
+            # Fallback to standard distributed training
+            trainer = Trainer(
+                model=model_to_train,
+                args=training_args,
+                train_dataset=train_dataset,
+                data_collator=data_collator,
+            )
+            log.append("Falling back to standard distributed training")
+    else:
+        # Single GPU setup
+        trainer = Trainer(
+            model=model_to_train,
+            args=training_args,
+            train_dataset=train_dataset,
+            data_collator=data_collator,
+        )
+        log.append("Trainer initialized for single GPU training")
 
     # --- Start Training ---
     # Clear cache before starting
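For comparison, here is a minimal sketch of the more conventional way to attach this DeepSpeed config in the Hugging Face transformers API: the deepspeed option lives on TrainingArguments and accepts either a config dict or a path to a JSON file such as the ds_config.json written above, while Trainer itself does not expose a deepspeed keyword, so the try/except in the new code is likely to fall through to the non-DeepSpeed path. The build_trainer helper and its parameters are illustrative only; the variable names (ds_config, model_to_train, train_dataset, data_collator) follow the diff.

# Sketch only, not part of this commit: enable DeepSpeed via TrainingArguments.
from transformers import Trainer, TrainingArguments

def build_trainer(model, train_dataset, data_collator, ds_config,
                  output_dir, epochs, batch_size, grad_accum_steps):
    # DeepSpeed is configured on TrainingArguments (dict or JSON path),
    # not passed to Trainer directly.
    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=float(epochs),
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum_steps,
        bf16=True,
        deepspeed=ds_config,  # or deepspeed="ds_config.json"
    )
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

Launched under a distributed launcher (for example deepspeed or torchrun), the same call covers both the single-GPU and multi-GPU cases, so no separate if n_gpus > 1 branch is needed.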