George-API committed · verified
Commit 2281f75 · 1 Parent(s): 18257ed

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py (+31, -8)
run_cloud_training.py CHANGED
@@ -2,17 +2,20 @@
 # -*- coding: utf-8 -*-
 
 """
-Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-bnb-4bit using unsloth
+Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
+OPTIMIZED FOR L40S GPU (48GB VRAM)
 """
 
 # Set critical environment variables before any imports
 import os
-# Configure PyTorch memory allocator for better memory management with multiple GPUs
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# Configure PyTorch memory allocator for better memory management with L40S GPU
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["XFORMERS_DISABLED"] = "1"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+# L40S-specific CUDA optimization
+os.environ["CUDA_AUTO_BOOST"] = "1"
 
 import json
 import logging
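A note on ordering: PYTORCH_CUDA_ALLOC_CONF is only read when PyTorch initializes its CUDA caching allocator, which is why the commit keeps these exports ahead of every other import. A minimal sketch (editor's illustration, not part of the commit) of the required order, with a memory_stats() call to confirm the allocator is live:

import os

# Must be set before torch first touches the GPU; exporting it afterwards is a no-op.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"

import torch  # deliberately imported after the env var is set

if torch.cuda.is_available():
    torch.ones(1, device="cuda")       # first allocation initializes the caching allocator
    stats = torch.cuda.memory_stats()  # standard per-device allocator counters
    print(stats["allocated_bytes.all.current"])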
@@ -597,10 +600,19 @@ def train(config_path, dataset_name, output_dir):
     # Initialize ds_config_path to None before checking
     ds_config_path = None
 
-    # Optimize batch size for multi-GPU setup
-    # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
-    per_device_train_batch_size = 4 if gpu_count >= 4 else 2
-    logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
+    # Optimize batch size for L40S GPU
+    gpu_info = torch.cuda.get_device_properties(0)
+    logger.info(f"GPU Model: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+
+    # For the L40S GPU, we can use a larger batch size on the single high-memory card
+    if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:  # Check if it's L40S (>40GB VRAM)
+        logger.info("Detected L40S GPU - optimizing for high-memory GPU")
+        per_device_train_batch_size = training_config.get("per_device_train_batch_size", 6)
+        logger.info(f"Using optimized batch size for L40S: {per_device_train_batch_size}")
+    else:
+        # Default to a smaller batch size for other GPUs
+        per_device_train_batch_size = 2
+        logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
     # Check if DeepSpeed config is available and if MPI is disabled
     deepspeed_config = config.get("deepspeed_config", None)
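The detection heuristic above, restated in isolation (editor's sketch; the function name is illustrative, not from the script): any CUDA device reporting more than 40 GB of VRAM is treated like an L40S (~48 GB), so a 24 GB card such as the L4 from the removed multi-GPU path falls into the conservative branch. With the L40S default of 6 and the gradient_accumulation_steps default of 4 used by the removed logging line, the effective batch size on one GPU works out to 6 × 4 = 24.

import torch

def pick_batch_size(large: int = 6, small: int = 2) -> int:
    # Mirrors the branch above: prefer the name check, fall back to a VRAM threshold.
    props = torch.cuda.get_device_properties(0)
    if "L40S" in props.name or props.total_memory > 40e9:
        return large
    return small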
@@ -617,6 +629,17 @@ def train(config_path, dataset_name, output_dir):
     if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
         deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
+    # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
+    if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
+        logger.info("Configuring DeepSpeed specifically for L40S GPU")
+        # Adjust ZeRO stage for L40S (48GB VRAM)
+        deepspeed_config["zero_optimization"]["stage"] = 2
+        # Enable CPU offloading for optimizer states to save GPU memory
+        deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
+        # Adjust communication efficiency for single high-end GPU
+        deepspeed_config["reduce_bucket_size"] = 1e9
+        deepspeed_config["allgather_bucket_size"] = 1e9
+
     # Ensure communication backend is set to avoid MPI
     if "communication_data_type" not in deepspeed_config:
         deepspeed_config["communication_data_type"] = "fp16"
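Two caveats with the added block, illustrated by a defensive variant (editor's sketch, not the commit's code): the chained indexing raises a KeyError if the config JSON ships without an offload_optimizer section, and in DeepSpeed's ZeRO configuration reduce_bucket_size and allgather_bucket_size are normally nested under zero_optimization rather than placed at the top level, so the top-level keys written here may simply be ignored.

def apply_l40s_zero_config(deepspeed_config: dict) -> dict:
    # setdefault guards against missing sections instead of assuming they exist.
    zero = deepspeed_config.setdefault("zero_optimization", {})
    zero["stage"] = 2  # ZeRO stage 2: partition optimizer states and gradients
    zero.setdefault("offload_optimizer", {})["device"] = "cpu"
    zero["reduce_bucket_size"] = 1e9       # nested where DeepSpeed reads them
    zero["allgather_bucket_size"] = 1e9
    return deepspeed_config

# Starting from an empty config, every key is created rather than assumed:
print(apply_l40s_zero_config({}))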
@@ -764,7 +787,7 @@ def train(config_path, dataset_name, output_dir):
     remove_training_marker()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-4bit model (RESEARCH ONLY)")
+    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit model (RESEARCH ONLY)")
     parser.add_argument("--config", type=str, default="transformers_config.json",
                         help="Path to the transformers config JSON file")
     parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
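For reference, both flags visible in this hunk carry defaults (any further arguments are truncated in the diff), so a minimal invocation of the script would be: python run_cloud_training.py --config transformers_config.json --dataset phi4-cognitive-dataset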
 