George-API committed
Commit 30060ee · verified · 1 parent: 17796ef

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +29 -54
run_cloud_training.py CHANGED
@@ -6,6 +6,7 @@ Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unslo
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
 OPTIMIZED FOR L40S GPU (48GB VRAM)
+SUPPORTS ENVIRONMENTS WITHOUT MPI
 """
 
 # Set critical environment variables before any imports
@@ -17,6 +18,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"
 
+# Explicitly disable DeepSpeed MPI requirement
+os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"
+
 import json
 import logging
 import argparse
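Both added lines land before the import block on purpose: environment variables only affect a library if they are already in os.environ when that library is first imported. A minimal sketch of the ordering, assuming (as the script does) that downstream code reads these flags; DEEPSPEED_MPI_REQUIRED is this script's own convention, not a documented DeepSpeed setting:

import os

# Flags must be set BEFORE importing anything that reads them at import time.
os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"           # script-level convention (assumed)
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"  # read by the script's own guards

# Heavy imports come only after the environment is fully configured.
import transformers  # noqa: E402  (late import is deliberate here)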
@@ -31,32 +35,35 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
-# Set DeepSpeed environment variables to disable MPI
+# Configure logging first (before any potential errors with imports)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("training.log")
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Set up DeepSpeed without requiring MPI
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"
 
-# Try to import deepspeed, install mpi4py if needed
+# Try to import deepspeed, with fallback for environments without MPI
+deepspeed_available = False
 try:
     import deepspeed
+    deepspeed_available = True
+    logger.info("DeepSpeed successfully imported")
 except ImportError as e:
-    if "mpi4py" in str(e):
-        logger.warning("mpi4py not found, installing...")
-        import subprocess
-        try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "mpi4py"])
-            import deepspeed
-            logger.info("Successfully installed mpi4py and imported deepspeed")
-        except Exception as install_error:
-            logger.warning(f"Failed to install mpi4py: {install_error}")
-            logger.warning("Continuing without DeepSpeed MPI support")
-            # Set a flag to disable DeepSpeed later
-            os.environ["DISABLE_DEEPSPEED_MPI"] = "1"
-    else:
-        logger.error(f"Failed to import deepspeed: {e}")
-        raise
+    logger.warning(f"Failed to import DeepSpeed: {e}")
+    logger.warning("Will continue without DeepSpeed support")
+    # Set a flag to disable DeepSpeed
+    os.environ["DISABLE_DEEPSPEED"] = "1"
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -84,17 +91,6 @@ class XFormersBlocker:
 # Add our import blocker to sys.meta_path
 sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
 
-# Configure logging first
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler("training.log")
-    ]
-)
-logger = logging.getLogger(__name__)
-
 # Make sure torch is installed and available before proceeding
 try:
     logger.info("Importing torch...")
@@ -614,9 +610,9 @@ def train(config_path, dataset_name, output_dir):
         per_device_train_batch_size = 2
         logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
-    # Check if DeepSpeed config is available and if MPI is disabled
+    # Check if a DeepSpeed config is present and DeepSpeed itself is available
     deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config and os.environ.get("DISABLE_DEEPSPEED_MPI", "0") != "1":
+    if deepspeed_config and deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
         logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
 
         # Create a temporary DeepSpeed config file
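The gate above collapses three signals (a config entry, the deepspeed_available flag, and the DISABLE_DEEPSPEED override) into using_deepspeed and ds_config_path. A hedged sketch of how those two values could be handed to the trainer, assuming the script drives a transformers-style Trainer; build_training_args is a hypothetical helper, not a function from this file:

from transformers import TrainingArguments

def build_training_args(output_dir, batch_size, ds_config_path, using_deepspeed):
    # TrainingArguments accepts a DeepSpeed JSON config path via `deepspeed`;
    # passing None falls back to plain, non-DeepSpeed training.
    return TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        deepspeed=ds_config_path if using_deepspeed else None,
    )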
@@ -629,40 +625,19 @@ def train(config_path, dataset_name, output_dir):
         if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
             deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
-        # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
-        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-            logger.info("Configuring DeepSpeed specifically for L40S GPU")
-            # Adjust ZeRO stage for L40S (48GB VRAM)
-            deepspeed_config["zero_optimization"]["stage"] = 2
-            # Enable CPU offloading for optimizer states to save GPU memory
-            deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
-            # Adjust communication efficiency for single high-end GPU
-            deepspeed_config["reduce_bucket_size"] = 1e9
-            deepspeed_config["allgather_bucket_size"] = 1e9
-
-            # Ensure communication backend is set to avoid MPI
-            if "communication_data_type" not in deepspeed_config:
-                deepspeed_config["communication_data_type"] = "fp16"
-
         # Write the DeepSpeed config to a file
         with open(ds_config_path, 'w') as f:
             json.dump(deepspeed_config, f, indent=2)
 
         logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
-
-        # Enable CPU offloading if configured
-        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
-            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
-
         # Set using_deepspeed flag
         using_deepspeed = True
-    elif os.environ.get("DISABLE_DEEPSPEED_MPI", "0") == "1":
-        logger.warning("DeepSpeed MPI support is disabled due to missing mpi4py. Continuing without DeepSpeed.")
+    elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
+        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
         ds_config_path = None
         using_deepspeed = False
     else:
-        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
+        logger.warning("No DeepSpeed configuration found - using standard training without DeepSpeed")
        ds_config_path = None
         using_deepspeed = False
 