George-API committed on
Commit
aa250a7
·
verified ·
1 Parent(s): fa0ae8d

Upload run_cloud_training.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_cloud_training.py +34 -31
run_cloud_training.py CHANGED
@@ -568,8 +568,40 @@ def train(config_path, dataset_name, output_dir):
568
  # Force eager attention implementation
569
  use_flash_attention = False # Override to force eager implementation
570
 
571
- # Check if we're using DeepSpeed
572
- using_deepspeed = ds_config_path is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
 
574
  # Initialize model with our safe loading function
575
  logger.info("Loading pre-quantized model with eager attention")
@@ -617,35 +649,6 @@ def train(config_path, dataset_name, output_dir):
617
  per_device_train_batch_size = 4 if gpu_count >= 4 else 2
618
  logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
619
 
620
- # Check if DeepSpeed config is available
621
- deepspeed_config = config.get("deepspeed_config", None)
622
- if deepspeed_config:
623
- logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
624
-
625
- # Create a temporary DeepSpeed config file
626
- ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
627
-
628
- # Update DeepSpeed config with dynamic values
629
- if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
630
- deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
631
-
632
- if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
633
- deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
634
-
635
- # Write the DeepSpeed config to a file
636
- with open(ds_config_path, 'w') as f:
637
- json.dump(deepspeed_config, f, indent=2)
638
-
639
- logger.info(f"Created DeepSpeed config at {ds_config_path}")
640
- logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
641
-
642
- # Enable CPU offloading if configured
643
- if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
644
- logger.info("DeepSpeed CPU offloading enabled for optimizer states")
645
- else:
646
- logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
647
- ds_config_path = None
648
-
649
  training_args_dict = {
650
  "output_dir": output_dir,
651
  "num_train_epochs": training_config.get("num_train_epochs", 3),
 
568
  # Force eager attention implementation
569
  use_flash_attention = False # Override to force eager implementation
570
 
571
+ # Initialize ds_config_path to None before checking
572
+ ds_config_path = None
573
+
574
+ # Check if DeepSpeed config is available
575
+ deepspeed_config = config.get("deepspeed_config", None)
576
+ if deepspeed_config:
577
+ logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
578
+
579
+ # Create a temporary DeepSpeed config file
580
+ ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
581
+
582
+ # Update DeepSpeed config with dynamic values
583
+ if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
584
+ deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
585
+
586
+ if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
587
+ deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
588
+
589
+ # Write the DeepSpeed config to a file
590
+ with open(ds_config_path, 'w') as f:
591
+ json.dump(deepspeed_config, f, indent=2)
592
+
593
+ logger.info(f"Created DeepSpeed config at {ds_config_path}")
594
+ logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
595
+
596
+ # Enable CPU offloading if configured
597
+ if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
598
+ logger.info("DeepSpeed CPU offloading enabled for optimizer states")
599
+
600
+ # Set using_deepspeed flag
601
+ using_deepspeed = True
602
+ else:
603
+ logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
604
+ using_deepspeed = False
605
 
606
  # Initialize model with our safe loading function
607
  logger.info("Loading pre-quantized model with eager attention")
 
649
  per_device_train_batch_size = 4 if gpu_count >= 4 else 2
650
  logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  training_args_dict = {
653
  "output_dir": output_dir,
654
  "num_train_epochs": training_config.get("num_train_epochs", 3),