Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+34 -31)
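The commit title indicates the file was pushed programmatically rather than through the web UI. For context, a minimal sketch of such an upload using huggingface_hub (the Space id below is a placeholder, not taken from this page):

from huggingface_hub import HfApi

api = HfApi()  # authentication assumed via HF_TOKEN or a cached login
api.upload_file(
    path_or_fileobj="run_cloud_training.py",
    path_in_repo="run_cloud_training.py",
    repo_id="user/space-name",  # placeholder Space id
    repo_type="space",
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)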
@@ -568,8 +568,40 @@ def train(config_path, dataset_name, output_dir):
     # Force eager attention implementation
     use_flash_attention = False  # Override to force eager implementation
 
-    #
-
+    # Initialize ds_config_path to None before checking
+    ds_config_path = None
+
+    # Check if DeepSpeed config is available
+    deepspeed_config = config.get("deepspeed_config", None)
+    if deepspeed_config:
+        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
+
+        # Create a temporary DeepSpeed config file
+        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
+
+        # Update DeepSpeed config with dynamic values
+        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
+            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
+
+        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
+            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
+
+        # Write the DeepSpeed config to a file
+        with open(ds_config_path, 'w') as f:
+            json.dump(deepspeed_config, f, indent=2)
+
+        logger.info(f"Created DeepSpeed config at {ds_config_path}")
+        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
+
+        # Enable CPU offloading if configured
+        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
+            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
+
+        # Set using_deepspeed flag
+        using_deepspeed = True
+    else:
+        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
+        using_deepspeed = False
 
     # Initialize model with our safe loading function
     logger.info("Loading pre-quantized model with eager attention")
@@ -617,35 +649,6 @@ def train(config_path, dataset_name, output_dir):
     per_device_train_batch_size = 4 if gpu_count >= 4 else 2
     logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
-    # Check if DeepSpeed config is available
-    deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config:
-        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
-
-        # Create a temporary DeepSpeed config file
-        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
-
-        # Update DeepSpeed config with dynamic values
-        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
-            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
-
-        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
-            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
-
-        # Write the DeepSpeed config to a file
-        with open(ds_config_path, 'w') as f:
-            json.dump(deepspeed_config, f, indent=2)
-
-        logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
-
-        # Enable CPU offloading if configured
-        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
-            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
-    else:
-        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
-        ds_config_path = None
-
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),