Upload run_cloud_training.py with huggingface_hub
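For reference, a commit with this message is typically produced with the huggingface_hub client. A minimal sketch only; the repo id below is a placeholder, not the actual Space:

from huggingface_hub import HfApi

# Upload the training script to a Space repo (repo_id is a placeholder).
api = HfApi()
api.upload_file(
    path_or_fileobj="run_cloud_training.py",
    path_in_repo="run_cloud_training.py",
    repo_id="your-username/your-space",  # placeholder
    repo_type="space",
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)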
run_cloud_training.py  CHANGED  (+34 -31)
@@ -568,8 +568,40 @@ def train(config_path, dataset_name, output_dir):
     # Force eager attention implementation
     use_flash_attention = False  # Override to force eager implementation
 
-    #
-
+    # Initialize ds_config_path to None before checking
+    ds_config_path = None
+
+    # Check if DeepSpeed config is available
+    deepspeed_config = config.get("deepspeed_config", None)
+    if deepspeed_config:
+        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
+
+        # Create a temporary DeepSpeed config file
+        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
+
+        # Update DeepSpeed config with dynamic values
+        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
+            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
+
+        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
+            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
+
+        # Write the DeepSpeed config to a file
+        with open(ds_config_path, 'w') as f:
+            json.dump(deepspeed_config, f, indent=2)
+
+        logger.info(f"Created DeepSpeed config at {ds_config_path}")
+        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
+
+        # Enable CPU offloading if configured
+        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
+            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
+
+        # Set using_deepspeed flag
+        using_deepspeed = True
+    else:
+        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
+        using_deepspeed = False
 
     # Initialize model with our safe loading function
     logger.info("Loading pre-quantized model with eager attention")
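The added block reads its settings from config["deepspeed_config"] and resolves "auto" batch-size values at runtime. As an illustration only (these are standard DeepSpeed ZeRO config keys, not the repo's actual settings), the section it expects might look like:

deepspeed_config = {
    "train_micro_batch_size_per_gpu": "auto",   # resolved to per_device_train_batch_size
    "train_batch_size": "auto",                 # resolved to per-device size * gpu_count
    "zero_optimization": {
        "stage": 2,                             # logged as the ZeRO stage
        "offload_optimizer": {"device": "cpu"}  # triggers the CPU-offloading log line
    },
}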
@@ -617,35 +649,6 @@ def train(config_path, dataset_name, output_dir):
     per_device_train_batch_size = 4 if gpu_count >= 4 else 2
     logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
-    # Check if DeepSpeed config is available
-    deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config:
-        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
-
-        # Create a temporary DeepSpeed config file
-        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
-
-        # Update DeepSpeed config with dynamic values
-        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
-            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
-
-        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
-            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
-
-        # Write the DeepSpeed config to a file
-        with open(ds_config_path, 'w') as f:
-            json.dump(deepspeed_config, f, indent=2)
-
-        logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
-
-        # Enable CPU offloading if configured
-        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
-            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
-    else:
-        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
-        ds_config_path = None
-
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),
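The diff cuts off after the first entries of training_args_dict, so it does not show how ds_config_path and using_deepspeed are consumed later in the function. A plausible sketch, assuming the dict is ultimately passed to transformers.TrainingArguments, whose deepspeed parameter accepts a path to a DeepSpeed JSON config:

from transformers import TrainingArguments

training_args_dict = {
    "output_dir": output_dir,
    "num_train_epochs": training_config.get("num_train_epochs", 3),
    "per_device_train_batch_size": per_device_train_batch_size,
}
# Only attach the DeepSpeed config when one was found and written to disk.
if using_deepspeed and ds_config_path is not None:
    training_args_dict["deepspeed"] = ds_config_path

training_args = TrainingArguments(**training_args_dict)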