Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py  CHANGED  (+68 -91)
@@ -28,46 +28,32 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
-#
-
-
-
-
-
-    import subprocess
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "deepspeed"])
-    import deepspeed
-    logger.info("DeepSpeed installed successfully")
-except Exception as e:
-    logger.error(f"Failed to install DeepSpeed: {e}")
-    logger.error("Will continue without DeepSpeed")
+# Set DeepSpeed environment variables to disable MPI
+os.environ["MASTER_ADDR"] = "localhost"
+os.environ["MASTER_PORT"] = "9994"
+os.environ["RANK"] = "0"
+os.environ["LOCAL_RANK"] = "0"
+os.environ["WORLD_SIZE"] = "1"
 
-#
+# Try to import deepspeed, install mpi4py if needed
 try:
-    import
-
-
-
-    try:
+    import deepspeed
+except ImportError as e:
+    if "mpi4py" in str(e):
+        logger.warning("mpi4py not found, installing...")
         import subprocess
-
-
-
-
-
-
-
-
-
-
-
-        subprocess.check_call([sys.executable, "-m", "pip", "install", "mpi4py>=3.1.4"])
-        import mpi4py
-        logger.info(f"mpi4py installed successfully (version: {mpi4py.__version__})")
-    except Exception as e:
-        logger.error(f"Failed to install mpi4py: {e}")
-        logger.error("DeepSpeed may not work correctly without mpi4py")
+        try:
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "mpi4py"])
+            import deepspeed
+            logger.info("Successfully installed mpi4py and imported deepspeed")
+        except Exception as install_error:
+            logger.warning(f"Failed to install mpi4py: {install_error}")
+            logger.warning("Continuing without DeepSpeed MPI support")
+            # Set a flag to disable DeepSpeed later
+            os.environ["DISABLE_DEEPSPEED_MPI"] = "1"
+    else:
+        logger.error(f"Failed to import deepspeed: {e}")
+        raise
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
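Note: the replacement block above no longer pip-installs DeepSpeed unconditionally at import time; it imports it, and only if the import fails because of a missing mpi4py does it install that one dependency and retry. A minimal standalone sketch of this install-on-demand pattern, assuming nothing beyond the standard library (the helper name import_or_install is illustrative, not part of the repository):

import importlib
import subprocess
import sys

def import_or_install(module_name, pip_name=None):
    """Import a module, installing it into the running interpreter on ImportError."""
    try:
        return importlib.import_module(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or module_name])
        importlib.invalidate_caches()  # make the freshly installed package importable
        return importlib.import_module(module_name)

# e.g. mpi4py = import_or_install("mpi4py")  # mirrors the fallback above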
@@ -616,53 +602,42 @@ def train(config_path, dataset_name, output_dir):
     per_device_train_batch_size = 4 if gpu_count >= 4 else 2
     logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
-    # Check if DeepSpeed config is available
+    # Check if DeepSpeed config is available and if MPI is disabled
     deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config:
+    if deepspeed_config and os.environ.get("DISABLE_DEEPSPEED_MPI", "0") != "1":
         logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
 
-        #
-
-
-
-
-
-            logger.error("mpi4py is required for DeepSpeed but not available")
-            logger.error("Will continue without DeepSpeed")
+        # Create a temporary DeepSpeed config file
+        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
+
+        # Update DeepSpeed config with dynamic values
+        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
+            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            using_deepspeed = True
-        except Exception as e:
-            logger.error(f"Failed to initialize DeepSpeed: {e}")
-            logger.error("Will continue without DeepSpeed")
-            ds_config_path = None
-            using_deepspeed = False
-        else:
-            ds_config_path = None
-            using_deepspeed = False
+        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
+            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
+
+        # Ensure communication backend is set to avoid MPI
+        if "communication_data_type" not in deepspeed_config:
+            deepspeed_config["communication_data_type"] = "fp16"
+
+        # Write the DeepSpeed config to a file
+        with open(ds_config_path, 'w') as f:
+            json.dump(deepspeed_config, f, indent=2)
+
+        logger.info(f"Created DeepSpeed config at {ds_config_path}")
+        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
+
+        # Enable CPU offloading if configured
+        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
+            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
+
+        # Set using_deepspeed flag
+        using_deepspeed = True
+    elif os.environ.get("DISABLE_DEEPSPEED_MPI", "0") == "1":
+        logger.warning("DeepSpeed MPI support is disabled due to missing mpi4py. Continuing without DeepSpeed.")
+        ds_config_path = None
+        using_deepspeed = False
     else:
         logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
         ds_config_path = None
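Note: the block above resolves DeepSpeed's "auto" placeholders before writing the temporary JSON config. A self-contained sketch with illustrative numbers (the dictionary below is an example, not the repository's actual deepspeed_config):

import json

per_device_train_batch_size = 2  # example value
gpu_count = 4                    # example value
deepspeed_config = {             # illustrative config for this sketch only
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "zero_optimization": {"stage": 2, "offload_optimizer": {"device": "cpu"}},
}

if deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
    deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
if deepspeed_config.get("train_batch_size") == "auto":
    deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count

print(json.dumps(deepspeed_config, indent=2))
# train_micro_batch_size_per_gpu becomes 2; train_batch_size becomes 8 (2 per GPU x 4 GPUs)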
@@ -709,7 +684,6 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    # Prepare training arguments
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),
@@ -734,20 +708,23 @@ def train(config_path, dataset_name, output_dir):
         "dataloader_num_workers": 4, # Use multiple workers for data loading
     }
 
-    # Add DeepSpeed config if available and
-    if using_deepspeed and ds_config_path
-        logger.info("
+    # Add DeepSpeed config path if available and enabled
+    if using_deepspeed and ds_config_path:
+        logger.info("Adding DeepSpeed configuration to training arguments")
         training_args_dict["deepspeed"] = ds_config_path
     else:
-        logger.info("
-        # If DeepSpeed is not available, ensure we're still using distributed training efficiently
-        if gpu_count > 1:
-            logger.info(f"Using standard distributed training with {gpu_count} GPUs")
-            training_args_dict["local_rank"] = int(os.environ.get("LOCAL_RANK", -1))
-            training_args_dict["gradient_checkpointing"] = True
+        logger.info("DeepSpeed is disabled - using standard distributed training")
 
     # Create TrainingArguments with validated parameters
-
+    try:
+        training_args = TrainingArguments(**training_args_dict)
+    except Exception as e:
+        logger.error(f"Failed to create training arguments with DeepSpeed: {e}")
+        if "deepspeed" in training_args_dict:
+            logger.warning("Removing DeepSpeed configuration and trying again")
+            del training_args_dict["deepspeed"]
+            training_args = TrainingArguments(**training_args_dict)
+            using_deepspeed = False
 
     # Create trainer with pre-tokenized collator
     trainer = Trainer(
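Note: TrainingArguments accepts a deepspeed argument pointing at a JSON config file, which is how the path written earlier switches the Trainer over to DeepSpeed. A hedged usage sketch with illustrative values (the paths and sizes are examples, not the repository's settings):

from transformers import TrainingArguments

training_args_dict = {
    "output_dir": "output",                      # example path
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "deepspeed": "output/ds_config_temp.json",   # omit this key to train without DeepSpeed
}

try:
    training_args = TrainingArguments(**training_args_dict)
except Exception:
    # Same recovery as the hunk above: drop the DeepSpeed entry and retry without it.
    training_args_dict.pop("deepspeed", None)
    training_args = TrainingArguments(**training_args_dict)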