Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py  CHANGED  (+29 -54)
@@ -6,6 +6,7 @@ Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
 OPTIMIZED FOR L40S GPU (48GB VRAM)
+SUPPORTS ENVIRONMENTS WITHOUT MPI
 """
 
 # Set critical environment variables before any imports
@@ -17,6 +18,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"
 
+# Explicitly disable DeepSpeed MPI requirement
+os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"
+
 import json
 import logging
 import argparse
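The new DEEPSPEED_MPI_REQUIRED toggle sits with the other pre-import flags for a reason: environment toggles only take effect if they are exported before the library that reads them is imported, since many libraries consult os.environ at import time. A minimal sketch of that ordering constraint (the consuming import at the bottom is illustrative):

import os

# Export the flags first: libraries that read them at import time will
# otherwise never see these values.
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"

# Only now import the heavy libraries that may consult the flags.
import transformers  # noqa: E402  (deliberately after the environment setup)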
@@ -31,32 +35,35 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
-# …
+# Configure logging first (before any potential errors with imports)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("training.log")
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Set up DeepSpeed without requiring MPI
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"
 
-# Try to import deepspeed, …
+# Try to import deepspeed, with fallback for environments without MPI
+deepspeed_available = False
 try:
     import deepspeed
+    deepspeed_available = True
+    logger.info("DeepSpeed successfully imported")
 except ImportError as e:
-    …
-    if "mpi4py" in str(e):
-        …
-        try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "mpi4py"])
-            import deepspeed
-            logger.info("Successfully installed mpi4py and imported deepspeed")
-        except Exception as install_error:
-            logger.warning(f"Failed to install mpi4py: {install_error}")
-            logger.warning("Continuing without DeepSpeed MPI support")
-            # Set a flag to disable DeepSpeed later
-            os.environ["DISABLE_DEEPSPEED_MPI"] = "1"
-    else:
-        logger.error(f"Failed to import deepspeed: {e}")
-        raise
+    logger.warning(f"Failed to import DeepSpeed: {e}")
+    logger.warning("Will continue without DeepSpeed support")
+    # Set a flag to disable DeepSpeed
+    os.environ["DISABLE_DEEPSPEED"] = "1"
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
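The replaced block tried to pip-install mpi4py from inside the import-error handler and re-raised on failure; the new code reduces this to the standard availability-flag pattern. A condensed, self-contained sketch of that pattern as the script now uses it (the pass branch stands in for the DeepSpeed setup that happens later in the file):

import logging
import os

logger = logging.getLogger(__name__)

# Optional-dependency import: record availability instead of crashing.
deepspeed_available = False
try:
    import deepspeed
    deepspeed_available = True
except ImportError as e:
    logger.warning(f"Failed to import DeepSpeed: {e}")
    os.environ["DISABLE_DEEPSPEED"] = "1"  # later code paths check this flag

# Every DeepSpeed-dependent path now tests the flag rather than assuming
# the import succeeded.
if deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
    pass  # safe to build and use a DeepSpeed config here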
@@ -84,17 +91,6 @@ class XFormersBlocker:
 # Add our import blocker to sys.meta_path
 sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
 
-# Configure logging first
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler("training.log")
-    ]
-)
-logger = logging.getLogger(__name__)
-
 # Make sure torch is installed and available before proceeding
 try:
     logger.info("Importing torch...")
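Moving the logging setup ahead of the DeepSpeed import is more than tidying: in the previous version, logger.warning and logger.error were called inside the import-error handler (old lines 51-58) while logger itself was only created at old line 96, so the fallback path would have died with a NameError instead of degrading gracefully. Configuring logging before anything that can fail makes the handler actually usable.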
@@ -614,9 +610,9 @@ def train(config_path, dataset_name, output_dir):
         per_device_train_batch_size = 2
         logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
-    # Check if DeepSpeed config is available and if …
+    # Check if DeepSpeed config is available and if DeepSpeed is available
     deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config and os.environ.get("DISABLE_DEEPSPEED_MPI", "0") != "1":
+    if deepspeed_config and deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
         logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
 
         # Create a temporary DeepSpeed config file
@@ -629,40 +625,19 @@ def train(config_path, dataset_name, output_dir):
         if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
             deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
-        # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
-        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-            logger.info("Configuring DeepSpeed specifically for L40S GPU")
-            # Adjust ZeRO stage for L40S (48GB VRAM)
-            deepspeed_config["zero_optimization"]["stage"] = 2
-            # Enable CPU offloading for optimizer states to save GPU memory
-            deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
-            # Adjust communication efficiency for single high-end GPU
-            deepspeed_config["reduce_bucket_size"] = 1e9
-            deepspeed_config["allgather_bucket_size"] = 1e9
-
-        # Ensure communication backend is set to avoid MPI
-        if "communication_data_type" not in deepspeed_config:
-            deepspeed_config["communication_data_type"] = "fp16"
-
         # Write the DeepSpeed config to a file
         with open(ds_config_path, 'w') as f:
             json.dump(deepspeed_config, f, indent=2)
 
         logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
-
-        # Enable CPU offloading if configured
-        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
-            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
-
         # Set using_deepspeed flag
         using_deepspeed = True
-    elif os.environ.get("DISABLE_DEEPSPEED_MPI", "0") == "1":
-        logger.warning("DeepSpeed …")
+    elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
+        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
         ds_config_path = None
         using_deepspeed = False
     else:
-        logger.warning("…")
+        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
         ds_config_path = None
         using_deepspeed = False
 
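With the L40S-specific mutation removed, whatever lands in the temporary config file now comes straight from the deepspeed_config block of the training config. That block is not shown in this diff, so the following sketch of a plausible value is illustrative only; it uses just the keys the script demonstrably touches, plus assumed fp16 and ZeRO settings, and a hypothetical output path:

import json

# Illustrative stand-ins; real values come from the Space's training config.
per_device_train_batch_size = 2
gpu_count = 1

deepspeed_config = {
    "train_batch_size": "auto",              # the script resolves this below
    "fp16": {"enabled": True},               # assumed setting
    "zero_optimization": {                   # assumed ZeRO-2 + CPU offload
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
    },
}

# Mirror of the script's "auto" resolution for a single-node run.
if isinstance(deepspeed_config.get("train_batch_size"), str) and \
        deepspeed_config.get("train_batch_size") == "auto":
    deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count

ds_config_path = "ds_config_temp.json"       # hypothetical path
with open(ds_config_path, "w") as f:
    json.dump(deepspeed_config, f, indent=2)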