Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py (CHANGED, +31 -8)
@@ -2,17 +2,20 @@
 # -*- coding: utf-8 -*-
 
 """
-Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-bnb-4bit using unsloth
+Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
+OPTIMIZED FOR L40S GPU (48GB VRAM)
 """
 
 # Set critical environment variables before any imports
 import os
-# Configure PyTorch memory allocator for better memory management with
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# Configure PyTorch memory allocator for better memory management with L40S GPU
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["XFORMERS_DISABLED"] = "1"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+# L40S-specific CUDA optimization
+os.environ["CUDA_AUTO_BOOST"] = "1"
 
 import json
 import logging
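Note on ordering: PYTORCH_CUDA_ALLOC_CONF is read when PyTorch first initializes the CUDA caching allocator, which is why the script sets it before any other imports. A minimal standalone sketch of the pattern (illustrative, not part of this commit):

import os

# Must be in the environment before torch initializes CUDA.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"

import torch  # deliberately imported after the env var is set

if torch.cuda.is_available():
    # The first allocation initializes the allocator with the settings above.
    _ = torch.empty(1024, device="cuda")
    print(torch.cuda.memory_allocated(0))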
@@ -597,10 +600,19 @@ def train(config_path, dataset_name, output_dir):
     # Initialize ds_config_path to None before checking
     ds_config_path = None
 
-    # Optimize batch size for
-
-
-
+    # Optimize batch size for L40S GPU
+    gpu_info = torch.cuda.get_device_properties(0)
+    logger.info(f"GPU Model: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+
+    # For L40S GPU, we can use a larger batch size and shard model across the single GPU
+    if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:  # Check if it's L40S (>40GB VRAM)
+        logger.info("Detected L40S GPU - optimizing for high-memory GPU")
+        per_device_train_batch_size = training_config.get("per_device_train_batch_size", 6)
+        logger.info(f"Using optimized batch size for L40S: {per_device_train_batch_size}")
+    else:
+        # Default to a smaller batch size for other GPUs
+        per_device_train_batch_size = 2
+        logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
     # Check if DeepSpeed config is available and if MPI is disabled
     deepspeed_config = config.get("deepspeed_config", None)
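The batch-size logic added here is a plain VRAM heuristic. Extracted as a self-contained sketch (the function name and fallback values are illustrative, not the script's):

import torch

def pick_per_device_batch_size(large: int = 6, small: int = 2) -> int:
    """Mirror the diff's heuristic: large batch on L40S-class GPUs, small elsewhere."""
    if not torch.cuda.is_available():
        return small
    props = torch.cuda.get_device_properties(0)
    # The commit treats a GPU named L40S, or any card with more than
    # 40 GB of memory, as "high-memory" (the L40S itself has 48 GB).
    if "L40S" in props.name or props.total_memory > 40e9:
        return large
    return small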
@@ -617,6 +629,17 @@ def train(config_path, dataset_name, output_dir):
     if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
         deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
+    # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
+    if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
+        logger.info("Configuring DeepSpeed specifically for L40S GPU")
+        # Adjust ZeRO stage for L40S (48GB VRAM)
+        deepspeed_config["zero_optimization"]["stage"] = 2
+        # Enable CPU offloading for optimizer states to save GPU memory
+        deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
+        # Adjust communication efficiency for single high-end GPU
+        deepspeed_config["reduce_bucket_size"] = 1e9
+        deepspeed_config["allgather_bucket_size"] = 1e9
+
     # Ensure communication backend is set to avoid MPI
     if "communication_data_type" not in deepspeed_config:
         deepspeed_config["communication_data_type"] = "fp16"
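Two things worth flagging in this hunk. First, the assignment to deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] has no guard, so the loaded JSON must already contain that nested dict or the line raises a KeyError. A sketch of the minimal shape assumed (illustrative, not the repo's actual transformers_config.json):

# Minimal shape the L40S branch assumes; keys not shown are unaffected.
deepspeed_config = {
    "train_batch_size": "auto",
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "none"},  # the diff flips this to "cpu"
    },
}

Second, reduce_bucket_size and allgather_bucket_size are written at the top level of the config, while DeepSpeed's ZeRO schema normally nests both under zero_optimization; if DeepSpeed ignores the top-level keys, the bucket-size tuning may have no effect.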
@@ -764,7 +787,7 @@ def train(config_path, dataset_name, output_dir):
     remove_training_marker()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-4bit model (RESEARCH ONLY)")
+    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit model (RESEARCH ONLY)")
     parser.add_argument("--config", type=str, default="transformers_config.json",
                         help="Path to the transformers config JSON file")
     parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
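Based on the defaults visible in this hunk, a typical invocation would be (other flags outside the diff context, such as an output directory, may exist):

python run_cloud_training.py --config transformers_config.json --dataset phi4-cognitive-dataset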