George-API committed · verified
Commit 2281f75 · 1 Parent(s): 18257ed

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py (+31, -8)
run_cloud_training.py CHANGED
@@ -2,17 +2,20 @@
 # -*- coding: utf-8 -*-
 
 """
-Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-bnb-4bit using unsloth
+Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
+OPTIMIZED FOR L40S GPU (48GB VRAM)
 """
 
 # Set critical environment variables before any imports
 import os
-# Configure PyTorch memory allocator for better memory management with multiple GPUs
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# Configure PyTorch memory allocator for better memory management with L40S GPU
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["XFORMERS_DISABLED"] = "1"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+# L40S-specific CUDA optimization
+os.environ["CUDA_AUTO_BOOST"] = "1"
 
 import json
 import logging
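A note on ordering: PYTORCH_CUDA_ALLOC_CONF is only read when PyTorch initializes its CUDA caching allocator, which is why the commit keeps these exports ahead of every other import. A minimal sketch (editor's illustration, not part of the commit) of the required order, with a memory_stats() call to confirm the allocator is live:

import os

# Must be set before torch first touches the GPU; exporting it afterwards is a no-op.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"

import torch  # deliberately imported after the env var is set

if torch.cuda.is_available():
    torch.ones(1, device="cuda")       # first allocation initializes the caching allocator
    stats = torch.cuda.memory_stats()  # standard per-device allocator counters
    print(stats["allocated_bytes.all.current"])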
@@ -597,10 +600,19 @@ def train(config_path, dataset_name, output_dir):
     # Initialize ds_config_path to None before checking
     ds_config_path = None
 
-    # Optimize batch size for multi-GPU setup
-    # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
-    per_device_train_batch_size = 4 if gpu_count >= 4 else 2
-    logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
+    # Optimize batch size for L40S GPU
+    gpu_info = torch.cuda.get_device_properties(0)
+    logger.info(f"GPU Model: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+
+    # For the L40S GPU, we can use a larger batch size on the single high-memory card
+    if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:  # Check if it's L40S (>40GB VRAM)
+        logger.info("Detected L40S GPU - optimizing for high-memory GPU")
+        per_device_train_batch_size = training_config.get("per_device_train_batch_size", 6)
+        logger.info(f"Using optimized batch size for L40S: {per_device_train_batch_size}")
+    else:
+        # Default to a smaller batch size for other GPUs
+        per_device_train_batch_size = 2
+        logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
     # Check if DeepSpeed config is available and if MPI is disabled
     deepspeed_config = config.get("deepspeed_config", None)
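The detection heuristic above, restated in isolation (editor's sketch; the function name is illustrative, not from the script): any CUDA device reporting more than 40 GB of VRAM is treated like an L40S (~48 GB), so a 24 GB card such as the L4 from the removed multi-GPU path falls into the conservative branch. With the L40S default of 6 and the gradient_accumulation_steps default of 4 used by the removed logging line, the effective batch size on one GPU works out to 6 × 4 = 24.

import torch

def pick_batch_size(large: int = 6, small: int = 2) -> int:
    # Mirrors the branch above: prefer the name check, fall back to a VRAM threshold.
    props = torch.cuda.get_device_properties(0)
    if "L40S" in props.name or props.total_memory > 40e9:
        return large
    return small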
@@ -617,6 +629,17 @@ def train(config_path, dataset_name, output_dir):
     if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
         deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
+    # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
+    if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
+        logger.info("Configuring DeepSpeed specifically for L40S GPU")
+        # Adjust ZeRO stage for L40S (48GB VRAM)
+        deepspeed_config["zero_optimization"]["stage"] = 2
+        # Enable CPU offloading for optimizer states to save GPU memory
+        deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
+        # Adjust communication efficiency for single high-end GPU
+        deepspeed_config["reduce_bucket_size"] = 1e9
+        deepspeed_config["allgather_bucket_size"] = 1e9
+
     # Ensure communication backend is set to avoid MPI
     if "communication_data_type" not in deepspeed_config:
         deepspeed_config["communication_data_type"] = "fp16"
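Two caveats with the added block, illustrated by a defensive variant (editor's sketch, not the commit's code): the chained indexing raises a KeyError if the config JSON ships without an offload_optimizer section, and in DeepSpeed's ZeRO configuration reduce_bucket_size and allgather_bucket_size are normally nested under zero_optimization rather than placed at the top level, so the top-level keys written here may simply be ignored.

def apply_l40s_zero_config(deepspeed_config: dict) -> dict:
    # setdefault guards against missing sections instead of assuming they exist.
    zero = deepspeed_config.setdefault("zero_optimization", {})
    zero["stage"] = 2  # ZeRO stage 2: partition optimizer states and gradients
    zero.setdefault("offload_optimizer", {})["device"] = "cpu"
    zero["reduce_bucket_size"] = 1e9       # nested where DeepSpeed reads them
    zero["allgather_bucket_size"] = 1e9
    return deepspeed_config

# Starting from an empty config, every key is created rather than assumed:
print(apply_l40s_zero_config({}))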
@@ -764,7 +787,7 @@ def train(config_path, dataset_name, output_dir):
     remove_training_marker()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-4bit model (RESEARCH ONLY)")
+    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit model (RESEARCH ONLY)")
     parser.add_argument("--config", type=str, default="transformers_config.json",
                         help="Path to the transformers config JSON file")
     parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
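For reference, both flags visible in this hunk carry defaults (any further arguments are truncated in the diff), so a minimal invocation of the script would be: python run_cloud_training.py --config transformers_config.json --dataset phi4-cognitive-dataset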
 