Upload run_cloud_training.py with huggingface_hub

run_cloud_training.py  CHANGED  (+29 -54)
@@ -6,6 +6,7 @@ Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
 OPTIMIZED FOR L40S GPU (48GB VRAM)
+SUPPORTS ENVIRONMENTS WITHOUT MPI
 """
 
 # Set critical environment variables before any imports
@@ -17,6 +18,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"
 
+# Explicitly disable DeepSpeed MPI requirement
+os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"
+
 import json
 import logging
 import argparse
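The new DEEPSPEED_MPI_REQUIRED toggle sits with the other pre-import flags for a reason: environment toggles only take effect if they are exported before the library that reads them is imported, since many libraries consult os.environ at import time. A minimal sketch of that ordering constraint (the consuming import at the bottom is illustrative):

import os

# Export the flags first: libraries that read them at import time will
# otherwise never see these values.
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"

# Only now import the heavy libraries that may consult the flags.
import transformers  # noqa: E402  (deliberately after the environment setup)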
@@ -31,32 +35,35 @@ from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
-# …
+# Configure logging first (before any potential errors with imports)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("training.log")
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Set up DeepSpeed without requiring MPI
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"
 
-# Try to import deepspeed, …
+# Try to import deepspeed, with fallback for environments without MPI
+deepspeed_available = False
 try:
     import deepspeed
+    deepspeed_available = True
+    logger.info("DeepSpeed successfully imported")
 except ImportError as e:
-    …
-    if "mpi4py" in str(e):
-        …
-        try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "mpi4py"])
-            import deepspeed
-            logger.info("Successfully installed mpi4py and imported deepspeed")
-        except Exception as install_error:
-            logger.warning(f"Failed to install mpi4py: {install_error}")
-            logger.warning("Continuing without DeepSpeed MPI support")
-            # Set a flag to disable DeepSpeed later
-            os.environ["DISABLE_DEEPSPEED_MPI"] = "1"
-    else:
-        logger.error(f"Failed to import deepspeed: {e}")
-        raise
+    logger.warning(f"Failed to import DeepSpeed: {e}")
+    logger.warning("Will continue without DeepSpeed support")
+    # Set a flag to disable DeepSpeed
+    os.environ["DISABLE_DEEPSPEED"] = "1"
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
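The replaced block tried to pip-install mpi4py from inside the import-error handler and re-raised on failure; the new code reduces this to the standard availability-flag pattern. A condensed, self-contained sketch of that pattern as the script now uses it (the pass branch stands in for the DeepSpeed setup that happens later in the file):

import logging
import os

logger = logging.getLogger(__name__)

# Optional-dependency import: record availability instead of crashing.
deepspeed_available = False
try:
    import deepspeed
    deepspeed_available = True
except ImportError as e:
    logger.warning(f"Failed to import DeepSpeed: {e}")
    os.environ["DISABLE_DEEPSPEED"] = "1"  # later code paths check this flag

# Every DeepSpeed-dependent path now tests the flag rather than assuming
# the import succeeded.
if deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
    pass  # safe to build and use a DeepSpeed config here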
@@ -84,17 +91,6 @@ class XFormersBlocker:
 # Add our import blocker to sys.meta_path
 sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
 
-# Configure logging first
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler("training.log")
-    ]
-)
-logger = logging.getLogger(__name__)
-
 # Make sure torch is installed and available before proceeding
 try:
     logger.info("Importing torch...")
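Moving the logging setup ahead of the DeepSpeed import is more than tidying: in the previous version, logger.warning and logger.error were called inside the import-error handler (old lines 51-58) while logger itself was only created at old line 96, so the fallback path would have died with a NameError instead of degrading gracefully. Configuring logging before anything that can fail makes the handler actually usable.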
@@ -614,9 +610,9 @@ def train(config_path, dataset_name, output_dir):
         per_device_train_batch_size = 2
         logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
-    # Check if DeepSpeed config is available and if …
+    # Check if DeepSpeed config is available and if DeepSpeed is available
     deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config and os.environ.get("DISABLE_DEEPSPEED_MPI", "0") != "1":
+    if deepspeed_config and deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
         logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
 
         # Create a temporary DeepSpeed config file
@@ -629,40 +625,19 @@ def train(config_path, dataset_name, output_dir):
         if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
             deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
-        # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
-        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-            logger.info("Configuring DeepSpeed specifically for L40S GPU")
-            # Adjust ZeRO stage for L40S (48GB VRAM)
-            deepspeed_config["zero_optimization"]["stage"] = 2
-            # Enable CPU offloading for optimizer states to save GPU memory
-            deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
-            # Adjust communication efficiency for single high-end GPU
-            deepspeed_config["reduce_bucket_size"] = 1e9
-            deepspeed_config["allgather_bucket_size"] = 1e9
-
-        # Ensure communication backend is set to avoid MPI
-        if "communication_data_type" not in deepspeed_config:
-            deepspeed_config["communication_data_type"] = "fp16"
-
         # Write the DeepSpeed config to a file
         with open(ds_config_path, 'w') as f:
             json.dump(deepspeed_config, f, indent=2)
 
         logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
-
-        # Enable CPU offloading if configured
-        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
-            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
-
         # Set using_deepspeed flag
         using_deepspeed = True
-    elif os.environ.get("DISABLE_DEEPSPEED_MPI", "0") == "1":
-        logger.warning("DeepSpeed …")
+    elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
+        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
         ds_config_path = None
         using_deepspeed = False
     else:
-        logger.warning("…")
+        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
         ds_config_path = None
         using_deepspeed = False
 
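With the L40S-specific mutation removed, whatever lands in the temporary config file now comes straight from the deepspeed_config block of the training config. That block is not shown in this diff, so the following sketch of a plausible value is illustrative only; it uses just the keys the script demonstrably touches, plus assumed fp16 and ZeRO settings, and a hypothetical output path:

import json

# Illustrative stand-ins; real values come from the Space's training config.
per_device_train_batch_size = 2
gpu_count = 1

deepspeed_config = {
    "train_batch_size": "auto",              # the script resolves this below
    "fp16": {"enabled": True},               # assumed setting
    "zero_optimization": {                   # assumed ZeRO-2 + CPU offload
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
    },
}

# Mirror of the script's "auto" resolution for a single-node run.
if isinstance(deepspeed_config.get("train_batch_size"), str) and \
        deepspeed_config.get("train_batch_size") == "auto":
    deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count

ds_config_path = "ds_config_temp.json"       # hypothetical path
with open(ds_config_path, "w") as f:
    json.dump(deepspeed_config, f, indent=2)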