Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+62, -22)
@@ -7,7 +7,13 @@ RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
 """
 
+# Set critical environment variables before any imports
 import os
+# Configure PyTorch memory allocator for better memory management with multiple GPUs
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["XFORMERS_DISABLED"] = "1"
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+
 import json
 import logging
 import argparse
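The point of this hunk is ordering: PYTORCH_CUDA_ALLOC_CONF only takes effect if it is in the environment before PyTorch's CUDA caching allocator initializes, so the commit exports it (together with the xformers and flash-attention switches) before anything else is imported. A minimal sketch of the pattern, not taken from the file:

    import os

    # Must be set before the CUDA caching allocator starts up, i.e. before the
    # first CUDA allocation made after importing torch.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch  # the allocator reads the variable lazily, on first CUDA use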
@@ -21,6 +27,7 @@ from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
+import deepspeed
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -48,9 +55,6 @@ class XFormersBlocker:
 # Add our import blocker to sys.meta_path
 sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
 
-# Configure PyTorch memory allocator for better memory management
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
 # Configure logging first
 logging.basicConfig(
     level=logging.INFO,
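The XFormersBlocker class body sits outside this diff's context; from the surrounding lines it is a meta-path import hook that refuses to load xformers. A rough sketch of what such a blocker typically looks like, an assumption rather than the file's actual implementation:

    import sys
    from importlib.abc import MetaPathFinder

    class XFormersBlocker(MetaPathFinder):
        """Refuse to import xformers so no memory-efficient attention path can load."""

        def __init__(self, original_finder):
            self.original_finder = original_finder  # kept for reference only

        def find_spec(self, fullname, path=None, target=None):
            if fullname.startswith("xformers"):
                # Raising aborts the import instead of deferring to other finders.
                raise ImportError(f"Import of {fullname} is blocked for this training run")
            return None  # let the normal finders handle every other module

    sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))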
@@ -401,20 +405,13 @@ def remove_training_marker():
         os.remove("TRAINING_ACTIVE")
         logger.info("Removed training active marker")
 
-def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
+def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False, use_deepspeed=False):
     """
     Load the model directly with HuggingFace, bypassing Unsloth optimizations
     to avoid memory-efficient attention issues
     """
     logger.info(f"Loading model: {model_name}")
 
-    # Explicitly disable xformers and flash attention in environment
-    os.environ["XFORMERS_DISABLED"] = "1"
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-
-    # Configure PyTorch memory allocator for better memory management with multiple GPUs
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
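The hunk cuts off just as BitsAndBytesConfig( opens; the actual arguments sit outside the diff context. For orientation, a typical 4-bit setup for this kind of quantized LoRA load looks like the following (illustrative values, not necessarily what the file uses):

    import torch
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # 4-bit weight quantization via bitsandbytes
        bnb_4bit_quant_type="nf4",             # NormalFloat4, the common QLoRA default
        bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
        bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    )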
@@ -449,10 +446,19 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-    #
-
-
+    # Set device mapping based on whether DeepSpeed is used
+    # When using DeepSpeed, we should use 'cpu' or 'meta' for initial loading
+    # to avoid OOM issues, as DeepSpeed will handle the device placement
+    if use_deepspeed:
+        logger.info("Using DeepSpeed - loading model initially on CPU to avoid OOM issues")
+        device_map = "cpu"  # Load on CPU first, DeepSpeed will handle distribution
+    else:
+        # Always use auto device mapping for cloud hardware when not using DeepSpeed
+        device_map = "auto"
+
+    logger.info(f"Using device_map={device_map} for initial model loading")
 
+    # Load the model
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         config=config,
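The device_map branch above is the core of this hunk: with DeepSpeed the full model is first materialized on CPU and handed to DeepSpeed for placement, while the non-DeepSpeed path lets Accelerate spread layers across whatever GPUs are visible. Both values are plain strings accepted by from_pretrained; a minimal illustration (model_name is a placeholder):

    # Without DeepSpeed: let Accelerate spread layers across the visible GPUs.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # With DeepSpeed: keep everything on CPU and let the engine move shards later.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")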
@@ -462,7 +468,13 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
         trust_remote_code=True,
         attn_implementation=attn_implementation
     )
+
     logger.info("Model loaded successfully with standard HF loading")
+
+    # If using DeepSpeed, ensure model is properly prepared
+    if use_deepspeed:
+        logger.info("Model loaded on CPU - DeepSpeed will handle device placement during training")
+
     return model, tokenizer
 
 def train(config_path, dataset_name, output_dir):
@@ -471,14 +483,9 @@ def train(config_path, dataset_name, output_dir):
     load_dotenv()
     config = load_config(config_path)
 
-    #
-    os.environ["XFORMERS_DISABLED"] = "1"
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    # Set CUDA launch blocking for better error reporting
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
-    # Configure PyTorch memory allocator for better memory management with multiple GPUs
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
     # Try to unload xformers if it's loaded
     if 'xformers' in sys.modules:
         logger.info("Removing xformers from sys.modules")
@@ -561,10 +568,13 @@ def train(config_path, dataset_name, output_dir):
     # Force eager attention implementation
     use_flash_attention = False  # Override to force eager implementation
 
+    # Check if we're using DeepSpeed
+    using_deepspeed = ds_config_path is not None
+
     # Initialize model with our safe loading function
     logger.info("Loading pre-quantized model with eager attention")
     dtype = torch.float16 if hardware_config.get("fp16", True) else None
-    model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention)
+    model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention, use_deepspeed=using_deepspeed)
 
     # Disable generation capabilities for research training
     logger.info("Disabling generation capabilities - Research training only")
@@ -607,6 +617,35 @@ def train(config_path, dataset_name, output_dir):
     per_device_train_batch_size = 4 if gpu_count >= 4 else 2
     logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
+    # Check if DeepSpeed config is available
+    deepspeed_config = config.get("deepspeed_config", None)
+    if deepspeed_config:
+        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
+
+        # Create a temporary DeepSpeed config file
+        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
+
+        # Update DeepSpeed config with dynamic values
+        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
+            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
+
+        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
+            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
+
+        # Write the DeepSpeed config to a file
+        with open(ds_config_path, 'w') as f:
+            json.dump(deepspeed_config, f, indent=2)
+
+        logger.info(f"Created DeepSpeed config at {ds_config_path}")
+        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
+
+        # Enable CPU offloading if configured
+        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
+            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
+    else:
+        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
+        ds_config_path = None
+
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),
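The block above reads only a few keys from the "deepspeed_config" section of the training config: the two batch-size fields (which may hold the literal string "auto") and zero_optimization with an optional offload_optimizer entry. A config fragment that would exercise every branch of that code could look like this; the values are illustrative, and the real config file is not part of this commit:

    deepspeed_config = {
        "train_micro_batch_size_per_gpu": "auto",  # replaced with per_device_train_batch_size
        "train_batch_size": "auto",                # replaced with per-device size * gpu_count
        "zero_optimization": {
            "stage": 2,                            # logged as the DeepSpeed ZeRO stage
            "offload_optimizer": {
                "device": "cpu"                    # triggers the CPU-offloading log message
            }
        }
    }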
@@ -628,7 +667,8 @@ def train(config_path, dataset_name, output_dir):
         "disable_tqdm": training_config.get("disable_tqdm", False),
         "remove_unused_columns": False,
         "seed": 42,
-        "dataloader_num_workers": 4  # Use multiple workers for data loading
+        "dataloader_num_workers": 4,  # Use multiple workers for data loading
+        "deepspeed": ds_config_path  # Add DeepSpeed config path if available
     }
 
     # Create TrainingArguments with validated parameters
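For context on the new "deepspeed" entry: TrainingArguments accepts either a path to a DeepSpeed JSON file or an already-loaded dict for this field, and None simply leaves the integration off, which is why the fallback branch above sets ds_config_path = None. A minimal sketch of how the written file feeds into the trainer (values are placeholders; the real dict is assembled dynamically above), keeping in mind that DeepSpeed only engages when the script is started under a distributed launcher such as the deepspeed CLI:

    from transformers import TrainingArguments, Trainer

    training_args = TrainingArguments(
        output_dir="output",
        per_device_train_batch_size=2,
        dataloader_num_workers=4,
        deepspeed="output/ds_config_temp.json",  # or None to train without DeepSpeed
    )
    # trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, ...)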