Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+62, -22)
@@ -7,7 +7,13 @@ RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
 """
 
+# Set critical environment variables before any imports
 import os
+# Configure PyTorch memory allocator for better memory management with multiple GPUs
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["XFORMERS_DISABLED"] = "1"
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+
 import json
 import logging
 import argparse
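The point of this hunk is ordering: PYTORCH_CUDA_ALLOC_CONF only takes effect if it is in the environment before PyTorch's CUDA caching allocator initializes, so the commit exports it (together with the xformers and flash-attention switches) before anything else is imported. A minimal sketch of the pattern, not taken from the file:

    import os

    # Must be set before the CUDA caching allocator starts up, i.e. before the
    # first CUDA allocation made after importing torch.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch  # the allocator reads the variable lazily, on first CUDA use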
@@ -21,6 +27,7 @@ from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
+import deepspeed
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -48,9 +55,6 @@ class XFormersBlocker:
 # Add our import blocker to sys.meta_path
 sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
 
-# Configure PyTorch memory allocator for better memory management
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
 # Configure logging first
 logging.basicConfig(
     level=logging.INFO,
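The XFormersBlocker class body sits outside this diff's context; from the surrounding lines it is a meta-path import hook that refuses to load xformers. A rough sketch of what such a blocker typically looks like, an assumption rather than the file's actual implementation:

    import sys
    from importlib.abc import MetaPathFinder

    class XFormersBlocker(MetaPathFinder):
        """Refuse to import xformers so no memory-efficient attention path can load."""

        def __init__(self, original_finder):
            self.original_finder = original_finder  # kept for reference only

        def find_spec(self, fullname, path=None, target=None):
            if fullname.startswith("xformers"):
                # Raising aborts the import instead of deferring to other finders.
                raise ImportError(f"Import of {fullname} is blocked for this training run")
            return None  # let the normal finders handle every other module

    sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))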
@@ -401,20 +405,13 @@ def remove_training_marker():
         os.remove("TRAINING_ACTIVE")
         logger.info("Removed training active marker")
 
-def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
+def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False, use_deepspeed=False):
     """
     Load the model directly with HuggingFace, bypassing Unsloth optimizations
     to avoid memory-efficient attention issues
     """
     logger.info(f"Loading model: {model_name}")
 
-    # Explicitly disable xformers and flash attention in environment
-    os.environ["XFORMERS_DISABLED"] = "1"
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
-
-    # Configure PyTorch memory allocator for better memory management with multiple GPUs
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
     # Create BitsAndBytesConfig for 4-bit quantization
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
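The hunk cuts off just as BitsAndBytesConfig( opens; the actual arguments sit outside the diff context. For orientation, a typical 4-bit setup for this kind of quantized LoRA load looks like the following (illustrative values, not necessarily what the file uses):

    import torch
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # 4-bit weight quantization via bitsandbytes
        bnb_4bit_quant_type="nf4",             # NormalFloat4, the common QLoRA default
        bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
        bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    )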
@@ -449,10 +446,19 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-    #
-
-
+    # Set device mapping based on whether DeepSpeed is used
+    # When using DeepSpeed, we should use 'cpu' or 'meta' for initial loading
+    # to avoid OOM issues, as DeepSpeed will handle the device placement
+    if use_deepspeed:
+        logger.info("Using DeepSpeed - loading model initially on CPU to avoid OOM issues")
+        device_map = "cpu"  # Load on CPU first, DeepSpeed will handle distribution
+    else:
+        # Always use auto device mapping for cloud hardware when not using DeepSpeed
+        device_map = "auto"
+
+    logger.info(f"Using device_map={device_map} for initial model loading")
 
+    # Load the model
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         config=config,
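The device_map branch above is the core of this hunk: with DeepSpeed the full model is first materialized on CPU and handed to DeepSpeed for placement, while the non-DeepSpeed path lets Accelerate spread layers across whatever GPUs are visible. Both values are plain strings accepted by from_pretrained; a minimal illustration (model_name is a placeholder):

    # Without DeepSpeed: let Accelerate spread layers across the visible GPUs.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # With DeepSpeed: keep everything on CPU and let the engine move shards later.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")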
@@ -462,7 +468,13 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
         trust_remote_code=True,
         attn_implementation=attn_implementation
     )
+
     logger.info("Model loaded successfully with standard HF loading")
+
+    # If using DeepSpeed, ensure model is properly prepared
+    if use_deepspeed:
+        logger.info("Model loaded on CPU - DeepSpeed will handle device placement during training")
+
     return model, tokenizer
 
 def train(config_path, dataset_name, output_dir):
@@ -471,14 +483,9 @@ def train(config_path, dataset_name, output_dir):
     load_dotenv()
     config = load_config(config_path)
 
-    #
-    os.environ["XFORMERS_DISABLED"] = "1"
-    os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+    # Set CUDA launch blocking for better error reporting
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
-    # Configure PyTorch memory allocator for better memory management with multiple GPUs
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
     # Try to unload xformers if it's loaded
     if 'xformers' in sys.modules:
         logger.info("Removing xformers from sys.modules")
@@ -561,10 +568,13 @@ def train(config_path, dataset_name, output_dir):
     # Force eager attention implementation
     use_flash_attention = False  # Override to force eager implementation
 
+    # Check if we're using DeepSpeed
+    using_deepspeed = ds_config_path is not None
+
     # Initialize model with our safe loading function
     logger.info("Loading pre-quantized model with eager attention")
     dtype = torch.float16 if hardware_config.get("fp16", True) else None
-    model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention)
+    model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention, use_deepspeed=using_deepspeed)
 
     # Disable generation capabilities for research training
     logger.info("Disabling generation capabilities - Research training only")
@@ -607,6 +617,35 @@ def train(config_path, dataset_name, output_dir):
     per_device_train_batch_size = 4 if gpu_count >= 4 else 2
     logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
 
+    # Check if DeepSpeed config is available
+    deepspeed_config = config.get("deepspeed_config", None)
+    if deepspeed_config:
+        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
+
+        # Create a temporary DeepSpeed config file
+        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
+
+        # Update DeepSpeed config with dynamic values
+        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
+            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
+
+        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
+            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
+
+        # Write the DeepSpeed config to a file
+        with open(ds_config_path, 'w') as f:
+            json.dump(deepspeed_config, f, indent=2)
+
+        logger.info(f"Created DeepSpeed config at {ds_config_path}")
+        logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
+
+        # Enable CPU offloading if configured
+        if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
+            logger.info("DeepSpeed CPU offloading enabled for optimizer states")
+    else:
+        logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
+        ds_config_path = None
+
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),
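The block above reads only a few keys from the "deepspeed_config" section of the training config: the two batch-size fields (which may hold the literal string "auto") and zero_optimization with an optional offload_optimizer entry. A config fragment that would exercise every branch of that code could look like this; the values are illustrative, and the real config file is not part of this commit:

    deepspeed_config = {
        "train_micro_batch_size_per_gpu": "auto",  # replaced with per_device_train_batch_size
        "train_batch_size": "auto",                # replaced with per-device size * gpu_count
        "zero_optimization": {
            "stage": 2,                            # logged as the DeepSpeed ZeRO stage
            "offload_optimizer": {
                "device": "cpu"                    # triggers the CPU-offloading log message
            }
        }
    }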
@@ -628,7 +667,8 @@ def train(config_path, dataset_name, output_dir):
         "disable_tqdm": training_config.get("disable_tqdm", False),
         "remove_unused_columns": False,
         "seed": 42,
-        "dataloader_num_workers": 4  # Use multiple workers for data loading
+        "dataloader_num_workers": 4,  # Use multiple workers for data loading
+        "deepspeed": ds_config_path  # Add DeepSpeed config path if available
     }
 
     # Create TrainingArguments with validated parameters
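For context on the new "deepspeed" entry: TrainingArguments accepts either a path to a DeepSpeed JSON file or an already-loaded dict for this field, and None simply leaves the integration off, which is why the fallback branch above sets ds_config_path = None. A minimal sketch of how the written file feeds into the trainer (values are placeholders; the real dict is assembled dynamically above), keeping in mind that DeepSpeed only engages when the script is started under a distributed launcher such as the deepspeed CLI:

    from transformers import TrainingArguments, Trainer

    training_args = TrainingArguments(
        output_dir="output",
        per_device_train_batch_size=2,
        dataloader_num_workers=4,
        deepspeed="output/ds_config_temp.json",  # or None to train without DeepSpeed
    )
    # trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, ...)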