Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files- run_cloud_training.py +62 -22
run_cloud_training.py
CHANGED
@@ -7,7 +7,13 @@ RESEARCH TRAINING PHASE ONLY - No output generation
|
|
7 |
WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
|
8 |
"""
|
9 |
|
|
|
10 |
import os
|
|
|
|
|
|
|
|
|
|
|
11 |
import json
|
12 |
import logging
|
13 |
import argparse
|
@@ -21,6 +27,7 @@ from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelFor
|
|
21 |
from transformers.data.data_collator import DataCollatorMixin
|
22 |
from peft import LoraConfig
|
23 |
from unsloth import FastLanguageModel
|
|
|
24 |
|
25 |
# Disable all attention optimizations that might cause issues
|
26 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
@@ -48,9 +55,6 @@ class XFormersBlocker:
|
|
48 |
# Add our import blocker to sys.meta_path
|
49 |
sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
|
50 |
|
51 |
-
# Configure PyTorch memory allocator for better memory management
|
52 |
-
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
53 |
-
|
54 |
# Configure logging first
|
55 |
logging.basicConfig(
|
56 |
level=logging.INFO,
|
@@ -401,20 +405,13 @@ def remove_training_marker():
|
|
401 |
os.remove("TRAINING_ACTIVE")
|
402 |
logger.info("Removed training active marker")
|
403 |
|
404 |
-
def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
|
405 |
"""
|
406 |
Load the model directly with HuggingFace, bypassing Unsloth optimizations
|
407 |
to avoid memory-efficient attention issues
|
408 |
"""
|
409 |
logger.info(f"Loading model: {model_name}")
|
410 |
|
411 |
-
# Explicitly disable xformers and flash attention in environment
|
412 |
-
os.environ["XFORMERS_DISABLED"] = "1"
|
413 |
-
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
414 |
-
|
415 |
-
# Configure PyTorch memory allocator for better memory management with multiple GPUs
|
416 |
-
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
417 |
-
|
418 |
# Create BitsAndBytesConfig for 4-bit quantization
|
419 |
from transformers import BitsAndBytesConfig
|
420 |
bnb_config = BitsAndBytesConfig(
|
@@ -449,10 +446,19 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
|
|
449 |
|
450 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
451 |
|
452 |
-
#
|
453 |
-
|
454 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
455 |
|
|
|
456 |
model = AutoModelForCausalLM.from_pretrained(
|
457 |
model_name,
|
458 |
config=config,
|
@@ -462,7 +468,13 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
|
|
462 |
trust_remote_code=True,
|
463 |
attn_implementation=attn_implementation
|
464 |
)
|
|
|
465 |
logger.info("Model loaded successfully with standard HF loading")
|
|
|
|
|
|
|
|
|
|
|
466 |
return model, tokenizer
|
467 |
|
468 |
def train(config_path, dataset_name, output_dir):
|
@@ -471,14 +483,9 @@ def train(config_path, dataset_name, output_dir):
|
|
471 |
load_dotenv()
|
472 |
config = load_config(config_path)
|
473 |
|
474 |
-
#
|
475 |
-
os.environ["XFORMERS_DISABLED"] = "1"
|
476 |
-
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
477 |
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
478 |
|
479 |
-
# Configure PyTorch memory allocator for better memory management with multiple GPUs
|
480 |
-
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
481 |
-
|
482 |
# Try to unload xformers if it's loaded
|
483 |
if 'xformers' in sys.modules:
|
484 |
logger.info("Removing xformers from sys.modules")
|
@@ -561,10 +568,13 @@ def train(config_path, dataset_name, output_dir):
|
|
561 |
# Force eager attention implementation
|
562 |
use_flash_attention = False # Override to force eager implementation
|
563 |
|
|
|
|
|
|
|
564 |
# Initialize model with our safe loading function
|
565 |
logger.info("Loading pre-quantized model with eager attention")
|
566 |
dtype = torch.float16 if hardware_config.get("fp16", True) else None
|
567 |
-
model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention)
|
568 |
|
569 |
# Disable generation capabilities for research training
|
570 |
logger.info("Disabling generation capabilities - Research training only")
|
@@ -607,6 +617,35 @@ def train(config_path, dataset_name, output_dir):
|
|
607 |
per_device_train_batch_size = 4 if gpu_count >= 4 else 2
|
608 |
logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
|
609 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
610 |
training_args_dict = {
|
611 |
"output_dir": output_dir,
|
612 |
"num_train_epochs": training_config.get("num_train_epochs", 3),
|
@@ -628,7 +667,8 @@ def train(config_path, dataset_name, output_dir):
|
|
628 |
"disable_tqdm": training_config.get("disable_tqdm", False),
|
629 |
"remove_unused_columns": False,
|
630 |
"seed": 42,
|
631 |
-
"dataloader_num_workers": 4 # Use multiple workers for data loading
|
|
|
632 |
}
|
633 |
|
634 |
# Create TrainingArguments with validated parameters
|
|
|
7 |
WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
|
8 |
"""
|
9 |
|
10 |
+
# Set critical environment variables before any imports
|
11 |
import os
|
12 |
+
# Configure PyTorch memory allocator for better memory management with multiple GPUs
|
13 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
14 |
+
os.environ["XFORMERS_DISABLED"] = "1"
|
15 |
+
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
16 |
+
|
17 |
import json
|
18 |
import logging
|
19 |
import argparse
|
|
|
27 |
from transformers.data.data_collator import DataCollatorMixin
|
28 |
from peft import LoraConfig
|
29 |
from unsloth import FastLanguageModel
|
30 |
+
import deepspeed
|
31 |
|
32 |
# Disable all attention optimizations that might cause issues
|
33 |
os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
|
|
|
55 |
# Add our import blocker to sys.meta_path
|
56 |
sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
|
57 |
|
|
|
|
|
|
|
58 |
# Configure logging first
|
59 |
logging.basicConfig(
|
60 |
level=logging.INFO,
|
|
|
405 |
os.remove("TRAINING_ACTIVE")
|
406 |
logger.info("Removed training active marker")
|
407 |
|
408 |
+
def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False, use_deepspeed=False):
|
409 |
"""
|
410 |
Load the model directly with HuggingFace, bypassing Unsloth optimizations
|
411 |
to avoid memory-efficient attention issues
|
412 |
"""
|
413 |
logger.info(f"Loading model: {model_name}")
|
414 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
415 |
# Create BitsAndBytesConfig for 4-bit quantization
|
416 |
from transformers import BitsAndBytesConfig
|
417 |
bnb_config = BitsAndBytesConfig(
|
|
|
446 |
|
447 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
448 |
|
449 |
+
# Set device mapping based on whether DeepSpeed is used
|
450 |
+
# When using DeepSpeed, the initial load targets 'cpu' (only "cpu" is set below; 'meta' is not used)
|
451 |
+
# to avoid OOM issues, as DeepSpeed will handle the device placement
|
452 |
+
if use_deepspeed:
|
453 |
+
logger.info("Using DeepSpeed - loading model initially on CPU to avoid OOM issues")
|
454 |
+
device_map = "cpu" # Load on CPU first, DeepSpeed will handle distribution
|
455 |
+
else:
|
456 |
+
# Always use auto device mapping for cloud hardware when not using DeepSpeed
|
457 |
+
device_map = "auto"
|
458 |
+
|
459 |
+
logger.info(f"Using device_map={device_map} for initial model loading")
|
460 |
|
461 |
+
# Load the model
|
462 |
model = AutoModelForCausalLM.from_pretrained(
|
463 |
model_name,
|
464 |
config=config,
|
|
|
468 |
trust_remote_code=True,
|
469 |
attn_implementation=attn_implementation
|
470 |
)
|
471 |
+
|
472 |
logger.info("Model loaded successfully with standard HF loading")
|
473 |
+
|
474 |
+
# If using DeepSpeed, ensure model is properly prepared
|
475 |
+
if use_deepspeed:
|
476 |
+
logger.info("Model loaded on CPU - DeepSpeed will handle device placement during training")
|
477 |
+
|
478 |
return model, tokenizer
|
479 |
|
480 |
def train(config_path, dataset_name, output_dir):
|
|
|
483 |
load_dotenv()
|
484 |
config = load_config(config_path)
|
485 |
|
486 |
+
# Make CUDA kernel launches synchronous so errors surface at the failing call (debugging aid; slows training)
|
|
|
|
|
487 |
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
488 |
|
|
|
|
|
|
|
489 |
# Try to unload xformers if it's loaded
|
490 |
if 'xformers' in sys.modules:
|
491 |
logger.info("Removing xformers from sys.modules")
|
|
|
568 |
# Force eager attention implementation
|
569 |
use_flash_attention = False # Override to force eager implementation
|
570 |
|
571 |
+
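# Check if we're using DeepSpeed (NOTE(review): in the visible hunks ds_config_path is first assigned later in train(), in the DeepSpeed config block — confirm it is defined before this line, otherwise this read raises UnboundLocalError)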
|
572 |
+
using_deepspeed = ds_config_path is not None
|
573 |
+
|
574 |
# Initialize model with our safe loading function
|
575 |
logger.info("Loading pre-quantized model with eager attention")
|
576 |
dtype = torch.float16 if hardware_config.get("fp16", True) else None
|
577 |
+
model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention, use_deepspeed=using_deepspeed)
|
578 |
|
579 |
# Disable generation capabilities for research training
|
580 |
logger.info("Disabling generation capabilities - Research training only")
|
|
|
617 |
per_device_train_batch_size = 4 if gpu_count >= 4 else 2
|
618 |
logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
|
619 |
|
620 |
+
# Check if DeepSpeed config is available
|
621 |
+
deepspeed_config = config.get("deepspeed_config", None)
|
622 |
+
if deepspeed_config:
|
623 |
+
logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
|
624 |
+
|
625 |
+
# Create a temporary DeepSpeed config file
|
626 |
+
ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
|
627 |
+
|
628 |
+
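# Replace "auto" placeholders in the DeepSpeed config with concrete values (NOTE(review): train_batch_size below omits gradient_accumulation_steps; DeepSpeed expects micro batch * gradient accumulation * GPU count — confirm)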
|
629 |
+
if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
|
630 |
+
deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
|
631 |
+
|
632 |
+
if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
|
633 |
+
deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
|
634 |
+
|
635 |
+
# Write the DeepSpeed config to a file
|
636 |
+
with open(ds_config_path, 'w') as f:
|
637 |
+
json.dump(deepspeed_config, f, indent=2)
|
638 |
+
|
639 |
+
logger.info(f"Created DeepSpeed config at {ds_config_path}")
|
640 |
+
logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
|
641 |
+
|
642 |
+
# Enable CPU offloading if configured
|
643 |
+
if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
|
644 |
+
logger.info("DeepSpeed CPU offloading enabled for optimizer states")
|
645 |
+
else:
|
646 |
+
logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
|
647 |
+
ds_config_path = None
|
648 |
+
|
649 |
training_args_dict = {
|
650 |
"output_dir": output_dir,
|
651 |
"num_train_epochs": training_config.get("num_train_epochs", 3),
|
|
|
667 |
"disable_tqdm": training_config.get("disable_tqdm", False),
|
668 |
"remove_unused_columns": False,
|
669 |
"seed": 42,
|
670 |
+
"dataloader_num_workers": 4, # Use multiple workers for data loading
|
671 |
+
"deepspeed": ds_config_path # Add DeepSpeed config path if available
|
672 |
}
|
673 |
|
674 |
# Create TrainingArguments with validated parameters
|