George-API committed on
Commit
fa0ae8d
·
verified ·
1 Parent(s): 0a1769d

Upload run_cloud_training.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_cloud_training.py +62 -22
run_cloud_training.py CHANGED
@@ -7,7 +7,13 @@ RESEARCH TRAINING PHASE ONLY - No output generation
7
  WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
8
  """
9
 
 
10
  import os
 
 
 
 
 
11
  import json
12
  import logging
13
  import argparse
@@ -21,6 +27,7 @@ from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelFor
21
  from transformers.data.data_collator import DataCollatorMixin
22
  from peft import LoraConfig
23
  from unsloth import FastLanguageModel
 
24
 
25
  # Disable all attention optimizations that might cause issues
26
  os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -48,9 +55,6 @@ class XFormersBlocker:
48
  # Add our import blocker to sys.meta_path
49
  sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
50
 
51
- # Configure PyTorch memory allocator for better memory management
52
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
53
-
54
  # Configure logging first
55
  logging.basicConfig(
56
  level=logging.INFO,
@@ -401,20 +405,13 @@ def remove_training_marker():
401
  os.remove("TRAINING_ACTIVE")
402
  logger.info("Removed training active marker")
403
 
404
- def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False):
405
  """
406
  Load the model directly with HuggingFace, bypassing Unsloth optimizations
407
  to avoid memory-efficient attention issues
408
  """
409
  logger.info(f"Loading model: {model_name}")
410
 
411
- # Explicitly disable xformers and flash attention in environment
412
- os.environ["XFORMERS_DISABLED"] = "1"
413
- os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
414
-
415
- # Configure PyTorch memory allocator for better memory management with multiple GPUs
416
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
417
-
418
  # Create BitsAndBytesConfig for 4-bit quantization
419
  from transformers import BitsAndBytesConfig
420
  bnb_config = BitsAndBytesConfig(
@@ -449,10 +446,19 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
449
 
450
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
451
 
452
- # Use auto device mapping for multi-GPU setup
453
- device_map = "auto" if gpu_count > 1 else "auto"
454
- logger.info(f"Using device_map={device_map} for model distribution")
 
 
 
 
 
 
 
 
455
 
 
456
  model = AutoModelForCausalLM.from_pretrained(
457
  model_name,
458
  config=config,
@@ -462,7 +468,13 @@ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attentio
462
  trust_remote_code=True,
463
  attn_implementation=attn_implementation
464
  )
 
465
  logger.info("Model loaded successfully with standard HF loading")
 
 
 
 
 
466
  return model, tokenizer
467
 
468
  def train(config_path, dataset_name, output_dir):
@@ -471,14 +483,9 @@ def train(config_path, dataset_name, output_dir):
471
  load_dotenv()
472
  config = load_config(config_path)
473
 
474
- # Explicitly disable xformers and flash attention in environment
475
- os.environ["XFORMERS_DISABLED"] = "1"
476
- os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
477
  os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
478
 
479
- # Configure PyTorch memory allocator for better memory management with multiple GPUs
480
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
481
-
482
  # Try to unload xformers if it's loaded
483
  if 'xformers' in sys.modules:
484
  logger.info("Removing xformers from sys.modules")
@@ -561,10 +568,13 @@ def train(config_path, dataset_name, output_dir):
561
  # Force eager attention implementation
562
  use_flash_attention = False # Override to force eager implementation
563
 
 
 
 
564
  # Initialize model with our safe loading function
565
  logger.info("Loading pre-quantized model with eager attention")
566
  dtype = torch.float16 if hardware_config.get("fp16", True) else None
567
- model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention)
568
 
569
  # Disable generation capabilities for research training
570
  logger.info("Disabling generation capabilities - Research training only")
@@ -607,6 +617,35 @@ def train(config_path, dataset_name, output_dir):
607
  per_device_train_batch_size = 4 if gpu_count >= 4 else 2
608
  logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  training_args_dict = {
611
  "output_dir": output_dir,
612
  "num_train_epochs": training_config.get("num_train_epochs", 3),
@@ -628,7 +667,8 @@ def train(config_path, dataset_name, output_dir):
628
  "disable_tqdm": training_config.get("disable_tqdm", False),
629
  "remove_unused_columns": False,
630
  "seed": 42,
631
- "dataloader_num_workers": 4 # Use multiple workers for data loading
 
632
  }
633
 
634
  # Create TrainingArguments with validated parameters
 
7
  WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
8
  """
9
 
10
+ # Set critical environment variables before any imports
11
  import os
12
+ # Configure PyTorch memory allocator for better memory management with multiple GPUs
13
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
14
+ os.environ["XFORMERS_DISABLED"] = "1"
15
+ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
16
+
17
  import json
18
  import logging
19
  import argparse
 
27
  from transformers.data.data_collator import DataCollatorMixin
28
  from peft import LoraConfig
29
  from unsloth import FastLanguageModel
30
+ import deepspeed
31
 
32
  # Disable all attention optimizations that might cause issues
33
  os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
55
  # Add our import blocker to sys.meta_path
56
  sys.meta_path.insert(0, XFormersBlocker(sys.meta_path[0]))
57
 
 
 
 
58
  # Configure logging first
59
  logging.basicConfig(
60
  level=logging.INFO,
 
405
  os.remove("TRAINING_ACTIVE")
406
  logger.info("Removed training active marker")
407
 
408
+ def load_model_safely(model_name, max_seq_length, dtype=None, use_flash_attention=False, use_deepspeed=False):
409
  """
410
  Load the model directly with HuggingFace, bypassing Unsloth optimizations
411
  to avoid memory-efficient attention issues
412
  """
413
  logger.info(f"Loading model: {model_name}")
414
 
 
 
 
 
 
 
 
415
  # Create BitsAndBytesConfig for 4-bit quantization
416
  from transformers import BitsAndBytesConfig
417
  bnb_config = BitsAndBytesConfig(
 
446
 
447
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
448
 
449
+ # Set device mapping based on whether DeepSpeed is used
450
+ # When using DeepSpeed, we should use 'cpu' or 'meta' for initial loading
451
+ # to avoid OOM issues, as DeepSpeed will handle the device placement
452
+ if use_deepspeed:
453
+ logger.info("Using DeepSpeed - loading model initially on CPU to avoid OOM issues")
454
+ device_map = "cpu" # Load on CPU first, DeepSpeed will handle distribution
455
+ else:
456
+ # Always use auto device mapping for cloud hardware when not using DeepSpeed
457
+ device_map = "auto"
458
+
459
+ logger.info(f"Using device_map={device_map} for initial model loading")
460
 
461
+ # Load the model
462
  model = AutoModelForCausalLM.from_pretrained(
463
  model_name,
464
  config=config,
 
468
  trust_remote_code=True,
469
  attn_implementation=attn_implementation
470
  )
471
+
472
  logger.info("Model loaded successfully with standard HF loading")
473
+
474
+ # If using DeepSpeed, ensure model is properly prepared
475
+ if use_deepspeed:
476
+ logger.info("Model loaded on CPU - DeepSpeed will handle device placement during training")
477
+
478
  return model, tokenizer
479
 
480
  def train(config_path, dataset_name, output_dir):
 
483
  load_dotenv()
484
  config = load_config(config_path)
485
 
486
+ # Set CUDA launch blocking for better error reporting
 
 
487
  os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
488
 
 
 
 
489
  # Try to unload xformers if it's loaded
490
  if 'xformers' in sys.modules:
491
  logger.info("Removing xformers from sys.modules")
 
568
  # Force eager attention implementation
569
  use_flash_attention = False # Override to force eager implementation
570
 
571
+ # Check if we're using DeepSpeed
572
+ using_deepspeed = ds_config_path is not None
573
+
574
  # Initialize model with our safe loading function
575
  logger.info("Loading pre-quantized model with eager attention")
576
  dtype = torch.float16 if hardware_config.get("fp16", True) else None
577
+ model, tokenizer = load_model_safely(model_name, max_seq_length, dtype, use_flash_attention, use_deepspeed=using_deepspeed)
578
 
579
  # Disable generation capabilities for research training
580
  logger.info("Disabling generation capabilities - Research training only")
 
617
  per_device_train_batch_size = 4 if gpu_count >= 4 else 2
618
  logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
619
 
620
+ # Check if DeepSpeed config is available
621
+ deepspeed_config = config.get("deepspeed_config", None)
622
+ if deepspeed_config:
623
+ logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
624
+
625
+ # Create a temporary DeepSpeed config file
626
+ ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
627
+
628
+ # Update DeepSpeed config with dynamic values
629
+ if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
630
+ deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
631
+
632
+ if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
633
+ deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
634
+
635
+ # Write the DeepSpeed config to a file
636
+ with open(ds_config_path, 'w') as f:
637
+ json.dump(deepspeed_config, f, indent=2)
638
+
639
+ logger.info(f"Created DeepSpeed config at {ds_config_path}")
640
+ logger.info(f"DeepSpeed ZeRO Stage: {deepspeed_config.get('zero_optimization', {}).get('stage', 'Not specified')}")
641
+
642
+ # Enable CPU offloading if configured
643
+ if deepspeed_config.get("zero_optimization", {}).get("offload_optimizer", {}).get("device") == "cpu":
644
+ logger.info("DeepSpeed CPU offloading enabled for optimizer states")
645
+ else:
646
+ logger.warning("No DeepSpeed configuration found - continuing without DeepSpeed")
647
+ ds_config_path = None
648
+
649
  training_args_dict = {
650
  "output_dir": output_dir,
651
  "num_train_epochs": training_config.get("num_train_epochs", 3),
 
667
  "disable_tqdm": training_config.get("disable_tqdm", False),
668
  "remove_unused_columns": False,
669
  "seed": 42,
670
+ "dataloader_num_workers": 4, # Use multiple workers for data loading
671
+ "deepspeed": ds_config_path # Add DeepSpeed config path if available
672
  }
673
 
674
  # Create TrainingArguments with validated parameters