George-API committed
Commit f3c357b · verified · 1 parent: 6b2c2bc

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +92 -3
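
For reference, an upload like the one in this commit is typically done with the huggingface_hub Python client. Below is a minimal sketch using HfApi.upload_file; the repo id is a placeholder, not this repository's actual id.

    from huggingface_hub import HfApi

    api = HfApi()  # authenticates via HF_TOKEN or a previous `huggingface-cli login`

    # Push a single file to the target repo (repo_id below is a placeholder)
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",
        path_in_repo="run_cloud_training.py",
        repo_id="<username>/<repo>",
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )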
run_cloud_training.py CHANGED

@@ -14,6 +14,7 @@ import argparse
 import numpy as np
 from dotenv import load_dotenv
 import torch
+import sys
 from datasets import load_dataset
 import transformers
 from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, AutoConfig
@@ -26,6 +27,21 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 os.environ["XFORMERS_DISABLED"] = "1"
 
+# Completely disable xformers by removing it from sys.modules if it's loaded
+if 'xformers' in sys.modules:
+    del sys.modules['xformers']
+if 'xformers.ops' in sys.modules:
+    del sys.modules['xformers.ops']
+
+# Patch transformers to prevent xformers import
+def prevent_xformers_import(name, *args, **kwargs):
+    if 'xformers' in name:
+        raise ImportError(f"Import of {name} prevented")
+    return original_import(name, *args, **kwargs)
+
+original_import = __import__
+__builtins__['__import__'] = prevent_xformers_import
+
 # Configure PyTorch memory allocator for better memory management
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
@@ -391,6 +407,41 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
     os.environ["XFORMERS_DISABLED"] = "1"
 
+    # Patch transformers attention implementation
+    try:
+        # Try to patch transformers attention implementation to avoid xformers
+        import transformers.models.llama.modeling_llama as llama_modeling
+
+        # Store original attention implementation
+        if not hasattr(llama_modeling, '_original_forward'):
+            # Only patch if not already patched
+            logger.info("Patching LLaMA attention implementation to avoid xformers")
+
+            # Store original implementation
+            if hasattr(llama_modeling.LlamaAttention, 'forward'):
+                llama_modeling._original_forward = llama_modeling.LlamaAttention.forward
+
+            # Define a new forward method that doesn't use xformers
+            def safe_attention_forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False):
+                logger.info("Using safe attention implementation (no xformers)")
+
+                # Force use_flash_attention to False
+                self._attn_implementation = "eager"
+                if hasattr(self, 'use_flash_attention'):
+                    self.use_flash_attention = False
+                if hasattr(self, 'use_flash_attention_2'):
+                    self.use_flash_attention_2 = False
+
+                # Call original implementation with flash attention disabled
+                return llama_modeling._original_forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
+
+            # Replace the forward method
+            llama_modeling.LlamaAttention.forward = safe_attention_forward
+            logger.info("Successfully patched LLaMA attention implementation")
+    except Exception as e:
+        logger.warning(f"Failed to patch attention implementation: {e}")
+        logger.info("Will try to proceed with standard loading")
+
     try:
         logger.info(f"Attempting to load model with unsloth optimizations: {model_name}")
 
@@ -412,7 +463,9 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             max_seq_length=max_seq_length,
             dtype=dtype,
             quantization_config=bnb_config,
-            attn_implementation="eager" # Force eager attention
+            attn_implementation="eager", # Force eager attention
+            use_flash_attention=False, # Explicitly disable flash attention
+            use_xformers_attention=False # Explicitly disable xformers
         )
         logger.info("Model loaded successfully with unsloth")
 
@@ -424,6 +477,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             model.config.use_flash_attention = False
         if hasattr(model.config, 'use_flash_attention_2'):
             model.config.use_flash_attention_2 = False
+        if hasattr(model.config, 'use_xformers_attention'):
+            model.config.use_xformers_attention = False
 
         return model, tokenizer
 
@@ -432,7 +487,11 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
         logger.info("Falling back to standard Hugging Face loading...")
 
         # We'll try with HF loading
-        attn_params = {"attn_implementation": "eager"} # Always use eager
+        attn_params = {
+            "attn_implementation": "eager", # Always use eager
+            "use_flash_attention": False, # Explicitly disable flash attention
+            "use_xformers_attention": False # Explicitly disable xformers
+        }
 
         # Approach 1: Using attn_implementation parameter (newer method)
         try:
@@ -446,6 +505,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 config.use_flash_attention = False
             if hasattr(config, 'use_flash_attention_2'):
                 config.use_flash_attention_2 = False
+            if hasattr(config, 'use_xformers_attention'):
+                config.use_xformers_attention = False
 
             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
@@ -476,6 +537,8 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 config.use_flash_attention = False
             if hasattr(config, 'use_flash_attention_2'):
                 config.use_flash_attention_2 = False
+            if hasattr(config, 'use_xformers_attention'):
+                config.use_xformers_attention = False
 
             tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
@@ -486,7 +549,10 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
                 device_map="auto",
                 torch_dtype=dtype or torch.float16,
                 quantization_config=bnb_config,
-                trust_remote_code=True
+                trust_remote_code=True,
+                attn_implementation="eager",
+                use_flash_attention=False,
+                use_xformers_attention=False
             )
             logger.info("Model loaded successfully with basic HF loading")
             return model, tokenizer
@@ -513,6 +579,28 @@ def train(config_path, dataset_name, output_dir):
     os.environ["XFORMERS_DISABLED"] = "1"
     os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
+    # Monkey patch torch.nn.functional to disable memory_efficient_attention
+    try:
+        import torch.nn.functional as F
+        if hasattr(F, 'scaled_dot_product_attention'):
+            logger.info("Monkey patching torch.nn.functional.scaled_dot_product_attention")
+            original_sdpa = F.scaled_dot_product_attention
+
+            def safe_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
+                # Force disable memory efficient attention
+                logger.info("Using safe scaled_dot_product_attention (no xformers)")
+                return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
+
+            F.scaled_dot_product_attention = safe_sdpa
+    except Exception as e:
+        logger.warning(f"Failed to patch scaled_dot_product_attention: {e}")
+
+    # Completely remove xformers from sys.modules if it's loaded
+    for module_name in list(sys.modules.keys()):
+        if 'xformers' in module_name:
+            logger.info(f"Removing {module_name} from sys.modules")
+            del sys.modules[module_name]
+
     # Update flash attention setting to always use eager
     global flash_attention_available
     flash_attention_available = False
@@ -521,6 +609,7 @@ def train(config_path, dataset_name, output_dir):
     # Update hardware config to ensure eager attention
     hardware_config["attn_implementation"] = "eager"
     hardware_config["use_flash_attention"] = False
+    hardware_config["use_xformers_attention"] = False
 
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)