Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  +26 -5
@@ -16,11 +16,14 @@ from dotenv import load_dotenv
 import torch
 from datasets import load_dataset
 import transformers
-from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
+from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, AutoConfig
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
+# Disable flash attention globally
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+
 # Check if tensorboard is available
 try:
     import tensorboard
@@ -263,13 +266,16 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     # First try the standard unsloth loading
     try:
         # Try loading with unsloth but without the problematic parameter
+        logger.info("Loading model with flash attention DISABLED")
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
             load_in_4bit=True,  # This should work for already quantized models
+            use_flash_attention=False,  # Explicitly disable flash attention
+            attn_implementation="eager"  # Use eager implementation instead
         )
-        logger.info("Model loaded successfully with unsloth with 4-bit quantization")
+        logger.info("Model loaded successfully with unsloth with 4-bit quantization and flash attention disabled")
         return model, tokenizer
 
     except TypeError as e:
@@ -283,6 +289,7 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
+            use_flash_attention=False,  # Explicitly disable flash attention
         )
         logger.info("Model loaded successfully with unsloth using alternative method")
         return model, tokenizer
@@ -295,14 +302,22 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
         logger.warning(f"Unsloth loading failed: {e}")
         logger.info("Falling back to standard Hugging Face loading...")
 
+        # Disable flash attention in transformers config
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        if hasattr(config, "use_flash_attention"):
+            config.use_flash_attention = False
+            logger.info("Disabled flash attention in model config")
+
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
+            config=config,
             device_map="auto",
             torch_dtype=dtype or torch.float16,
             load_in_4bit=True,
+            attn_implementation="eager"  # Use eager implementation instead of flash attention
         )
-        logger.info("Model loaded successfully with standard HF loading")
+        logger.info("Model loaded successfully with standard HF loading and flash attention disabled")
         return model, tokenizer
 
 def train(config_path, dataset_name, output_dir):
@@ -318,6 +333,10 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
+    # Override flash attention setting to disable it
+    hardware_config["use_flash_attention"] = False
+    logger.info("Flash attention has been DISABLED due to GPU compatibility issues")
+
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)
     if not training_phase_only:
@@ -404,7 +423,7 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    # Set up training arguments
+    # Set up training arguments with flash attention disabled
    training_args = TrainingArguments(
         output_dir=output_dir,
         num_train_epochs=training_config.get("num_train_epochs", 3),
@@ -425,7 +444,9 @@ def train(config_path, dataset_name, output_dir):
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         # Important: Don't remove columns that don't match model's forward method
-        remove_unused_columns=False
+        remove_unused_columns=False,
+        # Disable flash attention
+        attn_implementation="eager"
     )
 
     # Create trainer with pre-tokenized collator
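One caveat on the final hunk: attn_implementation is a from_pretrained() argument rather than a TrainingArguments field, so recent transformers releases should be expected to reject it with a TypeError when the dataclass is constructed. A minimal sketch of the safer placement follows; the "gpt2" model name and the training_config dict literal are placeholders for illustration, not values from this script:

# Sketch, not the script itself: keep the attention setting out of
# TrainingArguments and pass it to from_pretrained(), which does
# accept it (transformers >= 4.36).
from transformers import AutoModelForCausalLM, TrainingArguments

model_name = "gpt2"  # placeholder model for illustration
training_config = {"num_train_epochs": 3, "attn_implementation": "eager"}  # hypothetical config

# Pull the attention key out before the dataclass ever sees it ...
attn_impl = training_config.pop("attn_implementation", "eager")

# ... and apply it where it belongs, at model load time
model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation=attn_impl)

# TrainingArguments now receives only keys it actually defines
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=training_config.get("num_train_epochs", 3),
    remove_unused_columns=False,
)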
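Because the change disables flash attention along three independent paths (the environment variable, the model config flag, and the load-time kwargs), a quick post-load check can confirm which backend actually won. A minimal sketch, assuming a transformers version (4.36 or later) that records the resolved backend on the model config as the private _attn_implementation attribute; the getattr fallback covers releases where it is absent:

from transformers import AutoModelForCausalLM

def attention_backend(model) -> str:
    # transformers >= 4.36 records the resolved implementation here;
    # older releases may not, hence the "unknown" fallback
    return getattr(model.config, "_attn_implementation", "unknown")

# Placeholder model for illustration; the script would check its own loaded model
model = AutoModelForCausalLM.from_pretrained("gpt2", attn_implementation="eager")
assert attention_backend(model) == "eager"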