Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  +26 -5
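As the commit message notes, the script was pushed with the huggingface_hub client. For context, a minimal sketch of that kind of upload; the repo id below is a placeholder rather than this Space's actual id, and the call assumes a token is available from a cached login or the HF_TOKEN environment variable:

# Hypothetical upload sketch; repo_id is a placeholder, not this Space.
from huggingface_hub import HfApi

api = HfApi()  # authenticates from a cached login or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="run_cloud_training.py",  # local script to push
    path_in_repo="run_cloud_training.py",     # destination path in the repo
    repo_id="<user>/<space-name>",            # placeholder Space id
    repo_type="space",
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)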
run_cloud_training.py CHANGED

@@ -16,11 +16,14 @@ from dotenv import load_dotenv
 import torch
 from datasets import load_dataset
 import transformers
-from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
+from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, AutoConfig
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig
 from unsloth import FastLanguageModel
 
+# Disable flash attention globally
+os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+
 # Check if tensorboard is available
 try:
     import tensorboard
@@ -263,13 +266,16 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
     # First try the standard unsloth loading
     try:
         # Try loading with unsloth but without the problematic parameter
+        logger.info("Loading model with flash attention DISABLED")
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
             load_in_4bit=True,  # This should work for already quantized models
+            use_flash_attention=False,  # Explicitly disable flash attention
+            attn_implementation="eager"  # Use eager implementation instead
         )
-        logger.info("Model loaded successfully with unsloth with 4-bit quantization")
+        logger.info("Model loaded successfully with unsloth with 4-bit quantization and flash attention disabled")
         return model, tokenizer
 
     except TypeError as e:
@@ -283,6 +289,7 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
             model_name=model_name,
             max_seq_length=max_seq_length,
             dtype=dtype,
+            use_flash_attention=False,  # Explicitly disable flash attention
         )
         logger.info("Model loaded successfully with unsloth using alternative method")
         return model, tokenizer
@@ -295,14 +302,22 @@ def load_model_safely(model_name, max_seq_length, dtype=None):
         logger.warning(f"Unsloth loading failed: {e}")
         logger.info("Falling back to standard Hugging Face loading...")
 
+        # Disable flash attention in transformers config
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        if hasattr(config, "use_flash_attention"):
+            config.use_flash_attention = False
+            logger.info("Disabled flash attention in model config")
+
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
+            config=config,
             device_map="auto",
             torch_dtype=dtype or torch.float16,
             load_in_4bit=True,
+            attn_implementation="eager"  # Use eager implementation instead of flash attention
         )
-        logger.info("Model loaded successfully with standard HF loading")
+        logger.info("Model loaded successfully with standard HF loading and flash attention disabled")
         return model, tokenizer
 
 def train(config_path, dataset_name, output_dir):
@@ -318,6 +333,10 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
+    # Override flash attention setting to disable it
+    hardware_config["use_flash_attention"] = False
+    logger.info("Flash attention has been DISABLED due to GPU compatibility issues")
+
     # Verify this is training phase only
     training_phase_only = dataset_config.get("training_phase_only", True)
     if not training_phase_only:
@@ -404,7 +423,7 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    # Set up training arguments
+    # Set up training arguments with flash attention disabled
    training_args = TrainingArguments(
         output_dir=output_dir,
         num_train_epochs=training_config.get("num_train_epochs", 3),
@@ -425,7 +444,9 @@ def train(config_path, dataset_name, output_dir):
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         # Important: Don't remove columns that don't match model's forward method
-        remove_unused_columns=False
+        remove_unused_columns=False,
+        # Disable flash attention
+        attn_implementation="eager"
     )
 
     # Create trainer with pre-tokenized collator
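For context, a minimal standalone sketch of the loading pattern the fallback branch converges on: requesting the eager attention implementation at model-load time instead of flash attention. The model id is a placeholder, and the 4-bit settings are expressed through BitsAndBytesConfig here as a stand-in for the bare load_in_4bit flag used in the script; attn_implementation is consumed by from_pretrained when the model is instantiated.

# Hedged sketch, not this Space's actual code: load a causal LM with the
# eager attention path and 4-bit quantization. "<org>/<model>" is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "<org>/<model>"  # placeholder model id

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    attn_implementation="eager",  # use the eager attention kernels, not flash attention
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,                     # 4-bit weights via bitsandbytes
        bnb_4bit_compute_dtype=torch.float16,  # compute in fp16
    ),
)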