pathii committed (verified)
Commit 60c3115 · 1 Parent(s): 276f253

Update train.py

Files changed (1)
  train.py  +35 -26
train.py CHANGED
@@ -1,19 +1,40 @@
+import os
 from unsloth import FastLanguageModel
 from transformers import TrainingArguments, Trainer
+from datasets import load_dataset
+import torch
+
+# Validate environment variable
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    raise ValueError("HF_TOKEN environment variable not set")
 
 # Load quantized model
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name="deepseek-ai/DeepSeek-V3-0324",
-    dtype=torch.bfloat16,
-    load_in_4bit=True,  # Or 2.71-bit
-    token=os.environ["HF_TOKEN"]
-)
-FastLanguageModel.for_training(model)
+try:
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name="deepseek-ai/DeepSeek-V3",
+        dtype=torch.bfloat16,
+        load_in_4bit=True,
+        token=HF_TOKEN
+    )
+    FastLanguageModel.for_training(model)
+except Exception as e:
+    raise RuntimeError(f"Failed to load model: {str(e)}")
+
+# Load and prepare dataset (example - replace with your actual dataset)
+try:
+    dataset = load_dataset("imdb")  # Example dataset
+    tokenized_dataset = dataset.map(
+        lambda x: tokenizer(x["text"], truncation=True, padding="max_length"),
+        batched=True
+    )
+except Exception as e:
+    raise RuntimeError(f"Failed to load/prepare dataset: {str(e)}")
 
 # Training arguments
 training_args = TrainingArguments(
     output_dir="/app/checkpoints",
-    per_device_train_batch_size=4,  # Adjust for A100 40GB/80GB
+    per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
     num_train_epochs=2,
     learning_rate=2e-5,
@@ -23,26 +44,11 @@ training_args = TrainingArguments(
     eval_steps=500,
     logging_dir="/app/logs",
     logging_steps=100,
-    fp16=False,  # bfloat16 for A100
+    fp16=False,
+    bf16=True,
     deepspeed="/app/ds_config.json"
 )
 
-# DeepSpeed config
-with open("/app/ds_config.json", "w") as f:
-    f.write('''
-    {
-      "fp16": {"enabled": false},
-      "bf16": {"enabled": true},
-      "zero_optimization": {
-        "stage": 3,
-        "offload_optimizer": {"device": "cpu"},
-        "offload_param": {"device": "cpu"}
-      },
-      "train_batch_size": "auto",
-      "gradient_accumulation_steps": 4
-    }
-    ''')
-
 # Initialize trainer
 trainer = Trainer(
     model=model,
@@ -52,7 +58,10 @@ trainer = Trainer(
 )
 
 # Train
-trainer.train()
+try:
+    trainer.train()
+except Exception as e:
+    raise RuntimeError(f"Training failed: {str(e)}")
 
 # Save model
 model.save_pretrained("/app/fine_tuned_model")
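
Note that the updated script still passes deepspeed="/app/ds_config.json" to TrainingArguments, but the inline block that wrote that file was removed in this commit, so the config now has to exist on disk before train.py runs. A minimal sketch that recreates it as a separate setup step is below; the values are taken directly from the removed JSON block (bf16 enabled, ZeRO stage 3 with CPU offload of optimizer state and parameters), while the helper file name setup_ds_config.py is hypothetical and not part of the commit.

# setup_ds_config.py (hypothetical helper, not part of this commit):
# recreate the DeepSpeed config train.py expects at /app/ds_config.json,
# using the same values as the inline block removed here.
import json

ds_config = {
    "fp16": {"enabled": False},
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,                              # ZeRO-3: shard params, grads, optimizer state
        "offload_optimizer": {"device": "cpu"},  # keep optimizer state in CPU RAM
        "offload_param": {"device": "cpu"}       # keep idle parameters in CPU RAM
    },
    "train_batch_size": "auto",                  # let the HF Trainer resolve this
    "gradient_accumulation_steps": 4
}

with open("/app/ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)

With a DeepSpeed config set in TrainingArguments, the script is normally started through a distributed launcher (for example deepspeed train.py or torchrun) rather than plain python train.py.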
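
Separately, the hunks above leave the body of the Trainer(...) call untouched, so the diff never shows where the new tokenized_dataset is consumed. A sketch of the usual Hugging Face Trainer wiring follows; the train/test split names, the use of training_args, and the causal-LM data collator are assumptions for illustration, not lines from the commit.

# Sketch only: assumed wiring for the unchanged Trainer(...) call.
from transformers import DataCollatorForLanguageModeling, Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],   # split names assumed from load_dataset("imdb")
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),  # causal LM, no masking
)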