George-API committed · verified
Commit c7c538f · 1 Parent(s): 9dfc456

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +306 -0
run_cloud_training.py ADDED
@@ -0,0 +1,306 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-bnb-4bit using unsloth
+ RESEARCH TRAINING PHASE ONLY - No output generation
+ WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
+ """
+
+ import os
+ import json
+ import logging
+ import argparse
+ import numpy as np
+ from dotenv import load_dotenv
+ import torch
+ from datasets import load_dataset
+ import transformers
+ from transformers import AutoTokenizer, TrainingArguments, Trainer
+ from transformers.data.data_collator import DataCollatorMixin
+ from peft import LoraConfig
+ from unsloth import FastLanguageModel
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler("training.log")
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ def load_config(config_path):
+     """Load the transformers config from JSON file"""
+     logger.info(f"Loading config from {config_path}")
+     with open(config_path, 'r') as f:
+         config = json.load(f)
+     return config
+
+ def load_and_prepare_dataset(dataset_name, config):
+     """
+     Load and prepare the dataset for fine-tuning.
+     Sort entries by prompt_number as required.
+     NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
+     """
+     logger.info(f"Loading dataset: {dataset_name}")
+
+     # Load dataset
+     dataset = load_dataset(dataset_name)
+
+     # Extract the split we want to use (usually 'train')
+     if 'train' in dataset:
+         dataset = dataset['train']
+
+     # Get the dataset config
+     dataset_config = config.get("dataset_config", {})
+     sort_field = dataset_config.get("sort_by_field", "prompt_number")
+     sort_direction = dataset_config.get("sort_direction", "ascending")
+
+     # Sort the dataset by prompt_number
+     logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
+     if sort_direction == "ascending":
+         dataset = dataset.sort(sort_field)
+     else:
+         dataset = dataset.sort(sort_field, reverse=True)
+
+     # Add shuffle with fixed seed if specified
+     if "shuffle_seed" in dataset_config:
+         shuffle_seed = dataset_config.get("shuffle_seed")
+         logger.info(f"Shuffling dataset with seed {shuffle_seed}")
+         dataset = dataset.shuffle(seed=shuffle_seed)
+
+     logger.info(f"Dataset loaded with {len(dataset)} entries")
+     return dataset
+
+ # Data collator for pre-tokenized dataset
+ class PreTokenizedCollator(DataCollatorMixin):
+     """
+     Data collator for pre-tokenized datasets.
+     Expects input_ids and labels already tokenized.
+     """
+     def __init__(self, pad_token_id=0):
+         self.pad_token_id = pad_token_id
+
+     def __call__(self, features):
+         # Determine max length in this batch
+         batch_max_len = max(len(x["input_ids"]) for x in features)
+
+         # Initialize batch tensors
+         batch = {
+             "input_ids": torch.ones((len(features), batch_max_len), dtype=torch.long) * self.pad_token_id,
+             "attention_mask": torch.zeros((len(features), batch_max_len), dtype=torch.long),
+             "labels": torch.ones((len(features), batch_max_len), dtype=torch.long) * -100  # -100 is ignored in loss
+         }
+
+         # Fill batch tensors
+         for i, feature in enumerate(features):
+             input_ids = feature["input_ids"]
+             seq_len = len(input_ids)
+
+             # Convert to tensor if it's a list
+             if isinstance(input_ids, list):
+                 input_ids = torch.tensor(input_ids, dtype=torch.long)
+
+             # Copy data to batch tensors
+             batch["input_ids"][i, :seq_len] = input_ids
+             batch["attention_mask"][i, :seq_len] = 1
+
+             # If there are labels, use them, otherwise use input_ids
+             if "labels" in feature:
+                 labels = feature["labels"]
+                 if isinstance(labels, list):
+                     labels = torch.tensor(labels, dtype=torch.long)
+                 batch["labels"][i, :len(labels)] = labels
+             else:
+                 batch["labels"][i, :seq_len] = input_ids
+
+         return batch
+
+ def create_training_marker(output_dir):
+     """Create a marker file to indicate training is active"""
+     # Create in current directory for app.py to find
+     with open("TRAINING_ACTIVE", "w") as f:
+         f.write(f"Training active in {output_dir}")
+
+     # Also create in output directory
+     os.makedirs(output_dir, exist_ok=True)
+     with open(os.path.join(output_dir, "RESEARCH_TRAINING_ONLY"), "w") as f:
+         f.write("This model is for research training only. No interactive outputs.")
+
+ def remove_training_marker():
+     """Remove the training marker file"""
+     if os.path.exists("TRAINING_ACTIVE"):
+         os.remove("TRAINING_ACTIVE")
+         logger.info("Removed training active marker")
+
+ def train(config_path, dataset_name, output_dir):
+     """Main training function - RESEARCH TRAINING PHASE ONLY"""
+     # Load environment variables and configuration
+     load_dotenv()
+     config = load_config(config_path)
+
+     # Extract configs
+     model_config = config.get("model_config", {})
+     training_config = config.get("training_config", {})
+     hardware_config = config.get("hardware_config", {})
+     lora_config = config.get("lora_config", {})
+     dataset_config = config.get("dataset_config", {})
+
+     # Verify this is training phase only
+     training_phase_only = dataset_config.get("training_phase_only", True)
+     if not training_phase_only:
+         logger.warning("This script is meant for research training phase only")
+         logger.warning("Setting training_phase_only=True")
+
+     # Verify dataset is pre-tokenized
+     logger.info("IMPORTANT: Using pre-tokenized dataset - No tokenization will be performed")
+
+     # Set the output directory
+     output_dir = output_dir or training_config.get("output_dir", "fine_tuned_model")
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Create training marker
+     create_training_marker(output_dir)
+
+     try:
+         # Print configuration summary
+         logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
+         logger.info("Configuration Summary:")
+         logger.info(f"Model: {model_config.get('model_name_or_path')}")
+         logger.info(f"Dataset: {dataset_name}")
+         logger.info(f"Output directory: {output_dir}")
+         logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
+
+         # Load and prepare the dataset
+         dataset = load_and_prepare_dataset(dataset_name, config)
+
+         # Initialize tokenizer (just for model initialization, not for tokenizing data)
+         logger.info("Loading tokenizer (for model initialization only, not for tokenizing data)")
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_config.get("model_name_or_path"),
+             trust_remote_code=True
+         )
+         tokenizer.pad_token = tokenizer.eos_token
+
+         # Initialize model with unsloth
+         logger.info("Initializing model with unsloth (preserving 4-bit quantization)")
+         max_seq_length = training_config.get("max_seq_length", 2048)
+
+         # Create LoRA config
+         peft_config = LoraConfig(
+             r=lora_config.get("r", 16),
+             lora_alpha=lora_config.get("lora_alpha", 32),
+             lora_dropout=lora_config.get("lora_dropout", 0.05),
+             bias=lora_config.get("bias", "none"),
+             target_modules=lora_config.get("target_modules", ["q_proj", "k_proj", "v_proj", "o_proj"])
+         )
+
+         # Initialize model with unsloth, preserving existing 4-bit quantization
+         logger.info("Loading pre-quantized model with unsloth")
+         model, tokenizer = FastLanguageModel.from_pretrained(
+             model_name=model_config.get("model_name_or_path"),
+             max_seq_length=max_seq_length,
+             dtype=torch.float16 if hardware_config.get("fp16", True) else None,
+             load_in_4bit=False,  # Don't re-quantize, model is already 4-bit
+             use_existing_bnb_quantization=True  # Use the existing quantization
+         )
+         model = FastLanguageModel.get_peft_model(
+             model,
+             peft_config=peft_config,
+             tokenizer=tokenizer,
+             use_gradient_checkpointing=hardware_config.get("gradient_checkpointing", True)
+         )
+
+         # No need to format the dataset - it's already pre-tokenized
+         logger.info("Using pre-tokenized dataset - skipping tokenization step")
+         training_dataset = dataset
+
+         # Configure wandb if API key is available
+         reports = ["tensorboard"]
+         if os.getenv("WANDB_API_KEY"):
+             reports.append("wandb")
+             logger.info("Wandb API key found, enabling wandb reporting")
+         else:
+             logger.info("No Wandb API key found, using tensorboard only")
+
+         # Set up training arguments
+         training_args = TrainingArguments(
+             output_dir=output_dir,
+             num_train_epochs=training_config.get("num_train_epochs", 3),
+             per_device_train_batch_size=training_config.get("per_device_train_batch_size", 2),
+             gradient_accumulation_steps=training_config.get("gradient_accumulation_steps", 4),
+             learning_rate=training_config.get("learning_rate", 2e-5),
+             lr_scheduler_type=training_config.get("lr_scheduler_type", "cosine"),
+             warmup_ratio=training_config.get("warmup_ratio", 0.03),
+             weight_decay=training_config.get("weight_decay", 0.01),
+             optim=training_config.get("optim", "adamw_torch"),
+             logging_steps=training_config.get("logging_steps", 10),
+             save_steps=training_config.get("save_steps", 200),
+             save_total_limit=training_config.get("save_total_limit", 3),
+             fp16=hardware_config.get("fp16", True),
+             bf16=hardware_config.get("bf16", False),
+             max_grad_norm=training_config.get("max_grad_norm", 0.3),
+             report_to=reports,
+             logging_first_step=training_config.get("logging_first_step", True),
+             disable_tqdm=training_config.get("disable_tqdm", False)
+         )
+
+         # Create trainer with pre-tokenized collator
+         trainer = Trainer(
+             model=model,
+             args=training_args,
+             train_dataset=training_dataset,
+             data_collator=PreTokenizedCollator(pad_token_id=tokenizer.pad_token_id),
+         )
+
+         # Start training
+         logger.info("Starting training - RESEARCH PHASE ONLY")
+         trainer.train()
+
+         # Save the model
+         logger.info(f"Saving model to {output_dir}")
+         trainer.save_model(output_dir)
+
+         # Save LoRA adapter separately for easier deployment
+         lora_output_dir = os.path.join(output_dir, "lora_adapter")
+         model.save_pretrained(lora_output_dir)
+         logger.info(f"Saved LoRA adapter to {lora_output_dir}")
+
+         # Save tokenizer for completeness
+         tokenizer_output_dir = os.path.join(output_dir, "tokenizer")
+         tokenizer.save_pretrained(tokenizer_output_dir)
+         logger.info(f"Saved tokenizer to {tokenizer_output_dir}")
+
+         # Copy config file for reference
+         with open(os.path.join(output_dir, "training_config.json"), "w") as f:
+             json.dump(config, f, indent=2)
+
+         logger.info("Training complete - RESEARCH PHASE ONLY")
+         return output_dir
+
+     finally:
+         # Always remove the training marker when done
+         remove_training_marker()
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-4bit model (RESEARCH ONLY)")
+     parser.add_argument("--config", type=str, default="transformers_config.json",
+                         help="Path to the transformers config JSON file")
+     parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
+                         help="Dataset name or path")
+     parser.add_argument("--output_dir", type=str, default=None,
+                         help="Output directory for the fine-tuned model")
+
+     args = parser.parse_args()
+
+     # Run training - Research phase only
+     try:
+         output_path = train(args.config, args.dataset, args.output_dir)
+         print(f"Research training completed. Model saved to: {output_path}")
+     except Exception as e:
+         logger.error(f"Training failed: {str(e)}")
+         remove_training_marker()  # Clean up marker if training fails
+         raise
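
Editor's note: the script reads its settings from a transformers_config.json file that is not included in this commit. The sketch below is a hypothetical minimal config, written as a Python dict for readability, with the structure inferred from the config.get(...) calls above; every value shown is simply the script's own fallback default, and the model path is illustrative rather than the authors' confirmed setting.

# Hypothetical minimal transformers_config.json (Python-dict form; json.dump(example_config, f)
# would produce the JSON file the script expects). All values are the script's defaults.
example_config = {
    "model_config": {
        # Illustrative; the docstring names a DeepSeek-R1-Distill-Qwen-14B bnb-4bit checkpoint
        "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit"
    },
    "training_config": {
        "num_train_epochs": 3,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "learning_rate": 2e-5,
        "max_seq_length": 2048,
        "output_dir": "fine_tuned_model"
    },
    "hardware_config": {"fp16": True, "bf16": False, "gradient_checkpointing": True},
    "lora_config": {
        "r": 16, "lora_alpha": 32, "lora_dropout": 0.05, "bias": "none",
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
    },
    "dataset_config": {
        "sort_by_field": "prompt_number",
        "sort_direction": "ascending",
        "training_phase_only": True
    }
}

With such a file in place, the invocation implied by the argparse defaults would be roughly: python run_cloud_training.py --config transformers_config.json --dataset phi4-cognitive-dataset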