Spaces:
Sleeping
Sleeping
Upload run_cloud_training.py with huggingface_hub
Browse files
- run_cloud_training.py +45 -28
run_cloud_training.py
CHANGED
@@ -32,6 +32,9 @@ logging.basicConfig(
|
|
32 |
)
|
33 |
logger = logging.getLogger(__name__)
|
34 |
|
|
|
|
|
|
|
35 |
def load_config(config_path):
|
36 |
"""Load the transformers config from JSON file"""
|
37 |
logger.info(f"Loading config from {config_path}")
|
@@ -45,35 +48,49 @@ def load_and_prepare_dataset(dataset_name, config):
|
|
45 |
Sort entries by prompt_number as required.
|
46 |
NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
|
47 |
"""
|
|
|
|
|
|
|
|
|
48 |
logger.info(f"Loading dataset: {dataset_name}")
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
shuffle_seed
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
# Data collator for pre-tokenized dataset
|
79 |
class PreTokenizedCollator(DataCollatorMixin):
|
@@ -138,7 +155,7 @@ def remove_training_marker():
|
|
138 |
|
139 |
def train(config_path, dataset_name, output_dir):
|
140 |
"""Main training function - RESEARCH TRAINING PHASE ONLY"""
|
141 |
-
# Load environment variables
|
142 |
load_dotenv()
|
143 |
config = load_config(config_path)
|
144 |
|
@@ -170,7 +187,7 @@ def train(config_path, dataset_name, output_dir):
|
|
170 |
logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
|
171 |
logger.info("Configuration Summary:")
|
172 |
logger.info(f"Model: {model_config.get('model_name_or_path')}")
|
173 |
-
logger.info(f"Dataset: {dataset_name}")
|
174 |
logger.info(f"Output directory: {output_dir}")
|
175 |
logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
|
176 |
|
|
|
32 |
)
|
33 |
logger = logging.getLogger(__name__)
|
34 |
|
35 |
+
# Default dataset path - use the correct path with username
|
36 |
+
DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
|
37 |
+
|
38 |
def load_config(config_path):
|
39 |
"""Load the transformers config from JSON file"""
|
40 |
logger.info(f"Loading config from {config_path}")
|
|
|
48 |
Sort entries by prompt_number as required.
|
49 |
NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
|
50 |
"""
|
51 |
+
# Use the default dataset path if no specific path is provided
|
52 |
+
if dataset_name == "phi4-cognitive-dataset":
|
53 |
+
dataset_name = DEFAULT_DATASET
|
54 |
+
|
55 |
logger.info(f"Loading dataset: {dataset_name}")
|
56 |
|
57 |
+
try:
|
58 |
+
# Load dataset
|
59 |
+
dataset = load_dataset(dataset_name)
|
60 |
+
|
61 |
+
# Extract the split we want to use (usually 'train')
|
62 |
+
if 'train' in dataset:
|
63 |
+
dataset = dataset['train']
|
64 |
+
|
65 |
+
# Get the dataset config
|
66 |
+
dataset_config = config.get("dataset_config", {})
|
67 |
+
sort_field = dataset_config.get("sort_by_field", "prompt_number")
|
68 |
+
sort_direction = dataset_config.get("sort_direction", "ascending")
|
69 |
+
|
70 |
+
# Sort the dataset by prompt_number
|
71 |
+
logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
|
72 |
+
if sort_direction == "ascending":
|
73 |
+
dataset = dataset.sort(sort_field)
|
74 |
+
else:
|
75 |
+
dataset = dataset.sort(sort_field, reverse=True)
|
76 |
+
|
77 |
+
# Add shuffle with fixed seed if specified
|
78 |
+
if "shuffle_seed" in dataset_config:
|
79 |
+
shuffle_seed = dataset_config.get("shuffle_seed")
|
80 |
+
logger.info(f"Shuffling dataset with seed {shuffle_seed}")
|
81 |
+
dataset = dataset.shuffle(seed=shuffle_seed)
|
82 |
+
|
83 |
+
logger.info(f"Dataset loaded with {len(dataset)} entries")
|
84 |
+
return dataset
|
85 |
|
86 |
+
except Exception as e:
|
87 |
+
logger.error(f"Error loading dataset: {str(e)}")
|
88 |
+
logger.info("Available datasets in the Hub:")
|
89 |
+
# Print a more helpful error message
|
90 |
+
print(f"Failed to load dataset: {dataset_name}")
|
91 |
+
print(f"Make sure the dataset exists and is accessible.")
|
92 |
+
print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
|
93 |
+
raise
|
94 |
|
95 |
# Data collator for pre-tokenized dataset
|
96 |
class PreTokenizedCollator(DataCollatorMixin):
|
|
|
155 |
|
156 |
def train(config_path, dataset_name, output_dir):
|
157 |
"""Main training function - RESEARCH TRAINING PHASE ONLY"""
|
158 |
+
# Load environment variables
|
159 |
load_dotenv()
|
160 |
config = load_config(config_path)
|
161 |
|
|
|
187 |
logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
|
188 |
logger.info("Configuration Summary:")
|
189 |
logger.info(f"Model: {model_config.get('model_name_or_path')}")
|
190 |
+
logger.info(f"Dataset: {dataset_name if dataset_name != 'phi4-cognitive-dataset' else DEFAULT_DATASET}")
|
191 |
logger.info(f"Output directory: {output_dir}")
|
192 |
logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
|
193 |
|