George-API committed on
Commit
f1e4d0b
·
verified ·
1 Parent(s): c7c538f

Upload run_cloud_training.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_cloud_training.py +45 -28
run_cloud_training.py CHANGED
@@ -32,6 +32,9 @@ logging.basicConfig(
32
  )
33
  logger = logging.getLogger(__name__)
34
 
 
 
 
35
  def load_config(config_path):
36
  """Load the transformers config from JSON file"""
37
  logger.info(f"Loading config from {config_path}")
@@ -45,35 +48,49 @@ def load_and_prepare_dataset(dataset_name, config):
45
  Sort entries by prompt_number as required.
46
  NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
47
  """
 
 
 
 
48
  logger.info(f"Loading dataset: {dataset_name}")
49
 
50
- # Load dataset
51
- dataset = load_dataset(dataset_name)
52
-
53
- # Extract the split we want to use (usually 'train')
54
- if 'train' in dataset:
55
- dataset = dataset['train']
56
-
57
- # Get the dataset config
58
- dataset_config = config.get("dataset_config", {})
59
- sort_field = dataset_config.get("sort_by_field", "prompt_number")
60
- sort_direction = dataset_config.get("sort_direction", "ascending")
61
-
62
- # Sort the dataset by prompt_number
63
- logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
64
- if sort_direction == "ascending":
65
- dataset = dataset.sort(sort_field)
66
- else:
67
- dataset = dataset.sort(sort_field, reverse=True)
68
-
69
- # Add shuffle with fixed seed if specified
70
- if "shuffle_seed" in dataset_config:
71
- shuffle_seed = dataset_config.get("shuffle_seed")
72
- logger.info(f"Shuffling dataset with seed {shuffle_seed}")
73
- dataset = dataset.shuffle(seed=shuffle_seed)
 
 
 
 
74
 
75
- logger.info(f"Dataset loaded with {len(dataset)} entries")
76
- return dataset
 
 
 
 
 
 
77
 
78
  # Data collator for pre-tokenized dataset
79
  class PreTokenizedCollator(DataCollatorMixin):
@@ -138,7 +155,7 @@ def remove_training_marker():
138
 
139
  def train(config_path, dataset_name, output_dir):
140
  """Main training function - RESEARCH TRAINING PHASE ONLY"""
141
- # Load environment variables and configuration
142
  load_dotenv()
143
  config = load_config(config_path)
144
 
@@ -170,7 +187,7 @@ def train(config_path, dataset_name, output_dir):
170
  logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
171
  logger.info("Configuration Summary:")
172
  logger.info(f"Model: {model_config.get('model_name_or_path')}")
173
- logger.info(f"Dataset: {dataset_name}")
174
  logger.info(f"Output directory: {output_dir}")
175
  logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
176
 
 
32
  )
33
  logger = logging.getLogger(__name__)
34
 
35
+ # Default dataset path - use the correct path with username
36
+ DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
37
+
38
  def load_config(config_path):
39
  """Load the transformers config from JSON file"""
40
  logger.info(f"Loading config from {config_path}")
 
48
  Sort entries by prompt_number as required.
49
  NO TOKENIZATION - DATASET IS ALREADY TOKENIZED
50
  """
51
+ # Use the default dataset path if no specific path is provided
52
+ if dataset_name == "phi4-cognitive-dataset":
53
+ dataset_name = DEFAULT_DATASET
54
+
55
  logger.info(f"Loading dataset: {dataset_name}")
56
 
57
+ try:
58
+ # Load dataset
59
+ dataset = load_dataset(dataset_name)
60
+
61
+ # Extract the split we want to use (usually 'train')
62
+ if 'train' in dataset:
63
+ dataset = dataset['train']
64
+
65
+ # Get the dataset config
66
+ dataset_config = config.get("dataset_config", {})
67
+ sort_field = dataset_config.get("sort_by_field", "prompt_number")
68
+ sort_direction = dataset_config.get("sort_direction", "ascending")
69
+
70
+ # Sort the dataset by prompt_number
71
+ logger.info(f"Sorting dataset by {sort_field} in {sort_direction} order")
72
+ if sort_direction == "ascending":
73
+ dataset = dataset.sort(sort_field)
74
+ else:
75
+ dataset = dataset.sort(sort_field, reverse=True)
76
+
77
+ # Add shuffle with fixed seed if specified
78
+ if "shuffle_seed" in dataset_config:
79
+ shuffle_seed = dataset_config.get("shuffle_seed")
80
+ logger.info(f"Shuffling dataset with seed {shuffle_seed}")
81
+ dataset = dataset.shuffle(seed=shuffle_seed)
82
+
83
+ logger.info(f"Dataset loaded with {len(dataset)} entries")
84
+ return dataset
85
 
86
+ except Exception as e:
87
+ logger.error(f"Error loading dataset: {str(e)}")
88
+ logger.info("Available datasets in the Hub:")
89
+ # Print a more helpful error message
90
+ print(f"Failed to load dataset: {dataset_name}")
91
+ print(f"Make sure the dataset exists and is accessible.")
92
+ print(f"If it's a private dataset, ensure your HF_TOKEN has access to it.")
93
+ raise
94
 
95
  # Data collator for pre-tokenized dataset
96
  class PreTokenizedCollator(DataCollatorMixin):
 
155
 
156
  def train(config_path, dataset_name, output_dir):
157
  """Main training function - RESEARCH TRAINING PHASE ONLY"""
158
+ # Load environment variables
159
  load_dotenv()
160
  config = load_config(config_path)
161
 
 
187
  logger.info("RESEARCH TRAINING PHASE ACTIVE - No output generation")
188
  logger.info("Configuration Summary:")
189
  logger.info(f"Model: {model_config.get('model_name_or_path')}")
190
+ logger.info(f"Dataset: {dataset_name if dataset_name != 'phi4-cognitive-dataset' else DEFAULT_DATASET}")
191
  logger.info(f"Output directory: {output_dir}")
192
  logger.info("IMPORTANT: Using already 4-bit quantized model - not re-quantizing")
193