George-API committed · Commit 3da7418 · verified · 1 Parent(s): 1d4c4c4

Upload folder using huggingface_hub

dataset_config.json CHANGED
@@ -26,11 +26,12 @@
     }
   },
   "data_loading": {
-    "batch_size": 16,
+    "batch_size": 24,
     "shuffle": false,
     "drop_last": false,
-    "num_workers": 2,
-    "pin_memory": false
+    "num_workers": 8,
+    "pin_memory": true,
+    "prefetch_factor": 4
   },
   "validation": {
     "log_samples": 3,
hardware_config.json CHANGED
@@ -1,42 +1,49 @@
 {
-  "hardware_name": "2xA10G",
+  "hardware_name": "4xL4",
   "specs": {
-    "gpu_count": 2,
-    "gpu_type": "A10G",
+    "gpu_count": 4,
+    "gpu_type": "L4",
     "vram_per_gpu": 24,
-    "total_vram": 48,
-    "vcpu_count": 24,
-    "ram": 92
+    "total_vram": 96,
+    "vcpu_count": 48,
+    "ram": 186
   },
   "training_optimizations": {
-    "per_device_batch_size": 16,
-    "gradient_accumulation_steps": 4,
-    "effective_batch_size": 128,
+    "per_device_batch_size": 32,
+    "gradient_accumulation_steps": 2,
+    "effective_batch_size": 256,
     "memory_optimizations": {
       "use_gradient_checkpointing": true,
       "pin_memory": true,
-      "num_workers": 2
+      "num_workers": 8,
+      "use_flash_attention": true
     },
     "distributed_settings": {
       "device_map": "auto",
-      "ddp_find_unused_parameters": false
+      "ddp_find_unused_parameters": false,
+      "use_fsdp": true,
+      "fsdp_config": {
+        "sharding_strategy": "FULL_SHARD",
+        "mixed_precision": "BF16",
+        "activation_checkpointing": true
+      }
     }
   },
   "memory_breakdown": {
     "model_size": "~3.5GB (pre-quantized 4-bit)",
     "optimizer_states": "~1GB",
-    "batch_memory_per_gpu": "~2GB",
-    "peak_memory_estimate": "18-20GB",
-    "safe_headroom": "4-6GB"
+    "batch_memory_per_gpu": "~3GB",
+    "peak_memory_estimate": "~18GB",
+    "safe_headroom": "~6GB"
   },
-  "compute_environment": "A10G_CLOUD",
-  "distributed_type": "DATA_PARALLEL",
+  "compute_environment": "L4_CLOUD",
+  "distributed_type": "FSDP",
   "mixed_precision": "bf16",
-  "num_gpus": 2,
+  "num_gpus": 4,
   "training_parameters": {
-    "per_device_train_batch_size": 16,
-    "gradient_accumulation_steps": 4,
-    "dataloader_num_workers": 2,
+    "per_device_train_batch_size": 32,
+    "gradient_accumulation_steps": 2,
+    "dataloader_num_workers": 8,
     "dataloader_pin_memory": true,
     "gradient_checkpointing": true,
     "max_grad_norm": 1.0
transformers_config.json CHANGED
@@ -13,9 +13,9 @@
   },
 
   "training": {
-    "per_device_train_batch_size": 16,
-    "gradient_accumulation_steps": 4,
-    "learning_rate": 2e-5,
+    "per_device_train_batch_size": 24,
+    "gradient_accumulation_steps": 2,
+    "learning_rate": 3e-5,
     "num_train_epochs": 3,
     "max_steps": -1,
     "logging_steps": 10,
@@ -26,7 +26,7 @@
     "gradient_checkpointing": true,
     "optim": "adamw_torch",
     "lr_scheduler_type": "cosine",
-    "warmup_ratio": 0.03,
+    "warmup_ratio": 0.05,
     "weight_decay": 0.01,
     "max_grad_norm": 1.0,
     "neftune_noise_alpha": 5
@@ -56,6 +56,18 @@
     ]
   },
 
+  "distributed_training": {
+    "fsdp_config": {
+      "enabled": true,
+      "sharding_strategy": "FULL_SHARD",
+      "mixed_precision": "BF16",
+      "activation_checkpointing": true,
+      "offload_params": false
+    },
+    "ddp_find_unused_parameters": false,
+    "dataloader_num_workers": 8
+  },
+
   "logging": {
     "logging_steps": 50,
     "log_level": "info"
update_space.py CHANGED
@@ -31,7 +31,12 @@ def load_env_variables():
         from dotenv import load_dotenv
         env_path = Path(__file__).parent / ".env"
         if env_path.exists():
-            load_dotenv(env_path)
+            # Load and explicitly set environment variables
+            with open(env_path) as f:
+                for line in f:
+                    if line.strip() and not line.startswith('#'):
+                        key, value = line.strip().split('=', 1)
+                        os.environ[key] = value.strip()
             logger.info(f"Loaded environment variables from {env_path}")
         else:
             logger.warning(f"No .env file found at {env_path}")
@@ -53,10 +58,15 @@ def load_env_variables():
         "HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
     }
 
+    # Ensure the space name is set correctly
+    if "HF_SPACE_NAME" not in os.environ:
+        os.environ["HF_SPACE_NAME"] = "phi4training"
+
     missing_vars = [k for k, v in required_vars.items() if not v]
     if missing_vars:
         raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
 
+    logger.info(f"Using environment variables: USERNAME={required_vars['HF_USERNAME']}, SPACE_NAME={required_vars['HF_SPACE_NAME']}")
     return required_vars
 
 def verify_configs():
@@ -138,7 +148,7 @@ def create_space(username, space_name):
     # Create new space
     try:
         api.create_repo(
-            repo_id=space_name,
+            repo_id=space_id,
             private=False,
             repo_type="space",
             space_sdk="gradio"
@@ -181,8 +191,8 @@ def main():
     update_requirements()
     logger.info("Requirements updated successfully")
 
-    # Get space name
-    space_name = args.space_name or env_vars["HF_SPACE_NAME"]
+    # Get space name from args or env, prioritize args
+    space_name = args.space_name if args.space_name else env_vars["HF_SPACE_NAME"]
     logger.info(f"Using space name: {space_name}")
 
     # Login to Hugging Face
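
A short usage sketch of the pattern the repo_id change points at: passing the fully qualified "username/space_name" id to create_repo rather than the bare space name, so the target namespace is explicit. All values below are placeholders, not from this repo.

# Sketch only: creating the Space with a fully qualified repo id.
from huggingface_hub import HfApi

username = "example-user"        # placeholder, normally read from HF_USERNAME
space_name = "phi4training"      # default HF_SPACE_NAME in load_env_variables()
space_id = f"{username}/{space_name}"

api = HfApi()
api.create_repo(
    repo_id=space_id,
    private=False,
    repo_type="space",
    space_sdk="gradio",
    exist_ok=True,               # don't fail if the Space already exists
)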