Upload folder using huggingface_hub
Files changed:
- dataset_config.json  +4 -3
- hardware_config.json  +27 -20
- transformers_config.json  +16 -4
- update_space.py  +14 -4
dataset_config.json
CHANGED

@@ -26,11 +26,12 @@
     }
   },
   "data_loading": {
-    "batch_size":
+    "batch_size": 24,
     "shuffle": false,
     "drop_last": false,
-    "num_workers":
-    "pin_memory":
+    "num_workers": 8,
+    "pin_memory": true,
+    "prefetch_factor": 4
   },
   "validation": {
     "log_samples": 3,
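For context, a minimal sketch of how the updated data_loading block could feed a PyTorch DataLoader; the placeholder dataset and the assumption that "data_loading" sits one level below the JSON root are illustrative, not part of this commit:

import json
import torch
from torch.utils.data import DataLoader, TensorDataset

# Placeholder dataset; the real training dataset is built elsewhere in the project.
train_dataset = TensorDataset(torch.zeros(256, 8))

with open("dataset_config.json") as f:
    cfg = json.load(f)
data_cfg = cfg["data_loading"]  # assumed nesting; adjust if the key sits deeper

loader = DataLoader(
    train_dataset,
    batch_size=data_cfg["batch_size"],            # 24
    shuffle=data_cfg["shuffle"],                  # false: keep sample order
    drop_last=data_cfg["drop_last"],              # false
    num_workers=data_cfg["num_workers"],          # 8 worker processes
    pin_memory=data_cfg["pin_memory"],            # true: faster host-to-GPU copies
    prefetch_factor=data_cfg["prefetch_factor"],  # 4 batches prefetched per worker
)

Note that prefetch_factor only takes effect when num_workers > 0, which holds here.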
hardware_config.json
CHANGED

@@ -1,42 +1,49 @@
 {
-  "hardware_name": "
+  "hardware_name": "4xL4",
   "specs": {
-    "gpu_count":
-    "gpu_type": "
+    "gpu_count": 4,
+    "gpu_type": "L4",
     "vram_per_gpu": 24,
-    "total_vram":
-    "vcpu_count":
-    "ram":
+    "total_vram": 96,
+    "vcpu_count": 48,
+    "ram": 186
   },
   "training_optimizations": {
-    "per_device_batch_size":
-    "gradient_accumulation_steps":
-    "effective_batch_size":
+    "per_device_batch_size": 32,
+    "gradient_accumulation_steps": 2,
+    "effective_batch_size": 256,
     "memory_optimizations": {
       "use_gradient_checkpointing": true,
       "pin_memory": true,
-      "num_workers":
+      "num_workers": 8,
+      "use_flash_attention": true
     },
     "distributed_settings": {
       "device_map": "auto",
-      "ddp_find_unused_parameters": false
+      "ddp_find_unused_parameters": false,
+      "use_fsdp": true,
+      "fsdp_config": {
+        "sharding_strategy": "FULL_SHARD",
+        "mixed_precision": "BF16",
+        "activation_checkpointing": true
+      }
     }
   },
   "memory_breakdown": {
     "model_size": "~3.5GB (pre-quantized 4-bit)",
     "optimizer_states": "~1GB",
-    "batch_memory_per_gpu": "~
-    "peak_memory_estimate": "
-    "safe_headroom": "
+    "batch_memory_per_gpu": "~3GB",
+    "peak_memory_estimate": "~18GB",
+    "safe_headroom": "~6GB"
   },
-  "compute_environment": "
-  "distributed_type": "
+  "compute_environment": "L4_CLOUD",
+  "distributed_type": "FSDP",
   "mixed_precision": "bf16",
-  "num_gpus":
+  "num_gpus": 4,
   "training_parameters": {
-    "per_device_train_batch_size":
-    "gradient_accumulation_steps":
-    "dataloader_num_workers":
+    "per_device_train_batch_size": 32,
+    "gradient_accumulation_steps": 2,
+    "dataloader_num_workers": 8,
     "dataloader_pin_memory": true,
     "gradient_checkpointing": true,
     "max_grad_norm": 1.0
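As a rough illustration, the training_parameters and FSDP settings above could translate into Hugging Face TrainingArguments along these lines; output_dir and the exact FSDP wiring are assumptions rather than the project's actual launch code:

from transformers import TrainingArguments

# Values mirror hardware_config.json; "./output" is an assumed directory.
args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=32,   # training_parameters.per_device_train_batch_size
    gradient_accumulation_steps=2,    # training_parameters.gradient_accumulation_steps
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    bf16=True,                        # mixed_precision: "bf16"
    fsdp="full_shard",                # sharding_strategy: FULL_SHARD
)

With 4 GPUs, a per-device batch size of 32, and 2 accumulation steps, the effective batch size works out to 32 x 4 x 2 = 256, matching effective_batch_size in the config.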
transformers_config.json
CHANGED

@@ -13,9 +13,9 @@
   },

   "training": {
-    "per_device_train_batch_size":
-    "gradient_accumulation_steps":
-    "learning_rate":
+    "per_device_train_batch_size": 24,
+    "gradient_accumulation_steps": 2,
+    "learning_rate": 3e-5,
     "num_train_epochs": 3,
     "max_steps": -1,
     "logging_steps": 10,

@@ -26,7 +26,7 @@
     "gradient_checkpointing": true,
     "optim": "adamw_torch",
     "lr_scheduler_type": "cosine",
-    "warmup_ratio": 0.
+    "warmup_ratio": 0.05,
     "weight_decay": 0.01,
     "max_grad_norm": 1.0,
     "neftune_noise_alpha": 5

@@ -56,6 +56,18 @@
     ]
   },

+  "distributed_training": {
+    "fsdp_config": {
+      "enabled": true,
+      "sharding_strategy": "FULL_SHARD",
+      "mixed_precision": "BF16",
+      "activation_checkpointing": true,
+      "offload_params": false
+    },
+    "ddp_find_unused_parameters": false,
+    "dataloader_num_workers": 8
+  },
+
   "logging": {
     "logging_steps": 50,
     "log_level": "info"
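For reference, a minimal sketch of turning the training block above into TrainingArguments; output_dir is assumed, and neftune_noise_alpha needs a transformers release recent enough to expose it:

import json
from transformers import TrainingArguments

with open("transformers_config.json") as f:
    train_cfg = json.load(f)["training"]  # assumes a top-level "training" block

args = TrainingArguments(
    output_dir="./output",                                                  # assumed, not in the config
    per_device_train_batch_size=train_cfg["per_device_train_batch_size"],  # 24
    gradient_accumulation_steps=train_cfg["gradient_accumulation_steps"],  # 2
    learning_rate=train_cfg["learning_rate"],                              # 3e-5
    num_train_epochs=train_cfg["num_train_epochs"],                        # 3
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    neftune_noise_alpha=5,
    gradient_checkpointing=True,
    logging_steps=train_cfg["logging_steps"],                              # 10
)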
update_space.py
CHANGED

@@ -31,7 +31,12 @@ def load_env_variables():
        from dotenv import load_dotenv
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
-
+            # Load and explicitly set environment variables
+            with open(env_path) as f:
+                for line in f:
+                    if line.strip() and not line.startswith('#'):
+                        key, value = line.strip().split('=', 1)
+                        os.environ[key] = value.strip()
            logger.info(f"Loaded environment variables from {env_path}")
        else:
            logger.warning(f"No .env file found at {env_path}")

@@ -53,10 +58,15 @@ def load_env_variables():
        "HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
    }

+    # Ensure the space name is set correctly
+    if "HF_SPACE_NAME" not in os.environ:
+        os.environ["HF_SPACE_NAME"] = "phi4training"
+
    missing_vars = [k for k, v in required_vars.items() if not v]
    if missing_vars:
        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")

+    logger.info(f"Using environment variables: USERNAME={required_vars['HF_USERNAME']}, SPACE_NAME={required_vars['HF_SPACE_NAME']}")
    return required_vars

 def verify_configs():

@@ -138,7 +148,7 @@ def create_space(username, space_name):
        # Create new space
        try:
            api.create_repo(
-                repo_id=
+                repo_id=space_id,
                private=False,
                repo_type="space",
                space_sdk="gradio"

@@ -181,8 +191,8 @@ def main():
    update_requirements()
    logger.info("Requirements updated successfully")

-    # Get space name
-    space_name = args.space_name
+    # Get space name from args or env, prioritize args
+    space_name = args.space_name if args.space_name else env_vars["HF_SPACE_NAME"]
    logger.info(f"Using space name: {space_name}")

    # Login to Hugging Face
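In line with the commit title, a minimal sketch of how the Space could then be created and the project folder uploaded with huggingface_hub; the token, username, and folder path are placeholders, not values from this commit:

from huggingface_hub import HfApi

api = HfApi(token="hf_...")              # placeholder token, normally read from .env
space_id = "your-username/phi4training"  # placeholder; built from HF_USERNAME/HF_SPACE_NAME

# Mirrors the create_repo call in the diff; exist_ok avoids failing if the Space already exists.
api.create_repo(
    repo_id=space_id,
    private=False,
    repo_type="space",
    space_sdk="gradio",
    exist_ok=True,
)

# Upload the local project folder to the Space.
api.upload_folder(
    folder_path=".",                     # assumed local folder
    repo_id=space_id,
    repo_type="space",
)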