Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- dataset_config.json +4 -3
- hardware_config.json +27 -20
- transformers_config.json +16 -4
- update_space.py +14 -4
dataset_config.json
CHANGED
@@ -26,11 +26,12 @@
|
|
26 |
}
|
27 |
},
|
28 |
"data_loading": {
|
29 |
-
"batch_size":
|
30 |
"shuffle": false,
|
31 |
"drop_last": false,
|
32 |
-
"num_workers":
|
33 |
-
"pin_memory":
|
|
|
34 |
},
|
35 |
"validation": {
|
36 |
"log_samples": 3,
|
|
|
26 |
}
|
27 |
},
|
28 |
"data_loading": {
|
29 |
+
"batch_size": 24,
|
30 |
"shuffle": false,
|
31 |
"drop_last": false,
|
32 |
+
"num_workers": 8,
|
33 |
+
"pin_memory": true,
|
34 |
+
"prefetch_factor": 4
|
35 |
},
|
36 |
"validation": {
|
37 |
"log_samples": 3,
|
hardware_config.json
CHANGED
@@ -1,42 +1,49 @@
|
|
1 |
{
|
2 |
-
"hardware_name": "
|
3 |
"specs": {
|
4 |
-
"gpu_count":
|
5 |
-
"gpu_type": "
|
6 |
"vram_per_gpu": 24,
|
7 |
-
"total_vram":
|
8 |
-
"vcpu_count":
|
9 |
-
"ram":
|
10 |
},
|
11 |
"training_optimizations": {
|
12 |
-
"per_device_batch_size":
|
13 |
-
"gradient_accumulation_steps":
|
14 |
-
"effective_batch_size":
|
15 |
"memory_optimizations": {
|
16 |
"use_gradient_checkpointing": true,
|
17 |
"pin_memory": true,
|
18 |
-
"num_workers":
|
|
|
19 |
},
|
20 |
"distributed_settings": {
|
21 |
"device_map": "auto",
|
22 |
-
"ddp_find_unused_parameters": false
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
}
|
24 |
},
|
25 |
"memory_breakdown": {
|
26 |
"model_size": "~3.5GB (pre-quantized 4-bit)",
|
27 |
"optimizer_states": "~1GB",
|
28 |
-
"batch_memory_per_gpu": "~
|
29 |
-
"peak_memory_estimate": "
|
30 |
-
"safe_headroom": "
|
31 |
},
|
32 |
-
"compute_environment": "
|
33 |
-
"distributed_type": "
|
34 |
"mixed_precision": "bf16",
|
35 |
-
"num_gpus":
|
36 |
"training_parameters": {
|
37 |
-
"per_device_train_batch_size":
|
38 |
-
"gradient_accumulation_steps":
|
39 |
-
"dataloader_num_workers":
|
40 |
"dataloader_pin_memory": true,
|
41 |
"gradient_checkpointing": true,
|
42 |
"max_grad_norm": 1.0
|
|
|
1 |
{
|
2 |
+
"hardware_name": "4xL4",
|
3 |
"specs": {
|
4 |
+
"gpu_count": 4,
|
5 |
+
"gpu_type": "L4",
|
6 |
"vram_per_gpu": 24,
|
7 |
+
"total_vram": 96,
|
8 |
+
"vcpu_count": 48,
|
9 |
+
"ram": 186
|
10 |
},
|
11 |
"training_optimizations": {
|
12 |
+
"per_device_batch_size": 32,
|
13 |
+
"gradient_accumulation_steps": 2,
|
14 |
+
"effective_batch_size": 256,
|
15 |
"memory_optimizations": {
|
16 |
"use_gradient_checkpointing": true,
|
17 |
"pin_memory": true,
|
18 |
+
"num_workers": 8,
|
19 |
+
"use_flash_attention": true
|
20 |
},
|
21 |
"distributed_settings": {
|
22 |
"device_map": "auto",
|
23 |
+
"ddp_find_unused_parameters": false,
|
24 |
+
"use_fsdp": true,
|
25 |
+
"fsdp_config": {
|
26 |
+
"sharding_strategy": "FULL_SHARD",
|
27 |
+
"mixed_precision": "BF16",
|
28 |
+
"activation_checkpointing": true
|
29 |
+
}
|
30 |
}
|
31 |
},
|
32 |
"memory_breakdown": {
|
33 |
"model_size": "~3.5GB (pre-quantized 4-bit)",
|
34 |
"optimizer_states": "~1GB",
|
35 |
+
"batch_memory_per_gpu": "~3GB",
|
36 |
+
"peak_memory_estimate": "~18GB",
|
37 |
+
"safe_headroom": "~6GB"
|
38 |
},
|
39 |
+
"compute_environment": "L4_CLOUD",
|
40 |
+
"distributed_type": "FSDP",
|
41 |
"mixed_precision": "bf16",
|
42 |
+
"num_gpus": 4,
|
43 |
"training_parameters": {
|
44 |
+
"per_device_train_batch_size": 32,
|
45 |
+
"gradient_accumulation_steps": 2,
|
46 |
+
"dataloader_num_workers": 8,
|
47 |
"dataloader_pin_memory": true,
|
48 |
"gradient_checkpointing": true,
|
49 |
"max_grad_norm": 1.0
|
transformers_config.json
CHANGED
@@ -13,9 +13,9 @@
|
|
13 |
},
|
14 |
|
15 |
"training": {
|
16 |
-
"per_device_train_batch_size":
|
17 |
-
"gradient_accumulation_steps":
|
18 |
-
"learning_rate":
|
19 |
"num_train_epochs": 3,
|
20 |
"max_steps": -1,
|
21 |
"logging_steps": 10,
|
@@ -26,7 +26,7 @@
|
|
26 |
"gradient_checkpointing": true,
|
27 |
"optim": "adamw_torch",
|
28 |
"lr_scheduler_type": "cosine",
|
29 |
-
"warmup_ratio": 0.
|
30 |
"weight_decay": 0.01,
|
31 |
"max_grad_norm": 1.0,
|
32 |
"neftune_noise_alpha": 5
|
@@ -56,6 +56,18 @@
|
|
56 |
]
|
57 |
},
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
"logging": {
|
60 |
"logging_steps": 50,
|
61 |
"log_level": "info"
|
|
|
13 |
},
|
14 |
|
15 |
"training": {
|
16 |
+
"per_device_train_batch_size": 24,
|
17 |
+
"gradient_accumulation_steps": 2,
|
18 |
+
"learning_rate": 3e-5,
|
19 |
"num_train_epochs": 3,
|
20 |
"max_steps": -1,
|
21 |
"logging_steps": 10,
|
|
|
26 |
"gradient_checkpointing": true,
|
27 |
"optim": "adamw_torch",
|
28 |
"lr_scheduler_type": "cosine",
|
29 |
+
"warmup_ratio": 0.05,
|
30 |
"weight_decay": 0.01,
|
31 |
"max_grad_norm": 1.0,
|
32 |
"neftune_noise_alpha": 5
|
|
|
56 |
]
|
57 |
},
|
58 |
|
59 |
+
"distributed_training": {
|
60 |
+
"fsdp_config": {
|
61 |
+
"enabled": true,
|
62 |
+
"sharding_strategy": "FULL_SHARD",
|
63 |
+
"mixed_precision": "BF16",
|
64 |
+
"activation_checkpointing": true,
|
65 |
+
"offload_params": false
|
66 |
+
},
|
67 |
+
"ddp_find_unused_parameters": false,
|
68 |
+
"dataloader_num_workers": 8
|
69 |
+
},
|
70 |
+
|
71 |
"logging": {
|
72 |
"logging_steps": 50,
|
73 |
"log_level": "info"
|
update_space.py
CHANGED
@@ -31,7 +31,12 @@ def load_env_variables():
|
|
31 |
from dotenv import load_dotenv
|
32 |
env_path = Path(__file__).parent / ".env"
|
33 |
if env_path.exists():
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
35 |
logger.info(f"Loaded environment variables from {env_path}")
|
36 |
else:
|
37 |
logger.warning(f"No .env file found at {env_path}")
|
@@ -53,10 +58,15 @@ def load_env_variables():
|
|
53 |
"HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
|
54 |
}
|
55 |
|
|
|
|
|
|
|
|
|
56 |
missing_vars = [k for k, v in required_vars.items() if not v]
|
57 |
if missing_vars:
|
58 |
raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
|
59 |
|
|
|
60 |
return required_vars
|
61 |
|
62 |
def verify_configs():
|
@@ -138,7 +148,7 @@ def create_space(username, space_name):
|
|
138 |
# Create new space
|
139 |
try:
|
140 |
api.create_repo(
|
141 |
-
repo_id=
|
142 |
private=False,
|
143 |
repo_type="space",
|
144 |
space_sdk="gradio"
|
@@ -181,8 +191,8 @@ def main():
|
|
181 |
update_requirements()
|
182 |
logger.info("Requirements updated successfully")
|
183 |
|
184 |
-
# Get space name
|
185 |
-
space_name = args.space_name
|
186 |
logger.info(f"Using space name: {space_name}")
|
187 |
|
188 |
# Login to Hugging Face
|
|
|
31 |
from dotenv import load_dotenv
|
32 |
env_path = Path(__file__).parent / ".env"
|
33 |
if env_path.exists():
|
34 |
+
# Load and explicitly set environment variables
|
35 |
+
with open(env_path) as f:
|
36 |
+
for line in f:
|
37 |
+
if line.strip() and not line.startswith('#'):
|
38 |
+
key, value = line.strip().split('=', 1)
|
39 |
+
os.environ[key] = value.strip()
|
40 |
logger.info(f"Loaded environment variables from {env_path}")
|
41 |
else:
|
42 |
logger.warning(f"No .env file found at {env_path}")
|
|
|
58 |
"HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
|
59 |
}
|
60 |
|
61 |
+
# Ensure the space name is set correctly
|
62 |
+
if "HF_SPACE_NAME" not in os.environ:
|
63 |
+
os.environ["HF_SPACE_NAME"] = "phi4training"
|
64 |
+
|
65 |
missing_vars = [k for k, v in required_vars.items() if not v]
|
66 |
if missing_vars:
|
67 |
raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
|
68 |
|
69 |
+
logger.info(f"Using environment variables: USERNAME={required_vars['HF_USERNAME']}, SPACE_NAME={required_vars['HF_SPACE_NAME']}")
|
70 |
return required_vars
|
71 |
|
72 |
def verify_configs():
|
|
|
148 |
# Create new space
|
149 |
try:
|
150 |
api.create_repo(
|
151 |
+
repo_id=space_id,
|
152 |
private=False,
|
153 |
repo_type="space",
|
154 |
space_sdk="gradio"
|
|
|
191 |
update_requirements()
|
192 |
logger.info("Requirements updated successfully")
|
193 |
|
194 |
+
# Get space name from args or env, prioritize args
|
195 |
+
space_name = args.space_name if args.space_name else env_vars["HF_SPACE_NAME"]
|
196 |
logger.info(f"Using space name: {space_name}")
|
197 |
|
198 |
# Login to Hugging Face
|