Teknium
committed on
Fix Deepspeed Zero3 Config (#791)
Browse files
* Update zero3.json
Remove CPU offload by default (it slows training down horribly; reducing the batch size is the better option), and change the LR scheduler to one that properly decays.
* Update zero3.json
fix something
- deepspeed/zero3.json +3 -10
deepspeed/zero3.json
CHANGED
|
@@ -1,14 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"zero_optimization": {
|
| 3 |
"stage": 3,
|
| 4 |
-
"offload_optimizer": {
|
| 5 |
-
"device": "cpu",
|
| 6 |
-
"pin_memory": true
|
| 7 |
-
},
|
| 8 |
-
"offload_param": {
|
| 9 |
-
"device": "cpu",
|
| 10 |
-
"pin_memory": true
|
| 11 |
-
},
|
| 12 |
"overlap_comm": true,
|
| 13 |
"contiguous_gradients": true,
|
| 14 |
"sub_group_size": 0,
|
|
@@ -41,12 +33,13 @@
|
|
| 41 |
}
|
| 42 |
},
|
| 43 |
"scheduler": {
|
| 44 |
-
"type": "WarmupLR",
|
| 45 |
"params": {
|
| 46 |
"warmup_min_lr": "auto",
|
| 47 |
"warmup_max_lr": "auto",
|
| 48 |
"warmup_num_steps": "auto",
|
| 49 |
-
"warmup_type": "linear"
|
|
|
|
| 50 |
}
|
| 51 |
},
|
| 52 |
"gradient_accumulation_steps": "auto",
|
|
|
|
| 1 |
{
|
| 2 |
"zero_optimization": {
|
| 3 |
"stage": 3,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"overlap_comm": true,
|
| 5 |
"contiguous_gradients": true,
|
| 6 |
"sub_group_size": 0,
|
|
|
|
| 33 |
}
|
| 34 |
},
|
| 35 |
"scheduler": {
|
| 36 |
+
"type": "WarmupDecayLR",
|
| 37 |
"params": {
|
| 38 |
"warmup_min_lr": "auto",
|
| 39 |
"warmup_max_lr": "auto",
|
| 40 |
"warmup_num_steps": "auto",
|
| 41 |
+
"warmup_type": "linear",
|
| 42 |
+
"total_num_steps": "auto"
|
| 43 |
}
|
| 44 |
},
|
| 45 |
"gradient_accumulation_steps": "auto",
|