piyushgrover committed
Commit 373955f · verified · 1 Parent(s): e164375

Upload config.yaml

Files changed (1)
  1. config.yaml +99 -0
config.yaml ADDED
@@ -0,0 +1,99 @@
+ checkpoints:
+   checkpoint_interval: 500
+   checkpoints_path: checkpoints
+   checkpoints_path_is_shared_file_system: false
+   resume_checkpoint_path: null
+   save_final_state: false
+   save_initial_state: false
+ data_stages:
+ - data:
+     dataset:
+       dataset_folder:
+       - datasets/smollm2-corpus
+       dataset_weights:
+       - 1.0
+     num_loading_workers: 0
+     seed: 8
+   name: stable phase
+   start_training_step: 1
+ general:
+   benchmark_csv_path: null
+   consumed_train_samples: null
+   ignore_sanity_checks: true
+   project: smollm2
+   run: smollm2-135M
+   seed: 8
+   step: null
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.041666666666666664
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 0
+     eos_token_id: 0
+     hidden_act: silu
+     hidden_size: 576
+     initializer_range: 0.041666666666666664
+     intermediate_size: 1536
+     is_llama_config: true
+     max_position_embeddings: 2048
+     num_attention_heads: 9
+     num_hidden_layers: 30
+     num_key_value_heads: 3
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_interleaved: false
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 49152
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.003
+     lr_decay_starting_step: 1600000
+     lr_decay_steps: 400000
+     lr_decay_style: linear
+     lr_warmup_steps: 2000
+     lr_warmup_style: linear
+     min_decay_lr: 0
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 0
+ parallelism:
+   dp: 64
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   recompute_layer: false
+   tp: 1
+   tp_linear_async_communication: true
+   tp_mode: REDUCE_SCATTER
+   tp_recompute_allgather: true
+ profiler: null
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
+   tokenizer_revision: null
+ tokens:
+   batch_accumulation_per_replica: 1
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 8
+   sequence_length: 2048
+   train_steps: 5000
+   val_check_interval: 500
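
The numbers in this config are easiest to trust once the quantities they imply are recomputed. Below is a minimal sanity-check sketch in plain Python (not nanotron code; every name is copied from the YAML above, and the two derivations used, scaled initialization and tokens-per-step arithmetic, are standard conventions stated here as assumptions):

import math

# Values copied verbatim from config.yaml
hidden_size = 576
num_attention_heads = 9
num_key_value_heads = 3
dp = 64                          # data-parallel replicas
micro_batch_size = 8
batch_accumulation_per_replica = 1
sequence_length = 2048
train_steps = 5000

# init_method.std matches 1/sqrt(hidden_size), the usual scaled initialization
assert math.isclose(1 / math.sqrt(hidden_size), 0.041666666666666664)

# Per-head dimension: 576 / 9 = 64; GQA groups 3 query heads per KV head
head_dim = hidden_size // num_attention_heads                 # 64
queries_per_kv = num_attention_heads // num_key_value_heads   # 3

# Effective global batch: dp * micro-batch * gradient accumulation
global_batch = dp * micro_batch_size * batch_accumulation_per_replica  # 512 sequences
tokens_per_step = global_batch * sequence_length      # 1,048,576 (~1M tokens per step)
total_tokens = tokens_per_step * train_steps          # 5,242,880,000 (~5.2B tokens)

print(head_dim, queries_per_kv, global_batch, tokens_per_step, total_tokens)

One scheduler detail worth noticing: lr_decay_starting_step (1,600,000) lies far beyond train_steps (5,000), so within this run the learning rate warms up linearly over the first 2,000 steps to 3e-3 and then stays flat; the configured linear decay phase is never reached.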