Marko Tasic committed on
Commit c2fc44c · 1 Parent(s): 9b182f7

out/pretrain-core/final

out/pretrain-core/final/hyperparameters.yaml ADDED
@@ -0,0 +1,76 @@
+ model_name: tangled-alpha-0.1-core
+ model_config:
+   name: tangled-alpha-0.1-core
+   hf_config: {}
+   block_size: 131072
+   n_layer: 32
+   n_embd: 512
+   vocab_size: 32064
+   padding_multiple: 512
+   padded_vocab_size: 32064
+   norm_class_name: RMSNorm
+   norm_eps: 1.0e-05
+   norm_qk: false
+   post_attention_norm: false
+   post_mlp_norm: false
+   parallel_residual: false
+   shared_attention_norm: false
+   n_head: 4
+   head_size: 128
+   n_query_groups: 4
+   attn_bias: false
+   rope_base: 500000
+   rotary_percentage: 1.0
+   rope_condense_ratio: 1
+   rope_adjustments:
+     factor: 32.0
+     low_freq_factor: 1.0
+     high_freq_factor: 4.0
+     original_max_seq_len: 8192
+   intermediate_size: 2688
+   bias: false
+   mlp_class_name: LLaMAMLP
+   gelu_approximate: none
+   n_expert: 0
+   n_expert_per_token: 0
+   scale_embeddings: false
+   lm_head_bias: false
+ out_dir: ../out/pretrain-core
+ precision: bf16-true
+ resume: auto
+ data:
+   class_path: litgpt.data.LitData
+   init_args:
+     data_path: ../core-data-0-8192-2000/
+     seed: 42
+     num_workers: 32
+ train:
+   save_interval: 100
+   log_interval: 1
+   global_batch_size: 512
+   micro_batch_size: 2
+   lr_warmup_steps: 200
+   max_tokens: 7318364160
+   max_seq_length: 8192
+   tie_embeddings: true
+   max_norm: 1.0
+   min_lr: 1.0e-05
+ eval:
+   interval: 50
+   max_iters: 100
+   initial_validation: false
+   final_validation: true
+   evaluate_example: first
+ optimizer:
+   class_path: grokadamw.GrokAdamW
+   init_args:
+     lr: 0.0001
+     weight_decay: 0.01
+     betas:
+     - 0.9
+     - 0.999
+ devices: auto
+ num_nodes: 1
+ tokenizer_dir: ..
+ logger_name: wandb
+ seed: 23
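
The train section above fixes the batch arithmetic of the run. Below is a minimal sketch of checking it, assuming the YAML is read from this directory and that, as in litgpt pretraining, one optimizer step consumes global_batch_size sequences of max_seq_length tokens; the script is illustrative, not part of the commit.

import yaml  # pip install pyyaml

with open("out/pretrain-core/final/hyperparameters.yaml") as f:
    hp = yaml.safe_load(f)

train = hp["train"]
# Gradient accumulation on a single device: 512 // 2 = 256 micro-batches per step.
grad_accum = train["global_batch_size"] // train["micro_batch_size"]
# Tokens consumed per optimizer step, assuming full 8192-token sequences:
# 512 * 8192 = 4,194,304.
tokens_per_step = train["global_batch_size"] * train["max_seq_length"]
# Optimizer steps implied by the ~7.32 B-token budget: about 1745.
total_steps = train["max_tokens"] / tokens_per_step

print(f"grad accumulation (1 device): {grad_accum}")
print(f"tokens/step: {tokens_per_step:,}")
print(f"total steps: ~{total_steps:.0f}")
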
out/pretrain-core/final/lit_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be0538fc89f974444adc06478e996ec1649bc85720e177e8830eddbc286a1a20
+ size 1457331426
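
The binary artifacts in this commit (lit_model.pth here and tokenizer.json below) are committed as Git LFS pointers: three text lines carrying the spec version, the SHA-256 of the real blob, and its size in bytes. A downloaded copy can be checked against the pointer; a minimal sketch, where the local path is hypothetical:

import hashlib

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Stream the file, comparing its size and SHA-256 to the pointer's oid/size."""
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return size == expected_size and digest.hexdigest() == expected_oid

# Values copied from the pointer above; "lit_model.pth" stands in for wherever
# the blob was fetched.
ok = verify_lfs_pointer(
    "lit_model.pth",
    "be0538fc89f974444adc06478e996ec1649bc85720e177e8830eddbc286a1a20",
    1457331426,
)
print("checkpoint matches pointer" if ok else "mismatch: re-fetch the blob")
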
out/pretrain-core/final/model_config.yaml ADDED
@@ -0,0 +1,40 @@
+ attention_logit_softcapping: null
+ attention_scores_scalar: null
+ attn_bias: false
+ bias: false
+ block_size: 131072
+ final_logit_softcapping: null
+ gelu_approximate: none
+ head_size: 128
+ hf_config: {}
+ intermediate_size: 2688
+ lm_head_bias: false
+ mlp_class_name: LLaMAMLP
+ n_embd: 512
+ n_expert: 0
+ n_expert_per_token: 0
+ n_head: 4
+ n_layer: 32
+ n_query_groups: 4
+ name: tangled-alpha-0.1-core
+ norm_class_name: RMSNorm
+ norm_eps: 1.0e-05
+ norm_qk: false
+ padded_vocab_size: 32064
+ padding_multiple: 512
+ parallel_residual: false
+ post_attention_norm: false
+ post_mlp_norm: false
+ rope_adjustments:
+   factor: 32.0
+   high_freq_factor: 4.0
+   low_freq_factor: 1.0
+   original_max_seq_len: 8192
+ rope_base: 500000
+ rope_condense_ratio: 1
+ rotary_percentage: 1.0
+ scale_embeddings: false
+ shared_attention_norm: false
+ sliding_window_layer_placing: null
+ sliding_window_size: null
+ vocab_size: 32064
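
model_config.yaml is the flat litgpt Config dump for the same architecture declared in hyperparameters.yaml, with the unused sliding-window and soft-capping fields at their null defaults. A minimal sketch of rebuilding the module from it and counting parameters, assuming litgpt is installed; Config.from_file and GPT are litgpt's own APIs, the rest is illustrative:

import torch
from litgpt import GPT, Config  # pip install litgpt

config = Config.from_file("out/pretrain-core/final/model_config.yaml")

# Build on the meta device so no real weight memory is allocated.
with torch.device("meta"):
    model = GPT(config)

n_params = sum(p.numel() for p in model.parameters())
print(f"{config.name}: {n_params / 1e6:.1f} M parameters")
# 32 layers, 512-dim embeddings, 4 heads of size 128 in 4 query groups,
# and a 2688-wide LLaMAMLP.
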
out/pretrain-core/final/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83b2d408ebeae398f24964d4e7ce0c847cd7ff554519941355641c7d0f68b09b
+ size 1845893
out/pretrain-core/final/tokenizer_config.json ADDED
@@ -0,0 +1,194 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "32000": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "<|assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32002": {
+       "content": "<|placeholder1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32003": {
+       "content": "<|placeholder2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32004": {
+       "content": "<|placeholder3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32005": {
+       "content": "<|placeholder4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32006": {
+       "content": "<|system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32007": {
+       "content": "<|end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32008": {
+       "content": "<|placeholder5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32009": {
+       "content": "<|placeholder6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32010": {
+       "content": "<|user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32011": {
+       "content": "<tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32012": {
+       "content": "</tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32013": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32014": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32015": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32016": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32017": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32018": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>' + message['content'] + '<|end|>'}}{% elif message['role'] == 'user' %}{{'<|user|>' + message['content'] + '<|end|>'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + message['content'] + '<|end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "legacy": false,
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "left",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
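
The chat_template above renders Phi-3-style turns: each message becomes <|system|>/<|user|>/<|assistant|> followed by its content and <|end|>, and add_generation_prompt appends a bare <|assistant|>. A minimal sketch of exercising it through transformers, assuming the two tokenizer files are placed in a local directory so the fast tokenizer loads from tokenizer.json:

from transformers import AutoTokenizer  # pip install transformers

# Directory holding tokenizer.json and tokenizer_config.json from this commit.
tok = AutoTokenizer.from_pretrained("out/pretrain-core/final")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# -> <|system|>You are a helpful assistant.<|end|><|user|>Hello!<|end|><|assistant|>
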