SystemAdmin123 commited on
Commit
3d5f23b
·
verified ·
1 Parent(s): d11fbdc

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/added_tokens.json CHANGED
@@ -1,5 +1,13 @@
1
  {
2
- "<|endoftext|>": 151643,
3
- "<|im_end|>": 151645,
4
- "<|im_start|>": 151644
 
 
 
 
 
 
 
 
5
  }
 
1
  {
2
+ "<|assistant|>": 32001,
3
+ "<|endoftext|>": 32000,
4
+ "<|end|>": 32007,
5
+ "<|placeholder1|>": 32002,
6
+ "<|placeholder2|>": 32003,
7
+ "<|placeholder3|>": 32004,
8
+ "<|placeholder4|>": 32005,
9
+ "<|placeholder5|>": 32008,
10
+ "<|placeholder6|>": 32009,
11
+ "<|system|>": 32006,
12
+ "<|user|>": 32010
13
  }
last-checkpoint/config.json CHANGED
@@ -1,32 +1,31 @@
1
  {
2
- "_name_or_path": "EleutherAI/pythia-70m-deduped",
3
  "architectures": [
4
- "GPTNeoXForCausalLM"
5
  ],
6
- "attention_bias": true,
7
  "attention_dropout": 0.0,
8
- "bos_token_id": 0,
9
- "classifier_dropout": 0.1,
10
- "eos_token_id": 0,
11
- "hidden_act": "gelu",
12
- "hidden_dropout": 0.0,
13
- "hidden_size": 512,
14
  "initializer_range": 0.02,
15
- "intermediate_size": 2048,
16
- "layer_norm_eps": 1e-05,
17
- "max_position_embeddings": 2048,
18
- "model_type": "gpt_neox",
19
- "num_attention_heads": 8,
20
- "num_hidden_layers": 6,
21
- "partial_rotary_factor": 0.25,
 
 
 
22
  "rope_scaling": null,
23
- "rope_theta": 10000,
24
- "rotary_emb_base": 10000,
25
- "rotary_pct": 0.25,
26
  "tie_word_embeddings": false,
27
  "torch_dtype": "bfloat16",
28
  "transformers_version": "4.48.1",
29
  "use_cache": false,
30
- "use_parallel_residual": true,
31
- "vocab_size": 50278
32
  }
 
1
  {
2
+ "_name_or_path": "Xenova/tiny-random-Phi3ForCausalLM",
3
  "architectures": [
4
+ "Phi3ForCausalLM"
5
  ],
 
6
  "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "embd_pdrop": 0.0,
9
+ "eos_token_id": 32000,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 32,
 
12
  "initializer_range": 0.02,
13
+ "intermediate_size": 64,
14
+ "max_position_embeddings": 4096,
15
+ "model_type": "phi3",
16
+ "num_attention_heads": 4,
17
+ "num_hidden_layers": 2,
18
+ "num_key_value_heads": 4,
19
+ "original_max_position_embeddings": 4096,
20
+ "pad_token_id": 32000,
21
+ "resid_pdrop": 0.0,
22
+ "rms_norm_eps": 1e-05,
23
  "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "sliding_window": 2047,
 
26
  "tie_word_embeddings": false,
27
  "torch_dtype": "bfloat16",
28
  "transformers_version": "4.48.1",
29
  "use_cache": false,
30
+ "vocab_size": 32011
 
31
  }
last-checkpoint/generation_config.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "_from_model_config": true,
3
- "bos_token_id": 0,
4
  "do_sample": true,
5
- "eos_token_id": 0,
 
6
  "transformers_version": "4.48.1"
7
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "bos_token_id": 1,
4
  "do_sample": true,
5
+ "eos_token_id": 32000,
6
+ "pad_token_id": 32000,
7
  "transformers_version": "4.48.1"
8
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b66b2fbb13b44a406572091d0b2ac5937222c2e44b31cf15bb807dc45c240c8b
3
- size 140808752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9887d0a75dcfd98e56a4c42489abac39f5f9adfc1e32e149aaacb6e76bd402cd
3
+ size 4140280
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cad578cc347021d7e8a9333a51bfcf0ee3486a63a50413fba892f9c8b083dbc
3
- size 143306874
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069af421bb650b702a640dd7555d4a79b1545bb9de69c737913eb28166e68b67
3
+ size 4291766
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97474a95cf2d0e6166f036d8937e33ebebb2adb23cf1177f88edc10dc549c905
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9feae33b2fec0a6229240e7adaee6ecc8f5cfdf1a8bd0e827b1d8a241424e3c0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf1f53caaa12767db3c6df563992bbf88f4b84dc57ec5080b22deb9c2c56ec6e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a673aaf85c0fe6b6c29cb8f3e7dbd829eef637110e4ad9a775f3fcf001c92591
3
  size 1064
last-checkpoint/special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "bos_token": {
3
- "content": "<|endoftext|>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
@@ -21,7 +21,7 @@
21
  "single_word": false
22
  },
23
  "unk_token": {
24
- "content": "<|endoftext|>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
 
1
  {
2
  "bos_token": {
3
+ "content": "<s>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
 
21
  "single_word": false
22
  },
23
  "unk_token": {
24
+ "content": "<unk>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
last-checkpoint/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c95aac127870725c031582253de8b2d45c9e3e009d893f89bec47a9b26f974aa
3
- size 3564484
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b013fe7282b7984bc14b1c64c2a70dd06b652a969810fbba6217f4ac70339f44
3
+ size 3621089
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "add_bos_token": false,
3
  "add_eos_token": false,
4
- "add_prefix_space": false,
5
  "added_tokens_decoder": {
6
  "0": {
7
- "content": "<|endoftext|>",
8
  "lstrip": false,
9
  "normalized": false,
10
  "rstrip": false,
@@ -12,214 +12,122 @@
12
  "special": true
13
  },
14
  "1": {
15
- "content": "<|padding|>",
16
  "lstrip": false,
17
  "normalized": false,
18
  "rstrip": false,
19
  "single_word": false,
20
  "special": true
21
  },
22
- "50254": {
23
- "content": " ",
24
- "lstrip": false,
25
- "normalized": true,
26
- "rstrip": false,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "50255": {
31
- "content": " ",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": false
37
- },
38
- "50256": {
39
- "content": " ",
40
- "lstrip": false,
41
- "normalized": true,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": false
45
- },
46
- "50257": {
47
- "content": " ",
48
- "lstrip": false,
49
- "normalized": true,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": false
53
- },
54
- "50258": {
55
- "content": " ",
56
- "lstrip": false,
57
- "normalized": true,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": false
61
- },
62
- "50259": {
63
- "content": " ",
64
- "lstrip": false,
65
- "normalized": true,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": false
69
- },
70
- "50260": {
71
- "content": " ",
72
- "lstrip": false,
73
- "normalized": true,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": false
77
- },
78
- "50261": {
79
- "content": " ",
80
- "lstrip": false,
81
- "normalized": true,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": false
85
- },
86
- "50262": {
87
- "content": " ",
88
- "lstrip": false,
89
- "normalized": true,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": false
93
- },
94
- "50263": {
95
- "content": " ",
96
- "lstrip": false,
97
- "normalized": true,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": false
101
- },
102
- "50264": {
103
- "content": " ",
104
- "lstrip": false,
105
- "normalized": true,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": false
109
- },
110
- "50265": {
111
- "content": " ",
112
  "lstrip": false,
113
- "normalized": true,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": false
117
- },
118
- "50266": {
119
- "content": " ",
120
- "lstrip": false,
121
- "normalized": true,
122
- "rstrip": false,
123
  "single_word": false,
124
  "special": false
125
  },
126
- "50267": {
127
- "content": " ",
128
  "lstrip": false,
129
- "normalized": true,
130
  "rstrip": false,
131
  "single_word": false,
132
- "special": false
133
  },
134
- "50268": {
135
- "content": " ",
136
  "lstrip": false,
137
- "normalized": true,
138
- "rstrip": false,
139
  "single_word": false,
140
- "special": false
141
  },
142
- "50269": {
143
- "content": " ",
144
  "lstrip": false,
145
- "normalized": true,
146
- "rstrip": false,
147
  "single_word": false,
148
- "special": false
149
  },
150
- "50270": {
151
- "content": " ",
152
  "lstrip": false,
153
- "normalized": true,
154
- "rstrip": false,
155
  "single_word": false,
156
- "special": false
157
  },
158
- "50271": {
159
- "content": " ",
160
  "lstrip": false,
161
- "normalized": true,
162
- "rstrip": false,
163
  "single_word": false,
164
- "special": false
165
  },
166
- "50272": {
167
- "content": " ",
168
  "lstrip": false,
169
- "normalized": true,
170
- "rstrip": false,
171
  "single_word": false,
172
- "special": false
173
  },
174
- "50273": {
175
- "content": " ",
176
  "lstrip": false,
177
- "normalized": true,
178
- "rstrip": false,
179
  "single_word": false,
180
- "special": false
181
  },
182
- "50274": {
183
- "content": " ",
184
  "lstrip": false,
185
- "normalized": true,
186
- "rstrip": false,
187
  "single_word": false,
188
- "special": false
189
  },
190
- "50275": {
191
- "content": " ",
192
  "lstrip": false,
193
- "normalized": true,
194
- "rstrip": false,
195
  "single_word": false,
196
- "special": false
197
  },
198
- "50276": {
199
- "content": " ",
200
  "lstrip": false,
201
- "normalized": true,
202
- "rstrip": false,
203
  "single_word": false,
204
- "special": false
205
  },
206
- "50277": {
207
- "content": "[PAD]",
208
  "lstrip": false,
209
  "normalized": false,
210
- "rstrip": false,
211
  "single_word": false,
212
  "special": true
213
  }
214
  },
215
- "bos_token": "<|endoftext|>",
216
- "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
217
  "clean_up_tokenization_spaces": false,
218
  "eos_token": "<|endoftext|>",
219
  "extra_special_tokens": {},
220
- "model_max_length": 1000000000000000019884624838656,
 
221
  "pad_token": "<|endoftext|>",
222
- "tokenizer_class": "GPTNeoXTokenizer",
223
- "unk_token": "<|endoftext|>",
 
 
 
224
  "use_fast": true
225
  }
 
1
  {
2
+ "add_bos_token": true,
3
  "add_eos_token": false,
4
+ "add_prefix_space": null,
5
  "added_tokens_decoder": {
6
  "0": {
7
+ "content": "<unk>",
8
  "lstrip": false,
9
  "normalized": false,
10
  "rstrip": false,
 
12
  "special": true
13
  },
14
  "1": {
15
+ "content": "<s>",
16
  "lstrip": false,
17
  "normalized": false,
18
  "rstrip": false,
19
  "single_word": false,
20
  "special": true
21
  },
22
+ "2": {
23
+ "content": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": true,
 
 
 
 
 
 
 
 
27
  "single_word": false,
28
  "special": false
29
  },
30
+ "32000": {
31
+ "content": "<|endoftext|>",
32
  "lstrip": false,
33
+ "normalized": false,
34
  "rstrip": false,
35
  "single_word": false,
36
+ "special": true
37
  },
38
+ "32001": {
39
+ "content": "<|assistant|>",
40
  "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": true,
43
  "single_word": false,
44
+ "special": true
45
  },
46
+ "32002": {
47
+ "content": "<|placeholder1|>",
48
  "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": true,
51
  "single_word": false,
52
+ "special": true
53
  },
54
+ "32003": {
55
+ "content": "<|placeholder2|>",
56
  "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": true,
59
  "single_word": false,
60
+ "special": true
61
  },
62
+ "32004": {
63
+ "content": "<|placeholder3|>",
64
  "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": true,
67
  "single_word": false,
68
+ "special": true
69
  },
70
+ "32005": {
71
+ "content": "<|placeholder4|>",
72
  "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": true,
75
  "single_word": false,
76
+ "special": true
77
  },
78
+ "32006": {
79
+ "content": "<|system|>",
80
  "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": true,
83
  "single_word": false,
84
+ "special": true
85
  },
86
+ "32007": {
87
+ "content": "<|end|>",
88
  "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": true,
91
  "single_word": false,
92
+ "special": true
93
  },
94
+ "32008": {
95
+ "content": "<|placeholder5|>",
96
  "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": true,
99
  "single_word": false,
100
+ "special": true
101
  },
102
+ "32009": {
103
+ "content": "<|placeholder6|>",
104
  "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": true,
107
  "single_word": false,
108
+ "special": true
109
  },
110
+ "32010": {
111
+ "content": "<|user|>",
112
  "lstrip": false,
113
  "normalized": false,
114
+ "rstrip": true,
115
  "single_word": false,
116
  "special": true
117
  }
118
  },
119
+ "bos_token": "<s>",
120
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
  "extra_special_tokens": {},
124
+ "legacy": true,
125
+ "model_max_length": 4096,
126
  "pad_token": "<|endoftext|>",
127
+ "padding_side": "left",
128
+ "sp_model_kwargs": {},
129
+ "tokenizer_class": "LlamaTokenizer",
130
+ "unk_token": "<unk>",
131
+ "use_default_system_prompt": false,
132
  "use_fast": true
133
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,1796 +1,316 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7102693104468778,
5
  "eval_steps": 200,
6
- "global_step": 2400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0002959455460195324,
13
- "eval_loss": 52.514652252197266,
14
- "eval_runtime": 17.048,
15
- "eval_samples_per_second": 88.104,
16
- "eval_steps_per_second": 22.055,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.002959455460195324,
21
- "grad_norm": 2544.0,
22
  "learning_rate": 1.6000000000000003e-05,
23
- "loss": 12.0492,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.005918910920390648,
28
- "grad_norm": 3280.0,
29
  "learning_rate": 3.2000000000000005e-05,
30
- "loss": 20.6648,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.008878366380585973,
35
- "grad_norm": 3936.0,
36
  "learning_rate": 4.8e-05,
37
- "loss": 32.7406,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.011837821840781295,
42
- "grad_norm": 6528.0,
43
  "learning_rate": 6.400000000000001e-05,
44
- "loss": 50.664,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 0.01479727730097662,
49
- "grad_norm": 692224.0,
50
  "learning_rate": 8e-05,
51
- "loss": 97.8355,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.017756732761171946,
56
- "grad_norm": 333824.0,
57
  "learning_rate": 9.6e-05,
58
- "loss": 16.9901,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.020716188221367268,
63
- "grad_norm": 103936.0,
64
  "learning_rate": 0.00011200000000000001,
65
- "loss": 15.0012,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.02367564368156259,
70
- "grad_norm": 222208.0,
71
  "learning_rate": 0.00012800000000000002,
72
- "loss": 16.1127,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.026635099141757917,
77
- "grad_norm": 103936.0,
78
  "learning_rate": 0.000144,
79
- "loss": 18.2763,
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.02959455460195324,
84
- "grad_norm": 80896.0,
85
  "learning_rate": 0.00016,
86
- "loss": 24.4986,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.032554010062148565,
91
- "grad_norm": 56064.0,
92
  "learning_rate": 0.00017600000000000002,
93
- "loss": 13.195,
94
  "step": 110
95
  },
96
  {
97
  "epoch": 0.03551346552234389,
98
- "grad_norm": 1064.0,
99
  "learning_rate": 0.000192,
100
- "loss": 12.0022,
101
  "step": 120
102
  },
103
  {
104
  "epoch": 0.03847292098253921,
105
- "grad_norm": 1928.0,
106
  "learning_rate": 0.0001999978128380225,
107
- "loss": 11.3102,
108
  "step": 130
109
  },
110
  {
111
  "epoch": 0.041432376442734536,
112
- "grad_norm": 1968.0,
113
  "learning_rate": 0.0001999803161162393,
114
- "loss": 14.3508,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.04439183190292986,
119
- "grad_norm": 2976.0,
120
  "learning_rate": 0.00019994532573409262,
121
- "loss": 17.0822,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 0.04735128736312518,
126
- "grad_norm": 102.5,
127
  "learning_rate": 0.00019989284781388617,
128
- "loss": 12.2323,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 0.05031074282332051,
133
- "grad_norm": 215.0,
134
  "learning_rate": 0.00019982289153773646,
135
- "loss": 10.195,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 0.053270198283515834,
140
- "grad_norm": 142.0,
141
  "learning_rate": 0.00019973546914596623,
142
- "loss": 9.5922,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 0.05622965374371116,
147
- "grad_norm": 233.0,
148
  "learning_rate": 0.00019963059593496268,
149
- "loss": 9.5089,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 0.05918910920390648,
154
- "grad_norm": 636.0,
155
  "learning_rate": 0.00019950829025450114,
156
- "loss": 12.0448,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.05918910920390648,
161
- "eval_loss": 9.778441429138184,
162
- "eval_runtime": 11.7956,
163
- "eval_samples_per_second": 127.335,
164
- "eval_steps_per_second": 31.876,
165
  "step": 200
166
  },
167
  {
168
  "epoch": 0.062148564664101805,
169
- "grad_norm": 76.0,
170
  "learning_rate": 0.0001993685735045343,
171
- "loss": 11.7992,
172
  "step": 210
173
  },
174
  {
175
  "epoch": 0.06510802012429713,
176
- "grad_norm": 99.5,
177
  "learning_rate": 0.0001992114701314478,
178
- "loss": 8.7775,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.06806747558449246,
183
- "grad_norm": 106.0,
184
  "learning_rate": 0.000199037007623783,
185
- "loss": 8.9986,
186
  "step": 230
187
  },
188
  {
189
  "epoch": 0.07102693104468778,
190
- "grad_norm": 764.0,
191
  "learning_rate": 0.00019884521650742715,
192
- "loss": 9.6634,
193
  "step": 240
194
  },
195
  {
196
  "epoch": 0.0739863865048831,
197
- "grad_norm": 414.0,
198
  "learning_rate": 0.00019863613034027224,
199
- "loss": 9.5065,
200
  "step": 250
201
  },
202
  {
203
  "epoch": 0.07694584196507842,
204
- "grad_norm": 90.5,
205
  "learning_rate": 0.0001984097857063434,
206
- "loss": 8.6676,
207
  "step": 260
208
  },
209
  {
210
  "epoch": 0.07990529742527375,
211
- "grad_norm": 76.0,
212
  "learning_rate": 0.0001981662222093976,
213
- "loss": 8.8232,
214
  "step": 270
215
  },
216
  {
217
  "epoch": 0.08286475288546907,
218
- "grad_norm": 51.25,
219
  "learning_rate": 0.00019790548246599447,
220
- "loss": 8.543,
221
  "step": 280
222
  },
223
  {
224
  "epoch": 0.0858242083456644,
225
- "grad_norm": 2240.0,
226
  "learning_rate": 0.00019762761209803927,
227
- "loss": 8.9139,
228
  "step": 290
229
  },
230
  {
231
  "epoch": 0.08878366380585972,
232
- "grad_norm": 247.0,
233
  "learning_rate": 0.0001973326597248006,
234
- "loss": 8.6433,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.09174311926605505,
239
- "grad_norm": 44.0,
240
  "learning_rate": 0.00019702067695440332,
241
- "loss": 9.0974,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 0.09470257472625036,
246
- "grad_norm": 135.0,
247
  "learning_rate": 0.00019669171837479873,
248
- "loss": 8.6146,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 0.09766203018644569,
253
- "grad_norm": 80.0,
254
  "learning_rate": 0.00019634584154421317,
255
- "loss": 8.1196,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 0.10062148564664102,
260
- "grad_norm": 372.0,
261
  "learning_rate": 0.00019598310698107702,
262
- "loss": 8.1078,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 0.10358094110683634,
267
- "grad_norm": 496.0,
268
  "learning_rate": 0.00019560357815343577,
269
- "loss": 8.2415,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 0.10654039656703167,
274
- "grad_norm": 47.75,
275
  "learning_rate": 0.00019520732146784491,
276
- "loss": 9.0675,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 0.109499852027227,
281
- "grad_norm": 42.0,
282
  "learning_rate": 0.0001947944062577507,
283
- "loss": 8.3632,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 0.11245930748742232,
288
- "grad_norm": 158.0,
289
  "learning_rate": 0.00019436490477135878,
290
- "loss": 8.1967,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 0.11541876294761765,
295
- "grad_norm": 65.5,
296
  "learning_rate": 0.00019391889215899299,
297
- "loss": 8.29,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 0.11837821840781296,
302
- "grad_norm": 278.0,
303
  "learning_rate": 0.0001934564464599461,
304
- "loss": 7.8939,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 0.11837821840781296,
309
- "eval_loss": 8.981460571289062,
310
- "eval_runtime": 9.3003,
311
- "eval_samples_per_second": 161.501,
312
- "eval_steps_per_second": 40.429,
313
  "step": 400
314
- },
315
- {
316
- "epoch": 0.12133767386800828,
317
- "grad_norm": 61.0,
318
- "learning_rate": 0.00019297764858882514,
319
- "loss": 8.6452,
320
- "step": 410
321
- },
322
- {
323
- "epoch": 0.12429712932820361,
324
- "grad_norm": 50.0,
325
- "learning_rate": 0.00019248258232139388,
326
- "loss": 8.4218,
327
- "step": 420
328
- },
329
- {
330
- "epoch": 0.12725658478839894,
331
- "grad_norm": 58.0,
332
- "learning_rate": 0.00019197133427991436,
333
- "loss": 8.4002,
334
- "step": 430
335
- },
336
- {
337
- "epoch": 0.13021604024859426,
338
- "grad_norm": 588.0,
339
- "learning_rate": 0.00019144399391799043,
340
- "loss": 8.0756,
341
- "step": 440
342
- },
343
- {
344
- "epoch": 0.1331754957087896,
345
- "grad_norm": 358.0,
346
- "learning_rate": 0.00019090065350491626,
347
- "loss": 8.0585,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.1361349511689849,
352
- "grad_norm": 63.5,
353
- "learning_rate": 0.0001903414081095315,
354
- "loss": 8.504,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.13909440662918024,
359
- "grad_norm": 83.5,
360
- "learning_rate": 0.00018976635558358722,
361
- "loss": 8.4537,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.14205386208937557,
366
- "grad_norm": 154.0,
367
- "learning_rate": 0.00018917559654462474,
368
- "loss": 8.2744,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.1450133175495709,
373
- "grad_norm": 139.0,
374
- "learning_rate": 0.00018856923435837022,
375
- "loss": 7.9953,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.1479727730097662,
380
- "grad_norm": 300.0,
381
- "learning_rate": 0.0001879473751206489,
382
- "loss": 8.2303,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.15093222846996152,
387
- "grad_norm": 157.0,
388
- "learning_rate": 0.00018731012763882133,
389
- "loss": 8.2334,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.15389168393015684,
394
- "grad_norm": 145.0,
395
- "learning_rate": 0.00018665760341274505,
396
- "loss": 8.0997,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.15685113939035217,
401
- "grad_norm": 43.5,
402
- "learning_rate": 0.00018598991661526572,
403
- "loss": 8.0979,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.1598105948505475,
408
- "grad_norm": 53.5,
409
- "learning_rate": 0.00018530718407223974,
410
- "loss": 8.3276,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.16277005031074282,
415
- "grad_norm": 145.0,
416
- "learning_rate": 0.00018460952524209355,
417
- "loss": 7.9545,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.16572950577093815,
422
- "grad_norm": 49.75,
423
- "learning_rate": 0.00018389706219492147,
424
- "loss": 8.3634,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.16868896123113347,
429
- "grad_norm": 90.5,
430
- "learning_rate": 0.00018316991959112716,
431
- "loss": 7.8803,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.1716484166913288,
436
- "grad_norm": 61.75,
437
- "learning_rate": 0.00018242822465961176,
438
- "loss": 8.0672,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.17460787215152412,
443
- "grad_norm": 113.0,
444
- "learning_rate": 0.00018167210717551224,
445
- "loss": 7.9656,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.17756732761171945,
450
- "grad_norm": 194.0,
451
- "learning_rate": 0.00018090169943749476,
452
- "loss": 7.7851,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.17756732761171945,
457
- "eval_loss": 9.452142715454102,
458
- "eval_runtime": 9.0209,
459
- "eval_samples_per_second": 166.503,
460
- "eval_steps_per_second": 41.681,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.18052678307191478,
465
- "grad_norm": 336.0,
466
- "learning_rate": 0.00018011713624460608,
467
- "loss": 9.1377,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.1834862385321101,
472
- "grad_norm": 133.0,
473
- "learning_rate": 0.00017931855487268782,
474
- "loss": 7.9937,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.18644569399230543,
479
- "grad_norm": 456.0,
480
- "learning_rate": 0.0001785060950503568,
481
- "loss": 8.1474,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.18940514945250073,
486
- "grad_norm": 92.0,
487
- "learning_rate": 0.00017767989893455698,
488
- "loss": 8.0396,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.19236460491269605,
493
- "grad_norm": 1096.0,
494
- "learning_rate": 0.00017684011108568592,
495
- "loss": 8.8535,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.19532406037289138,
500
- "grad_norm": 41.5,
501
- "learning_rate": 0.00017598687844230088,
502
- "loss": 8.2106,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.1982835158330867,
507
- "grad_norm": 284.0,
508
- "learning_rate": 0.00017512035029540885,
509
- "loss": 8.3013,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.20124297129328203,
514
- "grad_norm": 39.0,
515
- "learning_rate": 0.000174240678262345,
516
- "loss": 8.1738,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.20420242675347736,
521
- "grad_norm": 107.5,
522
- "learning_rate": 0.000173348016260244,
523
- "loss": 8.4127,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.20716188221367268,
528
- "grad_norm": 288.0,
529
- "learning_rate": 0.00017244252047910892,
530
- "loss": 8.4413,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.210121337673868,
535
- "grad_norm": 55.5,
536
- "learning_rate": 0.00017152434935448256,
537
- "loss": 8.0362,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.21308079313406333,
542
- "grad_norm": 78.5,
543
- "learning_rate": 0.0001705936635397259,
544
- "loss": 7.9471,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.21604024859425866,
549
- "grad_norm": 62.75,
550
- "learning_rate": 0.00016965062587790823,
551
- "loss": 8.1659,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.218999704054454,
556
- "grad_norm": 56.5,
557
- "learning_rate": 0.00016869540137331445,
558
- "loss": 7.9207,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.2219591595146493,
563
- "grad_norm": 362.0,
564
- "learning_rate": 0.00016772815716257412,
565
- "loss": 8.1589,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.22491861497484464,
570
- "grad_norm": 22.875,
571
- "learning_rate": 0.00016674906248541726,
572
- "loss": 8.1283,
573
- "step": 760
574
- },
575
- {
576
- "epoch": 0.22787807043503996,
577
- "grad_norm": 482.0,
578
- "learning_rate": 0.00016575828865506245,
579
- "loss": 7.7056,
580
- "step": 770
581
- },
582
- {
583
- "epoch": 0.2308375258952353,
584
- "grad_norm": 72.5,
585
- "learning_rate": 0.0001647560090282419,
586
- "loss": 8.1033,
587
- "step": 780
588
- },
589
- {
590
- "epoch": 0.2337969813554306,
591
- "grad_norm": 116.5,
592
- "learning_rate": 0.000163742398974869,
593
- "loss": 7.6061,
594
- "step": 790
595
- },
596
- {
597
- "epoch": 0.23675643681562591,
598
- "grad_norm": 1448.0,
599
- "learning_rate": 0.0001627176358473537,
600
- "loss": 8.1142,
601
- "step": 800
602
- },
603
- {
604
- "epoch": 0.23675643681562591,
605
- "eval_loss": 9.41397762298584,
606
- "eval_runtime": 8.9895,
607
- "eval_samples_per_second": 167.083,
608
- "eval_steps_per_second": 41.826,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 0.23971589227582124,
613
- "grad_norm": 35.0,
614
- "learning_rate": 0.0001616818989495711,
615
- "loss": 8.3766,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 0.24267534773601657,
620
- "grad_norm": 27.375,
621
- "learning_rate": 0.00016063536950548826,
622
- "loss": 8.2034,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 0.2456348031962119,
627
- "grad_norm": 40.0,
628
- "learning_rate": 0.0001595782306274553,
629
- "loss": 8.05,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 0.24859425865640722,
634
- "grad_norm": 25.25,
635
- "learning_rate": 0.00015851066728416618,
636
- "loss": 7.7749,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 0.25155371411660254,
641
- "grad_norm": 1848.0,
642
- "learning_rate": 0.00015743286626829437,
643
- "loss": 7.915,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 0.25451316957679787,
648
- "grad_norm": 34.0,
649
- "learning_rate": 0.00015634501616380967,
650
- "loss": 7.951,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 0.2574726250369932,
655
- "grad_norm": 53.25,
656
- "learning_rate": 0.00015524730731298134,
657
- "loss": 7.691,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 0.2604320804971885,
662
- "grad_norm": 148.0,
663
- "learning_rate": 0.0001541399317830738,
664
- "loss": 7.915,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 0.26339153595738385,
669
- "grad_norm": 78.0,
670
- "learning_rate": 0.0001530230833327405,
671
- "loss": 7.844,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 0.2663509914175792,
676
- "grad_norm": 86.0,
677
- "learning_rate": 0.00015189695737812152,
678
- "loss": 7.5695,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 0.2693104468777745,
683
- "grad_norm": 65.0,
684
- "learning_rate": 0.0001507617509586517,
685
- "loss": 7.9703,
686
- "step": 910
687
- },
688
- {
689
- "epoch": 0.2722699023379698,
690
- "grad_norm": 23.375,
691
- "learning_rate": 0.00014961766270258422,
692
- "loss": 7.4996,
693
- "step": 920
694
- },
695
- {
696
- "epoch": 0.27522935779816515,
697
- "grad_norm": 40.5,
698
- "learning_rate": 0.00014846489279223652,
699
- "loss": 7.7973,
700
- "step": 930
701
- },
702
- {
703
- "epoch": 0.2781888132583605,
704
- "grad_norm": 512.0,
705
- "learning_rate": 0.0001473036429289641,
706
- "loss": 7.9165,
707
- "step": 940
708
- },
709
- {
710
- "epoch": 0.2811482687185558,
711
- "grad_norm": 4320.0,
712
- "learning_rate": 0.0001461341162978688,
713
- "loss": 7.8394,
714
- "step": 950
715
- },
716
- {
717
- "epoch": 0.28410772417875113,
718
- "grad_norm": 146.0,
719
- "learning_rate": 0.00014495651753224705,
720
- "loss": 8.26,
721
- "step": 960
722
- },
723
- {
724
- "epoch": 0.28706717963894646,
725
- "grad_norm": 132.0,
726
- "learning_rate": 0.00014377105267778518,
727
- "loss": 8.2372,
728
- "step": 970
729
- },
730
- {
731
- "epoch": 0.2900266350991418,
732
- "grad_norm": 117.5,
733
- "learning_rate": 0.00014257792915650728,
734
- "loss": 8.0396,
735
- "step": 980
736
- },
737
- {
738
- "epoch": 0.2929860905593371,
739
- "grad_norm": 424.0,
740
- "learning_rate": 0.00014137735573048233,
741
- "loss": 8.1126,
742
- "step": 990
743
- },
744
- {
745
- "epoch": 0.2959455460195324,
746
- "grad_norm": 2224.0,
747
- "learning_rate": 0.00014016954246529696,
748
- "loss": 8.3142,
749
- "step": 1000
750
- },
751
- {
752
- "epoch": 0.2959455460195324,
753
- "eval_loss": 8.17757797241211,
754
- "eval_runtime": 9.0278,
755
- "eval_samples_per_second": 166.375,
756
- "eval_steps_per_second": 41.649,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 0.2989050014797277,
761
- "grad_norm": 100.5,
762
- "learning_rate": 0.00013895470069330004,
763
- "loss": 7.9549,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 0.30186445693992303,
768
- "grad_norm": 79.0,
769
- "learning_rate": 0.00013773304297662559,
770
- "loss": 7.8468,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 0.30482391240011836,
775
- "grad_norm": 308.0,
776
- "learning_rate": 0.00013650478307000057,
777
- "loss": 7.7471,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 0.3077833678603137,
782
- "grad_norm": 165.0,
783
- "learning_rate": 0.00013527013588334415,
784
- "loss": 7.8435,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 0.310742823320509,
789
- "grad_norm": 229.0,
790
- "learning_rate": 0.00013402931744416433,
791
- "loss": 7.9423,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 0.31370227878070434,
796
- "grad_norm": 584.0,
797
- "learning_rate": 0.00013278254485975976,
798
- "loss": 9.4635,
799
- "step": 1060
800
- },
801
- {
802
- "epoch": 0.31666173424089966,
803
- "grad_norm": 1128.0,
804
- "learning_rate": 0.00013153003627923218,
805
- "loss": 8.107,
806
- "step": 1070
807
- },
808
- {
809
- "epoch": 0.319621189701095,
810
- "grad_norm": 123.5,
811
- "learning_rate": 0.00013027201085531634,
812
- "loss": 8.1185,
813
- "step": 1080
814
- },
815
- {
816
- "epoch": 0.3225806451612903,
817
- "grad_norm": 95.5,
818
- "learning_rate": 0.00012900868870603503,
819
- "loss": 7.9646,
820
- "step": 1090
821
- },
822
- {
823
- "epoch": 0.32554010062148564,
824
- "grad_norm": 350.0,
825
- "learning_rate": 0.00012774029087618446,
826
- "loss": 8.2899,
827
- "step": 1100
828
- },
829
- {
830
- "epoch": 0.32849955608168097,
831
- "grad_norm": 70.0,
832
- "learning_rate": 0.00012646703929865817,
833
- "loss": 7.8757,
834
- "step": 1110
835
- },
836
- {
837
- "epoch": 0.3314590115418763,
838
- "grad_norm": 117.0,
839
- "learning_rate": 0.00012518915675561483,
840
- "loss": 7.6223,
841
- "step": 1120
842
- },
843
- {
844
- "epoch": 0.3344184670020716,
845
- "grad_norm": 294.0,
846
- "learning_rate": 0.00012390686683949798,
847
- "loss": 7.7403,
848
- "step": 1130
849
- },
850
- {
851
- "epoch": 0.33737792246226694,
852
- "grad_norm": 205.0,
853
- "learning_rate": 0.00012262039391391404,
854
- "loss": 7.7307,
855
- "step": 1140
856
- },
857
- {
858
- "epoch": 0.34033737792246227,
859
- "grad_norm": 588.0,
860
- "learning_rate": 0.0001213299630743747,
861
- "loss": 7.8997,
862
- "step": 1150
863
- },
864
- {
865
- "epoch": 0.3432968333826576,
866
- "grad_norm": 29.75,
867
- "learning_rate": 0.00012003580010891213,
868
- "loss": 7.8493,
869
- "step": 1160
870
- },
871
- {
872
- "epoch": 0.3462562888428529,
873
- "grad_norm": 50.75,
874
- "learning_rate": 0.00011873813145857249,
875
- "loss": 7.5136,
876
- "step": 1170
877
- },
878
- {
879
- "epoch": 0.34921574430304825,
880
- "grad_norm": 59.25,
881
- "learning_rate": 0.00011743718417779517,
882
- "loss": 8.0157,
883
- "step": 1180
884
- },
885
- {
886
- "epoch": 0.3521751997632436,
887
- "grad_norm": 108.5,
888
- "learning_rate": 0.00011613318589468511,
889
- "loss": 7.5891,
890
- "step": 1190
891
- },
892
- {
893
- "epoch": 0.3551346552234389,
894
- "grad_norm": 280.0,
895
- "learning_rate": 0.0001148263647711842,
896
- "loss": 7.7884,
897
- "step": 1200
898
- },
899
- {
900
- "epoch": 0.3551346552234389,
901
- "eval_loss": 8.000994682312012,
902
- "eval_runtime": 9.0516,
903
- "eval_samples_per_second": 165.937,
904
- "eval_steps_per_second": 41.539,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 0.3580941106836342,
909
- "grad_norm": 30.125,
910
- "learning_rate": 0.0001135169494631497,
911
- "loss": 8.0308,
912
- "step": 1210
913
- },
914
- {
915
- "epoch": 0.36105356614382955,
916
- "grad_norm": 37.75,
917
- "learning_rate": 0.00011220516908034601,
918
- "loss": 7.7083,
919
- "step": 1220
920
- },
921
- {
922
- "epoch": 0.3640130216040249,
923
- "grad_norm": 48.75,
924
- "learning_rate": 0.00011089125314635726,
925
- "loss": 7.7182,
926
- "step": 1230
927
- },
928
- {
929
- "epoch": 0.3669724770642202,
930
- "grad_norm": 39.5,
931
- "learning_rate": 0.00010957543155842702,
932
- "loss": 7.6788,
933
- "step": 1240
934
- },
935
- {
936
- "epoch": 0.36993193252441553,
937
- "grad_norm": 46.75,
938
- "learning_rate": 0.00010825793454723325,
939
- "loss": 7.9517,
940
- "step": 1250
941
- },
942
- {
943
- "epoch": 0.37289138798461086,
944
- "grad_norm": 33.5,
945
- "learning_rate": 0.00010693899263660441,
946
- "loss": 7.821,
947
- "step": 1260
948
- },
949
- {
950
- "epoch": 0.3758508434448062,
951
- "grad_norm": 51.25,
952
- "learning_rate": 0.00010561883660318455,
953
- "loss": 7.7532,
954
- "step": 1270
955
- },
956
- {
957
- "epoch": 0.37881029890500145,
958
- "grad_norm": 82.0,
959
- "learning_rate": 0.00010429769743605407,
960
- "loss": 7.6321,
961
- "step": 1280
962
- },
963
- {
964
- "epoch": 0.3817697543651968,
965
- "grad_norm": 38.5,
966
- "learning_rate": 0.00010297580629631325,
967
- "loss": 7.5741,
968
- "step": 1290
969
- },
970
- {
971
- "epoch": 0.3847292098253921,
972
- "grad_norm": 292.0,
973
- "learning_rate": 0.00010165339447663587,
974
- "loss": 7.2796,
975
- "step": 1300
976
- },
977
- {
978
- "epoch": 0.38768866528558743,
979
- "grad_norm": 76.5,
980
- "learning_rate": 0.00010033069336079952,
981
- "loss": 7.926,
982
- "step": 1310
983
- },
984
- {
985
- "epoch": 0.39064812074578276,
986
- "grad_norm": 64.0,
987
- "learning_rate": 9.900793438320037e-05,
988
- "loss": 7.5464,
989
- "step": 1320
990
- },
991
- {
992
- "epoch": 0.3936075762059781,
993
- "grad_norm": 120.5,
994
- "learning_rate": 9.768534898835862e-05,
995
- "loss": 7.5249,
996
- "step": 1330
997
- },
998
- {
999
- "epoch": 0.3965670316661734,
1000
- "grad_norm": 108.5,
1001
- "learning_rate": 9.636316859042259e-05,
1002
- "loss": 7.4919,
1003
- "step": 1340
1004
- },
1005
- {
1006
- "epoch": 0.39952648712636873,
1007
- "grad_norm": 972.0,
1008
- "learning_rate": 9.504162453267777e-05,
1009
- "loss": 7.332,
1010
- "step": 1350
1011
- },
1012
- {
1013
- "epoch": 0.40248594258656406,
1014
- "grad_norm": 328.0,
1015
- "learning_rate": 9.372094804706867e-05,
1016
- "loss": 7.6553,
1017
- "step": 1360
1018
- },
1019
- {
1020
- "epoch": 0.4054453980467594,
1021
- "grad_norm": 154.0,
1022
- "learning_rate": 9.24013702137397e-05,
1023
- "loss": 7.5773,
1024
- "step": 1370
1025
- },
1026
- {
1027
- "epoch": 0.4084048535069547,
1028
- "grad_norm": 43.75,
1029
- "learning_rate": 9.108312192060298e-05,
1030
- "loss": 7.5746,
1031
- "step": 1380
1032
- },
1033
- {
1034
- "epoch": 0.41136430896715004,
1035
- "grad_norm": 310.0,
1036
- "learning_rate": 8.97664338229395e-05,
1037
- "loss": 7.6845,
1038
- "step": 1390
1039
- },
1040
- {
1041
- "epoch": 0.41432376442734536,
1042
- "grad_norm": 336.0,
1043
- "learning_rate": 8.845153630304139e-05,
1044
- "loss": 7.449,
1045
- "step": 1400
1046
- },
1047
- {
1048
- "epoch": 0.41432376442734536,
1049
- "eval_loss": 7.618994235992432,
1050
- "eval_runtime": 9.0496,
1051
- "eval_samples_per_second": 165.975,
1052
- "eval_steps_per_second": 41.549,
1053
- "step": 1400
1054
- },
1055
- {
1056
- "epoch": 0.4172832198875407,
1057
- "grad_norm": 39.5,
1058
- "learning_rate": 8.713865942990141e-05,
1059
- "loss": 7.6821,
1060
- "step": 1410
1061
- },
1062
- {
1063
- "epoch": 0.420242675347736,
1064
- "grad_norm": 76.5,
1065
- "learning_rate": 8.582803291895758e-05,
1066
- "loss": 7.5631,
1067
- "step": 1420
1068
- },
1069
- {
1070
- "epoch": 0.42320213080793134,
1071
- "grad_norm": 167.0,
1072
- "learning_rate": 8.451988609189987e-05,
1073
- "loss": 7.7359,
1074
- "step": 1430
1075
- },
1076
- {
1077
- "epoch": 0.42616158626812667,
1078
- "grad_norm": 268.0,
1079
- "learning_rate": 8.321444783654524e-05,
1080
- "loss": 7.4941,
1081
- "step": 1440
1082
- },
1083
- {
1084
- "epoch": 0.429121041728322,
1085
- "grad_norm": 103.5,
1086
- "learning_rate": 8.191194656678904e-05,
1087
- "loss": 7.4398,
1088
- "step": 1450
1089
- },
1090
- {
1091
- "epoch": 0.4320804971885173,
1092
- "grad_norm": 66.0,
1093
- "learning_rate": 8.061261018263919e-05,
1094
- "loss": 7.5629,
1095
- "step": 1460
1096
- },
1097
- {
1098
- "epoch": 0.43503995264871265,
1099
- "grad_norm": 24.0,
1100
- "learning_rate": 7.931666603034033e-05,
1101
- "loss": 7.5025,
1102
- "step": 1470
1103
- },
1104
- {
1105
- "epoch": 0.437999408108908,
1106
- "grad_norm": 142.0,
1107
- "learning_rate": 7.80243408625947e-05,
1108
- "loss": 7.4416,
1109
- "step": 1480
1110
- },
1111
- {
1112
- "epoch": 0.4409588635691033,
1113
- "grad_norm": 1488.0,
1114
- "learning_rate": 7.673586079888698e-05,
1115
- "loss": 7.4872,
1116
- "step": 1490
1117
- },
1118
- {
1119
- "epoch": 0.4439183190292986,
1120
- "grad_norm": 892.0,
1121
- "learning_rate": 7.54514512859201e-05,
1122
- "loss": 7.4681,
1123
- "step": 1500
1124
- },
1125
- {
1126
- "epoch": 0.44687777448949395,
1127
- "grad_norm": 92.5,
1128
- "learning_rate": 7.417133705816837e-05,
1129
- "loss": 7.4869,
1130
- "step": 1510
1131
- },
1132
- {
1133
- "epoch": 0.4498372299496893,
1134
- "grad_norm": 160.0,
1135
- "learning_rate": 7.289574209855559e-05,
1136
- "loss": 7.7108,
1137
- "step": 1520
1138
- },
1139
- {
1140
- "epoch": 0.4527966854098846,
1141
- "grad_norm": 211.0,
1142
- "learning_rate": 7.16248895992645e-05,
1143
- "loss": 7.5492,
1144
- "step": 1530
1145
- },
1146
- {
1147
- "epoch": 0.45575614087007993,
1148
- "grad_norm": 135.0,
1149
- "learning_rate": 7.035900192268464e-05,
1150
- "loss": 7.409,
1151
- "step": 1540
1152
- },
1153
- {
1154
- "epoch": 0.45871559633027525,
1155
- "grad_norm": 124.5,
1156
- "learning_rate": 6.909830056250527e-05,
1157
- "loss": 7.3087,
1158
- "step": 1550
1159
- },
1160
- {
1161
- "epoch": 0.4616750517904706,
1162
- "grad_norm": 133.0,
1163
- "learning_rate": 6.784300610496048e-05,
1164
- "loss": 7.9377,
1165
- "step": 1560
1166
- },
1167
- {
1168
- "epoch": 0.46463450725066585,
1169
- "grad_norm": 225.0,
1170
- "learning_rate": 6.65933381902329e-05,
1171
- "loss": 7.5162,
1172
- "step": 1570
1173
- },
1174
- {
1175
- "epoch": 0.4675939627108612,
1176
- "grad_norm": 186.0,
1177
- "learning_rate": 6.534951547402322e-05,
1178
- "loss": 7.7924,
1179
- "step": 1580
1180
- },
1181
- {
1182
- "epoch": 0.4705534181710565,
1183
- "grad_norm": 404.0,
1184
- "learning_rate": 6.411175558929152e-05,
1185
- "loss": 7.579,
1186
- "step": 1590
1187
- },
1188
- {
1189
- "epoch": 0.47351287363125183,
1190
- "grad_norm": 600.0,
1191
- "learning_rate": 6.28802751081779e-05,
1192
- "loss": 7.3673,
1193
- "step": 1600
1194
- },
1195
- {
1196
- "epoch": 0.47351287363125183,
1197
- "eval_loss": 7.5222601890563965,
1198
- "eval_runtime": 10.7021,
1199
- "eval_samples_per_second": 140.346,
1200
- "eval_steps_per_second": 35.133,
1201
- "step": 1600
1202
- },
1203
- {
1204
- "epoch": 0.47647232909144716,
1205
- "grad_norm": 306.0,
1206
- "learning_rate": 6.165528950410884e-05,
1207
- "loss": 7.6311,
1208
- "step": 1610
1209
- },
1210
- {
1211
- "epoch": 0.4794317845516425,
1212
- "grad_norm": 1808.0,
1213
- "learning_rate": 6.0437013114095195e-05,
1214
- "loss": 7.5535,
1215
- "step": 1620
1216
- },
1217
- {
1218
- "epoch": 0.4823912400118378,
1219
- "grad_norm": 151.0,
1220
- "learning_rate": 5.922565910122967e-05,
1221
- "loss": 7.4998,
1222
- "step": 1630
1223
- },
1224
- {
1225
- "epoch": 0.48535069547203313,
1226
- "grad_norm": 792.0,
1227
- "learning_rate": 5.8021439417389444e-05,
1228
- "loss": 7.2634,
1229
- "step": 1640
1230
- },
1231
- {
1232
- "epoch": 0.48831015093222846,
1233
- "grad_norm": 80.5,
1234
- "learning_rate": 5.6824564766150726e-05,
1235
- "loss": 7.0989,
1236
- "step": 1650
1237
- },
1238
- {
1239
- "epoch": 0.4912696063924238,
1240
- "grad_norm": 172.0,
1241
- "learning_rate": 5.563524456592163e-05,
1242
- "loss": 7.6032,
1243
- "step": 1660
1244
- },
1245
- {
1246
- "epoch": 0.4942290618526191,
1247
- "grad_norm": 348.0,
1248
- "learning_rate": 5.4453686913300074e-05,
1249
- "loss": 7.2613,
1250
- "step": 1670
1251
- },
1252
- {
1253
- "epoch": 0.49718851731281444,
1254
- "grad_norm": 54.75,
1255
- "learning_rate": 5.328009854666303e-05,
1256
- "loss": 7.434,
1257
- "step": 1680
1258
- },
1259
- {
1260
- "epoch": 0.5001479727730098,
1261
- "grad_norm": 122.0,
1262
- "learning_rate": 5.2114684809993044e-05,
1263
- "loss": 7.3746,
1264
- "step": 1690
1265
- },
1266
- {
1267
- "epoch": 0.5031074282332051,
1268
- "grad_norm": 72.0,
1269
- "learning_rate": 5.095764961694922e-05,
1270
- "loss": 7.5639,
1271
- "step": 1700
1272
- },
1273
- {
1274
- "epoch": 0.5060668836934004,
1275
- "grad_norm": 59.5,
1276
- "learning_rate": 4.980919541518796e-05,
1277
- "loss": 7.8365,
1278
- "step": 1710
1279
- },
1280
- {
1281
- "epoch": 0.5090263391535957,
1282
- "grad_norm": 113.5,
1283
- "learning_rate": 4.866952315094088e-05,
1284
- "loss": 7.4722,
1285
- "step": 1720
1286
- },
1287
- {
1288
- "epoch": 0.511985794613791,
1289
- "grad_norm": 92.0,
1290
- "learning_rate": 4.753883223385467e-05,
1291
- "loss": 7.6937,
1292
- "step": 1730
1293
- },
1294
- {
1295
- "epoch": 0.5149452500739864,
1296
- "grad_norm": 178.0,
1297
- "learning_rate": 4.6417320502100316e-05,
1298
- "loss": 7.4405,
1299
- "step": 1740
1300
- },
1301
- {
1302
- "epoch": 0.5179047055341817,
1303
- "grad_norm": 72.5,
1304
- "learning_rate": 4.530518418775733e-05,
1305
- "loss": 7.4937,
1306
- "step": 1750
1307
- },
1308
- {
1309
- "epoch": 0.520864160994377,
1310
- "grad_norm": 25.625,
1311
- "learning_rate": 4.4202617882478405e-05,
1312
- "loss": 7.9844,
1313
- "step": 1760
1314
- },
1315
- {
1316
- "epoch": 0.5238236164545723,
1317
- "grad_norm": 86.5,
1318
- "learning_rate": 4.310981450344189e-05,
1319
- "loss": 7.3943,
1320
- "step": 1770
1321
- },
1322
- {
1323
- "epoch": 0.5267830719147677,
1324
- "grad_norm": 229.0,
1325
- "learning_rate": 4.2026965259596666e-05,
1326
- "loss": 7.6026,
1327
- "step": 1780
1328
- },
1329
- {
1330
- "epoch": 0.529742527374963,
1331
- "grad_norm": 35.0,
1332
- "learning_rate": 4.0954259618206295e-05,
1333
- "loss": 7.4426,
1334
- "step": 1790
1335
- },
1336
- {
1337
- "epoch": 0.5327019828351583,
1338
- "grad_norm": 110.5,
1339
- "learning_rate": 3.9891885271697496e-05,
1340
- "loss": 7.7448,
1341
- "step": 1800
1342
- },
1343
- {
1344
- "epoch": 0.5327019828351583,
1345
- "eval_loss": 7.529196739196777,
1346
- "eval_runtime": 9.1371,
1347
- "eval_samples_per_second": 164.385,
1348
- "eval_steps_per_second": 41.151,
1349
- "step": 1800
1350
- },
1351
- {
1352
- "epoch": 0.5356614382953536,
1353
- "grad_norm": 63.25,
1354
- "learning_rate": 3.884002810481958e-05,
1355
- "loss": 7.6213,
1356
- "step": 1810
1357
- },
1358
- {
1359
- "epoch": 0.538620893755549,
1360
- "grad_norm": 308.0,
1361
- "learning_rate": 3.779887216211995e-05,
1362
- "loss": 7.3711,
1363
- "step": 1820
1364
- },
1365
- {
1366
- "epoch": 0.5415803492157443,
1367
- "grad_norm": 75.5,
1368
- "learning_rate": 3.676859961574162e-05,
1369
- "loss": 7.2565,
1370
- "step": 1830
1371
- },
1372
- {
1373
- "epoch": 0.5445398046759397,
1374
- "grad_norm": 85.5,
1375
- "learning_rate": 3.574939073354838e-05,
1376
- "loss": 7.5128,
1377
- "step": 1840
1378
- },
1379
- {
1380
- "epoch": 0.5474992601361349,
1381
- "grad_norm": 151.0,
1382
- "learning_rate": 3.4741423847583134e-05,
1383
- "loss": 7.4537,
1384
- "step": 1850
1385
- },
1386
- {
1387
- "epoch": 0.5504587155963303,
1388
- "grad_norm": 206.0,
1389
- "learning_rate": 3.3744875322865034e-05,
1390
- "loss": 7.6777,
1391
- "step": 1860
1392
- },
1393
- {
1394
- "epoch": 0.5534181710565256,
1395
- "grad_norm": 51.0,
1396
- "learning_rate": 3.275991952653054e-05,
1397
- "loss": 7.3127,
1398
- "step": 1870
1399
- },
1400
- {
1401
- "epoch": 0.556377626516721,
1402
- "grad_norm": 254.0,
1403
- "learning_rate": 3.178672879732435e-05,
1404
- "loss": 7.5413,
1405
- "step": 1880
1406
- },
1407
- {
1408
- "epoch": 0.5593370819769162,
1409
- "grad_norm": 510.0,
1410
- "learning_rate": 3.0825473415445074e-05,
1411
- "loss": 7.3452,
1412
- "step": 1890
1413
- },
1414
- {
1415
- "epoch": 0.5622965374371116,
1416
- "grad_norm": 972.0,
1417
- "learning_rate": 2.9876321572751144e-05,
1418
- "loss": 7.8002,
1419
- "step": 1900
1420
- },
1421
- {
1422
- "epoch": 0.5652559928973069,
1423
- "grad_norm": 135.0,
1424
- "learning_rate": 2.8939439343332086e-05,
1425
- "loss": 7.638,
1426
- "step": 1910
1427
- },
1428
- {
1429
- "epoch": 0.5682154483575023,
1430
- "grad_norm": 91.0,
1431
- "learning_rate": 2.8014990654450325e-05,
1432
- "loss": 7.1467,
1433
- "step": 1920
1434
- },
1435
- {
1436
- "epoch": 0.5711749038176975,
1437
- "grad_norm": 189.0,
1438
- "learning_rate": 2.7103137257858868e-05,
1439
- "loss": 7.1859,
1440
- "step": 1930
1441
- },
1442
- {
1443
- "epoch": 0.5741343592778929,
1444
- "grad_norm": 82.0,
1445
- "learning_rate": 2.6204038701499056e-05,
1446
- "loss": 7.5085,
1447
- "step": 1940
1448
- },
1449
- {
1450
- "epoch": 0.5770938147380882,
1451
- "grad_norm": 344.0,
1452
- "learning_rate": 2.5317852301584643e-05,
1453
- "loss": 7.0502,
1454
- "step": 1950
1455
- },
1456
- {
1457
- "epoch": 0.5800532701982836,
1458
- "grad_norm": 168.0,
1459
- "learning_rate": 2.4444733115075823e-05,
1460
- "loss": 7.6417,
1461
- "step": 1960
1462
- },
1463
- {
1464
- "epoch": 0.5830127256584788,
1465
- "grad_norm": 77.5,
1466
- "learning_rate": 2.3584833912548888e-05,
1467
- "loss": 7.4358,
1468
- "step": 1970
1469
- },
1470
- {
1471
- "epoch": 0.5859721811186742,
1472
- "grad_norm": 30.625,
1473
- "learning_rate": 2.2738305151465645e-05,
1474
- "loss": 7.329,
1475
- "step": 1980
1476
- },
1477
- {
1478
- "epoch": 0.5889316365788695,
1479
- "grad_norm": 474.0,
1480
- "learning_rate": 2.190529494984782e-05,
1481
- "loss": 7.4855,
1482
- "step": 1990
1483
- },
1484
- {
1485
- "epoch": 0.5918910920390648,
1486
- "grad_norm": 115.5,
1487
- "learning_rate": 2.1085949060360654e-05,
1488
- "loss": 7.6519,
1489
- "step": 2000
1490
- },
1491
- {
1492
- "epoch": 0.5918910920390648,
1493
- "eval_loss": 7.52925443649292,
1494
- "eval_runtime": 9.1246,
1495
- "eval_samples_per_second": 164.61,
1496
- "eval_steps_per_second": 41.207,
1497
- "step": 2000
1498
- },
1499
- {
1500
- "epoch": 0.5948505474992601,
1501
- "grad_norm": 73.5,
1502
- "learning_rate": 2.0280410844810428e-05,
1503
- "loss": 7.5898,
1504
- "step": 2010
1505
- },
1506
- {
1507
- "epoch": 0.5978100029594554,
1508
- "grad_norm": 29.625,
1509
- "learning_rate": 1.9488821249060297e-05,
1510
- "loss": 7.3119,
1511
- "step": 2020
1512
- },
1513
- {
1514
- "epoch": 0.6007694584196508,
1515
- "grad_norm": 185.0,
1516
- "learning_rate": 1.871131877836879e-05,
1517
- "loss": 7.3793,
1518
- "step": 2030
1519
- },
1520
- {
1521
- "epoch": 0.6037289138798461,
1522
- "grad_norm": 416.0,
1523
- "learning_rate": 1.7948039473155554e-05,
1524
- "loss": 7.2483,
1525
- "step": 2040
1526
- },
1527
- {
1528
- "epoch": 0.6066883693400414,
1529
- "grad_norm": 2400.0,
1530
- "learning_rate": 1.7199116885197995e-05,
1531
- "loss": 7.5825,
1532
- "step": 2050
1533
- },
1534
- {
1535
- "epoch": 0.6096478248002367,
1536
- "grad_norm": 138.0,
1537
- "learning_rate": 1.646468205426377e-05,
1538
- "loss": 7.5139,
1539
- "step": 2060
1540
- },
1541
- {
1542
- "epoch": 0.6126072802604321,
1543
- "grad_norm": 1120.0,
1544
- "learning_rate": 1.5744863485182537e-05,
1545
- "loss": 7.4844,
1546
- "step": 2070
1547
- },
1548
- {
1549
- "epoch": 0.6155667357206274,
1550
- "grad_norm": 114.0,
1551
- "learning_rate": 1.5039787125361326e-05,
1552
- "loss": 7.3547,
1553
- "step": 2080
1554
- },
1555
- {
1556
- "epoch": 0.6185261911808227,
1557
- "grad_norm": 126.5,
1558
- "learning_rate": 1.4349576342747462e-05,
1559
- "loss": 7.3867,
1560
- "step": 2090
1561
- },
1562
- {
1563
- "epoch": 0.621485646641018,
1564
- "grad_norm": 78.0,
1565
- "learning_rate": 1.3674351904242611e-05,
1566
- "loss": 7.194,
1567
- "step": 2100
1568
- },
1569
- {
1570
- "epoch": 0.6244451021012134,
1571
- "grad_norm": 77.5,
1572
- "learning_rate": 1.3014231954572287e-05,
1573
- "loss": 7.3503,
1574
- "step": 2110
1575
- },
1576
- {
1577
- "epoch": 0.6274045575614087,
1578
- "grad_norm": 230.0,
1579
- "learning_rate": 1.2369331995613665e-05,
1580
- "loss": 7.3932,
1581
- "step": 2120
1582
- },
1583
- {
1584
- "epoch": 0.630364013021604,
1585
- "grad_norm": 61.0,
1586
- "learning_rate": 1.173976486618631e-05,
1587
- "loss": 7.4046,
1588
- "step": 2130
1589
- },
1590
- {
1591
- "epoch": 0.6333234684817993,
1592
- "grad_norm": 113.5,
1593
- "learning_rate": 1.1125640722308628e-05,
1594
- "loss": 7.4239,
1595
- "step": 2140
1596
- },
1597
- {
1598
- "epoch": 0.6362829239419947,
1599
- "grad_norm": 192.0,
1600
- "learning_rate": 1.0527067017923654e-05,
1601
- "loss": 7.8214,
1602
- "step": 2150
1603
- },
1604
- {
1605
- "epoch": 0.63924237940219,
1606
- "grad_norm": 122.0,
1607
- "learning_rate": 9.944148486097793e-06,
1608
- "loss": 7.4993,
1609
- "step": 2160
1610
- },
1611
- {
1612
- "epoch": 0.6422018348623854,
1613
- "grad_norm": 87.5,
1614
- "learning_rate": 9.376987120695545e-06,
1615
- "loss": 7.2951,
1616
- "step": 2170
1617
- },
1618
- {
1619
- "epoch": 0.6451612903225806,
1620
- "grad_norm": 44.75,
1621
- "learning_rate": 8.825682158533554e-06,
1622
- "loss": 7.2342,
1623
- "step": 2180
1624
- },
1625
- {
1626
- "epoch": 0.648120745782776,
1627
- "grad_norm": 298.0,
1628
- "learning_rate": 8.290330062017016e-06,
1629
- "loss": 7.3409,
1630
- "step": 2190
1631
- },
1632
- {
1633
- "epoch": 0.6510802012429713,
1634
- "grad_norm": 133.0,
1635
- "learning_rate": 7.771024502261526e-06,
1636
- "loss": 7.1213,
1637
- "step": 2200
1638
- },
1639
- {
1640
- "epoch": 0.6510802012429713,
1641
- "eval_loss": 7.390503406524658,
1642
- "eval_runtime": 8.9829,
1643
- "eval_samples_per_second": 167.207,
1644
- "eval_steps_per_second": 41.857,
1645
- "step": 2200
1646
- },
1647
- {
1648
- "epoch": 0.6540396567031667,
1649
- "grad_norm": 94.0,
1650
- "learning_rate": 7.267856342703461e-06,
1651
- "loss": 7.5047,
1652
- "step": 2210
1653
- },
1654
- {
1655
- "epoch": 0.6569991121633619,
1656
- "grad_norm": 320.0,
1657
- "learning_rate": 6.780913623201346e-06,
1658
- "loss": 7.2716,
1659
- "step": 2220
1660
- },
1661
- {
1662
- "epoch": 0.6599585676235573,
1663
- "grad_norm": 21.0,
1664
- "learning_rate": 6.310281544631546e-06,
1665
- "loss": 7.4708,
1666
- "step": 2230
1667
- },
1668
- {
1669
- "epoch": 0.6629180230837526,
1670
- "grad_norm": 233.0,
1671
- "learning_rate": 5.856042453980526e-06,
1672
- "loss": 7.4026,
1673
- "step": 2240
1674
- },
1675
- {
1676
- "epoch": 0.665877478543948,
1677
- "grad_norm": 170.0,
1678
- "learning_rate": 5.418275829936537e-06,
1679
- "loss": 7.0923,
1680
- "step": 2250
1681
- },
1682
- {
1683
- "epoch": 0.6688369340041432,
1684
- "grad_norm": 57.75,
1685
- "learning_rate": 4.997058268983135e-06,
1686
- "loss": 7.6658,
1687
- "step": 2260
1688
- },
1689
- {
1690
- "epoch": 0.6717963894643386,
1691
- "grad_norm": 264.0,
1692
- "learning_rate": 4.592463471997022e-06,
1693
- "loss": 7.3902,
1694
- "step": 2270
1695
- },
1696
- {
1697
- "epoch": 0.6747558449245339,
1698
- "grad_norm": 101.0,
1699
- "learning_rate": 4.204562231352516e-06,
1700
- "loss": 7.3701,
1701
- "step": 2280
1702
- },
1703
- {
1704
- "epoch": 0.6777153003847292,
1705
- "grad_norm": 38.25,
1706
- "learning_rate": 3.83342241853496e-06,
1707
- "loss": 7.2345,
1708
- "step": 2290
1709
- },
1710
- {
1711
- "epoch": 0.6806747558449245,
1712
- "grad_norm": 84.0,
1713
- "learning_rate": 3.4791089722651436e-06,
1714
- "loss": 7.3887,
1715
- "step": 2300
1716
- },
1717
- {
1718
- "epoch": 0.6836342113051198,
1719
- "grad_norm": 199.0,
1720
- "learning_rate": 3.1416838871368924e-06,
1721
- "loss": 7.2888,
1722
- "step": 2310
1723
- },
1724
- {
1725
- "epoch": 0.6865936667653152,
1726
- "grad_norm": 34.75,
1727
- "learning_rate": 2.821206202769899e-06,
1728
- "loss": 7.4502,
1729
- "step": 2320
1730
- },
1731
- {
1732
- "epoch": 0.6895531222255105,
1733
- "grad_norm": 121.5,
1734
- "learning_rate": 2.5177319934794e-06,
1735
- "loss": 7.3453,
1736
- "step": 2330
1737
- },
1738
- {
1739
- "epoch": 0.6925125776857058,
1740
- "grad_norm": 398.0,
1741
- "learning_rate": 2.2313143584648423e-06,
1742
- "loss": 7.489,
1743
- "step": 2340
1744
- },
1745
- {
1746
- "epoch": 0.6954720331459011,
1747
- "grad_norm": 840.0,
1748
- "learning_rate": 1.9620034125190644e-06,
1749
- "loss": 7.2602,
1750
- "step": 2350
1751
- },
1752
- {
1753
- "epoch": 0.6984314886060965,
1754
- "grad_norm": 47.0,
1755
- "learning_rate": 1.7098462772596302e-06,
1756
- "loss": 7.4021,
1757
- "step": 2360
1758
- },
1759
- {
1760
- "epoch": 0.7013909440662918,
1761
- "grad_norm": 32.5,
1762
- "learning_rate": 1.4748870728839347e-06,
1763
- "loss": 7.2297,
1764
- "step": 2370
1765
- },
1766
- {
1767
- "epoch": 0.7043503995264871,
1768
- "grad_norm": 25.375,
1769
- "learning_rate": 1.2571669104494256e-06,
1770
- "loss": 7.3295,
1771
- "step": 2380
1772
- },
1773
- {
1774
- "epoch": 0.7073098549866824,
1775
- "grad_norm": 290.0,
1776
- "learning_rate": 1.0567238846803996e-06,
1777
- "loss": 7.3048,
1778
- "step": 2390
1779
- },
1780
- {
1781
- "epoch": 0.7102693104468778,
1782
- "grad_norm": 74.0,
1783
- "learning_rate": 8.735930673024806e-07,
1784
- "loss": 7.2075,
1785
- "step": 2400
1786
- },
1787
- {
1788
- "epoch": 0.7102693104468778,
1789
- "eval_loss": 7.393959045410156,
1790
- "eval_runtime": 9.0065,
1791
- "eval_samples_per_second": 166.769,
1792
- "eval_steps_per_second": 41.748,
1793
- "step": 2400
1794
  }
1795
  ],
1796
  "logging_steps": 10,
@@ -1810,7 +330,7 @@
1810
  "attributes": {}
1811
  }
1812
  },
1813
- "total_flos": 5276812456230912.0,
1814
  "train_batch_size": 4,
1815
  "trial_name": null,
1816
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.11837821840781296,
5
  "eval_steps": 200,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0002959455460195324,
13
+ "eval_loss": 10.377325057983398,
14
+ "eval_runtime": 11.0363,
15
+ "eval_samples_per_second": 136.096,
16
+ "eval_steps_per_second": 34.069,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.002959455460195324,
21
+ "grad_norm": 0.43359375,
22
  "learning_rate": 1.6000000000000003e-05,
23
+ "loss": 10.3751,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.005918910920390648,
28
+ "grad_norm": 0.51171875,
29
  "learning_rate": 3.2000000000000005e-05,
30
+ "loss": 10.3738,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.008878366380585973,
35
+ "grad_norm": 0.640625,
36
  "learning_rate": 4.8e-05,
37
+ "loss": 10.3766,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.011837821840781295,
42
+ "grad_norm": 0.8359375,
43
  "learning_rate": 6.400000000000001e-05,
44
+ "loss": 10.3761,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 0.01479727730097662,
49
+ "grad_norm": 1.5859375,
50
  "learning_rate": 8e-05,
51
+ "loss": 10.369,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.017756732761171946,
56
+ "grad_norm": 0.462890625,
57
  "learning_rate": 9.6e-05,
58
+ "loss": 10.3704,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.020716188221367268,
63
+ "grad_norm": 0.6015625,
64
  "learning_rate": 0.00011200000000000001,
65
+ "loss": 10.362,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.02367564368156259,
70
+ "grad_norm": 0.76171875,
71
  "learning_rate": 0.00012800000000000002,
72
+ "loss": 10.3539,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.026635099141757917,
77
+ "grad_norm": 1.1484375,
78
  "learning_rate": 0.000144,
79
+ "loss": 10.3111,
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.02959455460195324,
84
+ "grad_norm": 2.53125,
85
  "learning_rate": 0.00016,
86
+ "loss": 10.2556,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.032554010062148565,
91
+ "grad_norm": 0.7578125,
92
  "learning_rate": 0.00017600000000000002,
93
+ "loss": 10.1628,
94
  "step": 110
95
  },
96
  {
97
  "epoch": 0.03551346552234389,
98
+ "grad_norm": 0.66796875,
99
  "learning_rate": 0.000192,
100
+ "loss": 10.0397,
101
  "step": 120
102
  },
103
  {
104
  "epoch": 0.03847292098253921,
105
+ "grad_norm": 0.7421875,
106
  "learning_rate": 0.0001999978128380225,
107
+ "loss": 9.9248,
108
  "step": 130
109
  },
110
  {
111
  "epoch": 0.041432376442734536,
112
+ "grad_norm": 0.88671875,
113
  "learning_rate": 0.0001999803161162393,
114
+ "loss": 9.8201,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.04439183190292986,
119
+ "grad_norm": 1.78125,
120
  "learning_rate": 0.00019994532573409262,
121
+ "loss": 9.7523,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 0.04735128736312518,
126
+ "grad_norm": 0.609375,
127
  "learning_rate": 0.00019989284781388617,
128
+ "loss": 9.6134,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 0.05031074282332051,
133
+ "grad_norm": 0.57421875,
134
  "learning_rate": 0.00019982289153773646,
135
+ "loss": 9.5326,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 0.053270198283515834,
140
+ "grad_norm": 0.6640625,
141
  "learning_rate": 0.00019973546914596623,
142
+ "loss": 9.4273,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 0.05622965374371116,
147
+ "grad_norm": 0.90234375,
148
  "learning_rate": 0.00019963059593496268,
149
+ "loss": 9.3176,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 0.05918910920390648,
154
+ "grad_norm": 1.6015625,
155
  "learning_rate": 0.00019950829025450114,
156
+ "loss": 9.2412,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.05918910920390648,
161
+ "eval_loss": 9.224679946899414,
162
+ "eval_runtime": 11.174,
163
+ "eval_samples_per_second": 134.419,
164
+ "eval_steps_per_second": 33.649,
165
  "step": 200
166
  },
167
  {
168
  "epoch": 0.062148564664101805,
169
+ "grad_norm": 0.640625,
170
  "learning_rate": 0.0001993685735045343,
171
+ "loss": 9.1904,
172
  "step": 210
173
  },
174
  {
175
  "epoch": 0.06510802012429713,
176
+ "grad_norm": 0.71484375,
177
  "learning_rate": 0.0001992114701314478,
178
+ "loss": 9.1332,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.06806747558449246,
183
+ "grad_norm": 0.703125,
184
  "learning_rate": 0.000199037007623783,
185
+ "loss": 9.0845,
186
  "step": 230
187
  },
188
  {
189
  "epoch": 0.07102693104468778,
190
+ "grad_norm": 0.83984375,
191
  "learning_rate": 0.00019884521650742715,
192
+ "loss": 8.992,
193
  "step": 240
194
  },
195
  {
196
  "epoch": 0.0739863865048831,
197
+ "grad_norm": 2.59375,
198
  "learning_rate": 0.00019863613034027224,
199
+ "loss": 9.0056,
200
  "step": 250
201
  },
202
  {
203
  "epoch": 0.07694584196507842,
204
+ "grad_norm": 0.6953125,
205
  "learning_rate": 0.0001984097857063434,
206
+ "loss": 8.8204,
207
  "step": 260
208
  },
209
  {
210
  "epoch": 0.07990529742527375,
211
+ "grad_norm": 0.77734375,
212
  "learning_rate": 0.0001981662222093976,
213
+ "loss": 8.8472,
214
  "step": 270
215
  },
216
  {
217
  "epoch": 0.08286475288546907,
218
+ "grad_norm": 0.70703125,
219
  "learning_rate": 0.00019790548246599447,
220
+ "loss": 8.785,
221
  "step": 280
222
  },
223
  {
224
  "epoch": 0.0858242083456644,
225
+ "grad_norm": 0.8515625,
226
  "learning_rate": 0.00019762761209803927,
227
+ "loss": 8.7212,
228
  "step": 290
229
  },
230
  {
231
  "epoch": 0.08878366380585972,
232
+ "grad_norm": 1.6484375,
233
  "learning_rate": 0.0001973326597248006,
234
+ "loss": 8.748,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.09174311926605505,
239
+ "grad_norm": 0.609375,
240
  "learning_rate": 0.00019702067695440332,
241
+ "loss": 8.6333,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 0.09470257472625036,
246
+ "grad_norm": 0.6171875,
247
  "learning_rate": 0.00019669171837479873,
248
+ "loss": 8.6464,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 0.09766203018644569,
253
+ "grad_norm": 0.640625,
254
  "learning_rate": 0.00019634584154421317,
255
+ "loss": 8.6169,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 0.10062148564664102,
260
+ "grad_norm": 0.79296875,
261
  "learning_rate": 0.00019598310698107702,
262
+ "loss": 8.541,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 0.10358094110683634,
267
+ "grad_norm": 1.8203125,
268
  "learning_rate": 0.00019560357815343577,
269
+ "loss": 8.4839,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 0.10654039656703167,
274
+ "grad_norm": 0.77734375,
275
  "learning_rate": 0.00019520732146784491,
276
+ "loss": 8.5185,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 0.109499852027227,
281
+ "grad_norm": 1.0703125,
282
  "learning_rate": 0.0001947944062577507,
283
+ "loss": 8.4832,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 0.11245930748742232,
288
+ "grad_norm": 0.734375,
289
  "learning_rate": 0.00019436490477135878,
290
+ "loss": 8.4174,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 0.11541876294761765,
295
+ "grad_norm": 0.84375,
296
  "learning_rate": 0.00019391889215899299,
297
+ "loss": 8.477,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 0.11837821840781296,
302
+ "grad_norm": 1.9921875,
303
  "learning_rate": 0.0001934564464599461,
304
+ "loss": 8.2947,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 0.11837821840781296,
309
+ "eval_loss": 8.401090621948242,
310
+ "eval_runtime": 12.2623,
311
+ "eval_samples_per_second": 122.489,
312
+ "eval_steps_per_second": 30.663,
313
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  }
315
  ],
316
  "logging_steps": 10,
 
330
  "attributes": {}
331
  }
332
  },
333
+ "total_flos": 20596742160384.0,
334
  "train_batch_size": 4,
335
  "trial_name": null,
336
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cb39bdcd9027f9839c24593aa51e9c2c6db52de8ff1d60ef6eace2d38a2b7bf
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ffd459aa83253f73e371e129574dd0434ee79c2c18eb103b0e4428a34062eb2
3
  size 6840