k1h0 committed on Apr 29

Commit

0208592

verified ·

1 Parent(s): c117833

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

README.md +60 -0
added_tokens.json +44 -0
all_results.json +9 -0
config.json +36 -0
generation_config.json +6 -0
llamaboard_config.yaml +77 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +298 -0
special_tokens_map.json +34 -0
tokenization_inflm.py +292 -0
tokenizer.model +3 -0
tokenizer_config.json +396 -0
train_results.json +9 -0
trainer_log.jsonl +114 -0
trainer_state.json +947 -0
training_args.bin +3 -0
training_args.yaml +39 -0
training_loss.png +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,60 @@

+---
+library_name: transformers
+license: other
+base_model: infly/OpenCoder-8B-Instruct
+tags:
+- llama-factory
+- freeze
+- generated_from_trainer
+model-index:
+- name: opencoder_nsx
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# opencoder_nsx
+This model is a fine-tuned version of [infly/OpenCoder-8B-Instruct](https://huggingface.co/infly/OpenCoder-8B-Instruct) on the codes_330k_nsx dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 512
+- total_eval_batch_size: 32
+- optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- num_epochs: 1.0
+### Training results
+### Framework versions
+- Transformers 4.48.2
+- Pytorch 2.5.1+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "<code_to_intermediate>": 96521,
+  "<empty_output>": 96520,
+  "<file_sep>": 96511,
+  "<fim_middle>": 96508,
+  "<fim_prefix>": 96507,
+  "<fim_suffix>": 96509,
+  "<intermediate_to_code>": 96522,
+  "<issue_closed>": 96514,
+  "<issue_comment>": 96513,
+  "<issue_start>": 96512,
+  "<jupyter_code>": 96517,
+  "<jupyter_output>": 96518,
+  "<jupyter_script>": 96519,
+  "<jupyter_start>": 96515,
+  "<jupyter_text>": 96516,
+  "<pr>": 96523,
+  "<pr_base>": 96526,
+  "<pr_base_code>": 96528,
+  "<pr_comment>": 96531,
+  "<pr_diff>": 96529,
+  "<pr_diff_hunk>": 96530,
+  "<pr_diff_hunk_comment_line>": 96538,
+  "<pr_event_id>": 96532,
+  "<pr_file>": 96527,
+  "<pr_in_reply_to_comment_id>": 96537,
+  "<pr_in_reply_to_review_id>": 96536,
+  "<pr_is_merged>": 96525,
+  "<pr_review>": 96533,
+  "<pr_review_comment>": 96535,
+  "<pr_review_state>": 96534,
+  "<pr_status>": 96524,
+  "<repo_name>": 96510,
+  "<|endoftext|>": 96506,
+  "<|end|>": 96500,
+  "<|im_end|>": 96539,
+  "<|im_start|>": 96540,
+  "<|message|>": 96501,
+  "<|pad|>": 96505,
+  "<|start|>": 96499,
+  "<|tool_end|>": 96504,
+  "<|tool_excute|>": 96503,
+  "<|tool_start|>": 96502
+}

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9977924944812362,
+    "num_input_tokens_seen": 236978176,
+    "total_flos": 1.0486889344470614e+19,
+    "train_loss": 0.7524756815581195,
+    "train_runtime": 19134.3763,
+    "train_samples_per_second": 3.027,
+    "train_steps_per_second": 0.006
+}

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "infly/OpenCoder-8B-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 96540,
+  "eos_token_id": 96539,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 1.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.2",
+  "use_cache": false,
+  "vocab_size": 96640
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 96540,
+  "eos_token_id": 96539,
+  "transformers_version": "4.48.2"
+}

llamaboard_config.yaml ADDED Viewed

	@@ -0,0 +1,77 @@

+top.booster: liger_kernel
+top.checkpoint_path: null
+top.finetuning_type: freeze
+top.model_name: OpenCoder-8B-Instruct
+top.quantization_bit: none
+top.quantization_method: bitsandbytes
+top.rope_scaling: llama3
+top.template: opencoder
+train.additional_target: ''
+train.apollo_rank: 256
+train.apollo_scale: 1
+train.apollo_target: all
+train.apollo_update_interval: 200
+train.badam_mode: layer
+train.badam_switch_interval: 50
+train.badam_switch_mode: ascending
+train.badam_update_ratio: 0.05
+train.batch_size: 16
+train.compute_type: bf16
+train.create_new_adapter: false
+train.cutoff_len: 4096
+train.dataset:
+- codes_330k_nsx
+train.dataset_dir: data
+train.ds_offload: false
+train.ds_stage: none
+train.extra_args: '{}'
+train.freeze_extra_modules: ''
+train.freeze_trainable_layers: 2
+train.freeze_trainable_modules: all
+train.galore_rank: 16
+train.galore_scale: 2
+train.galore_target: all
+train.galore_update_interval: 200
+train.gradient_accumulation_steps: 8
+train.learning_rate: 5e-5
+train.logging_steps: 1
+train.lora_alpha: 16
+train.lora_dropout: 0
+train.lora_rank: 8
+train.lora_target: ''
+train.loraplus_lr_ratio: 0
+train.lr_scheduler_type: cosine
+train.mask_history: false
+train.max_grad_norm: '1.0'
+train.max_samples: '50000000'
+train.neat_packing: true
+train.neftune_alpha: 0
+train.num_train_epochs: '1'
+train.packing: true
+train.ppo_score_norm: false
+train.ppo_whiten_rewards: false
+train.pref_beta: 0.1
+train.pref_ftx: 0
+train.pref_loss: sigmoid
+train.report_to:
+- none
+train.resize_vocab: false
+train.reward_model: null
+train.save_steps: 500
+train.swanlab_api_key: ''
+train.swanlab_mode: cloud
+train.swanlab_project: llamafactory
+train.swanlab_run_name: ''
+train.swanlab_workspace: ''
+train.train_on_prompt: false
+train.training_stage: Supervised Fine-Tuning
+train.use_apollo: true
+train.use_badam: false
+train.use_dora: false
+train.use_galore: false
+train.use_llama_pro: true
+train.use_pissa: false
+train.use_rslora: false
+train.use_swanlab: false
+train.val_size: 0
+train.warmup_steps: 0

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:182be20097743536eb8333475d94fdb0eeea45e2d7f20103b65a61c2771ee9bf
+size 4919027568

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd94e601a2ea58138028ded43fe07b1345edc5df4b3b30449de0d09bceee1231
+size 4915915128

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ed95d0f7caa2694d620d6e75edb8341e121d82b09c4688cc831b67af335c19d
+size 4999819112

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fd964e5239ecdc56be2c5745bfff1e4fc8415c9d83f6faa3015a7b54b333926
+size 1580246000

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,298 @@

+{
+  "metadata": {
+    "total_size": 16414973952
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>"
+  ],
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenization_inflm.py ADDED Viewed

	@@ -0,0 +1,292 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for INFLMTokenizer."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+from tokenizers import pre_tokenizers,Regex,decoders
+from tokenizers.pre_tokenizers import Digits, Split, ByteLevel
+import os
+# Pre-tokenization split pattern, based on the GPT-4 cl100k_base regex.
+# The literal is a normal (non-raw) Python string: "\r" and "\n" become real CR/LF
+# characters before the pattern reaches `Regex`, while "\p", "\s" and "\S" are not
+# Python escapes, so their backslashes are passed through unchanged (recent Python
+# versions emit an invalid-escape warning for them).
+# NOTE(review): the last alternative here is `\s+\s+(\S)+`, whereas the published
+# cl100k_base pattern ends with a plain `\s+` - confirm the difference is intentional.
+PATTERN = Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+\s+(\S)+")
+# Module-level logger obtained from the transformers logging helper.
+logger = logging.get_logger(__name__)
+# File name used by `save_vocabulary` for the SentencePiece model (note the leading "./").
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+# No pretrained vocabulary locations are registered for this tokenizer.
+PRETRAINED_VOCAB_FILES_MAP = {}
+class INFLMTokenizer(PreTrainedTokenizer):
+    """
+    Construct an INFLM tokenizer backed by a SentencePiece model.
+    Input text is first split with the module-level regex `PATTERN`, and each
+    resulting chunk is then encoded separately by the SentencePiece model.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file (a SentencePiece model file).
+    """
+    # File names and (empty) pretrained locations declared at module level.
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    # `token_type_ids` is deliberately absent from the model inputs listed here.
+    model_input_names = ["input_ids", "attention_mask"]
+    # NOTE(review): presumably used by transformers to register this class under AutoTokenizer - confirm.
+    _auto_class = "AutoTokenizer"
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=False,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        spaces_between_special_tokens=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
+        self.pre_tokenizer = pre_tokenizers.Sequence([Split(pattern =PATTERN,behavior = "isolated", invert = False)])
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+        """ Initialisation"""
+    @property
+    def no_prefix_space_tokens(self):
+        if self._no_prefix_space_tokens is None:
+            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+        return self._no_prefix_space_tokens
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+    @property
+    def bos_token_id(self) -> Optional[int]:
+        return self.sp_model.bos_id()
+    @property
+    def eos_token_id(self) -> Optional[int]:
+        return self.sp_model.eos_id()
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        splits = self.pre_tokenizer.pre_tokenize_str(text)
+        texts=[]
+        for split in splits:
+            texts.extend(self.sp_model.encode(split[0], out_type=str))
+        return texts
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        if tokens and tokens[0] not in self.no_prefix_space_tokens:
+            return " " + decoded
+        else:
+            return decoded
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+        output = bos_token_ids + token_ids_0
+        if token_ids_1 is not None:
+            output = output + token_ids_1
+        if self.add_eos_token:
+            output = output + [self.eos_token_id]
+        return output
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return  ([0] * len(token_ids_0)) + eos_token_id
+        return  ([0] * len(token_ids_0)) + eos_token_id + ([0] * len(token_ids_1)) + eos_token_id
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+        Note this is only used for back compatiblity, thus list of zero is returned.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        eos = [self.eos_token_id]
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+    @property
+    def default_chat_template(self):
+        return None
+    def decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        # default spaces_between_special_tokens should be false.
+        if spaces_between_special_tokens:
+            logger.warning_once('spaces_between_special_tokens is set. \
+                                It has no effect for bos,eos,pad,unk when transformers<=4.38.')
+        return super().decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76d43d618fc0c5a7c79dc4e72579f9f29bb803b36e4a4d709d1233626fd8fe2a
+size 1535725

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,396 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96499": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96500": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96501": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96502": {
+      "content": "<|tool_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96503": {
+      "content": "<|tool_excute|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96504": {
+      "content": "<|tool_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96505": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96506": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96507": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96508": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96509": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96510": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96511": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96512": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96513": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96514": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96515": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96516": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96517": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96518": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96519": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96520": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96521": {
+      "content": "<code_to_intermediate>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96522": {
+      "content": "<intermediate_to_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96523": {
+      "content": "<pr>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96524": {
+      "content": "<pr_status>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96525": {
+      "content": "<pr_is_merged>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96526": {
+      "content": "<pr_base>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96527": {
+      "content": "<pr_file>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96528": {
+      "content": "<pr_base_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96529": {
+      "content": "<pr_diff>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96530": {
+      "content": "<pr_diff_hunk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96531": {
+      "content": "<pr_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96532": {
+      "content": "<pr_event_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96533": {
+      "content": "<pr_review>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96534": {
+      "content": "<pr_review_state>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96535": {
+      "content": "<pr_review_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96536": {
+      "content": "<pr_in_reply_to_review_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96537": {
+      "content": "<pr_in_reply_to_comment_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96538": {
+      "content": "<pr_diff_hunk_comment_line>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96539": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96540": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_inflm.INFLMTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are OpenCoder, created by OpenCoder Team.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "model_max_length": 4096,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "return_tensors": true,
+  "spaces_between_special_tokens": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "INFLMTokenizer",
+  "unk_token": "<unk>"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9977924944812362,
+    "num_input_tokens_seen": 236978176,
+    "total_flos": 1.0486889344470614e+19,
+    "train_loss": 0.7524756815581195,
+    "train_runtime": 19134.3763,
+    "train_samples_per_second": 3.027,
+    "train_steps_per_second": 0.006
+}

trainer_log.jsonl ADDED Viewed

	@@ -0,0 +1,114 @@

+{"current_steps": 1, "total_steps": 113, "loss": 0.9875, "lr": 4.999033893736386e-05, "epoch": 0.008830022075055188, "percentage": 0.88, "elapsed_time": "0:02:59", "remaining_time": "5:34:56", "throughput": 11687.78, "total_tokens": 2097152}
+{"current_steps": 2, "total_steps": 113, "loss": 0.944, "lr": 4.99613632163459e-05, "epoch": 0.017660044150110375, "percentage": 1.77, "elapsed_time": "0:05:53", "remaining_time": "5:27:13", "throughput": 11856.46, "total_tokens": 4194304}
+{"current_steps": 3, "total_steps": 113, "loss": 0.904, "lr": 4.991309523184661e-05, "epoch": 0.026490066225165563, "percentage": 2.65, "elapsed_time": "0:08:43", "remaining_time": "5:20:09", "throughput": 12009.19, "total_tokens": 6291456}
+{"current_steps": 4, "total_steps": 113, "loss": 0.8913, "lr": 4.98455722894677e-05, "epoch": 0.03532008830022075, "percentage": 3.54, "elapsed_time": "0:11:34", "remaining_time": "5:15:26", "throughput": 12077.74, "total_tokens": 8388608}
+{"current_steps": 5, "total_steps": 113, "loss": 0.8511, "lr": 4.975884657667922e-05, "epoch": 0.04415011037527594, "percentage": 4.42, "elapsed_time": "0:14:24", "remaining_time": "5:11:10", "throughput": 12131.05, "total_tokens": 10485760}
+{"current_steps": 6, "total_steps": 113, "loss": 0.8311, "lr": 4.965298512248466e-05, "epoch": 0.052980132450331126, "percentage": 5.31, "elapsed_time": "0:17:14", "remaining_time": "5:07:33", "throughput": 12160.16, "total_tokens": 12582912}
+{"current_steps": 7, "total_steps": 113, "loss": 0.838, "lr": 4.952806974561518e-05, "epoch": 0.06181015452538632, "percentage": 6.19, "elapsed_time": "0:20:05", "remaining_time": "5:04:07", "throughput": 12182.29, "total_tokens": 14680064}
+{"current_steps": 8, "total_steps": 113, "loss": 0.8468, "lr": 4.9384196991293205e-05, "epoch": 0.0706401766004415, "percentage": 7.08, "elapsed_time": "0:22:54", "remaining_time": "5:00:41", "throughput": 12205.11, "total_tokens": 16777216}
+{"current_steps": 9, "total_steps": 113, "loss": 0.7884, "lr": 4.922147805661402e-05, "epoch": 0.07947019867549669, "percentage": 7.96, "elapsed_time": "0:25:44", "remaining_time": "4:57:22", "throughput": 12224.04, "total_tokens": 18874368}
+{"current_steps": 10, "total_steps": 113, "loss": 0.8009, "lr": 4.904003870460323e-05, "epoch": 0.08830022075055188, "percentage": 8.85, "elapsed_time": "0:28:34", "remaining_time": "4:54:16", "throughput": 12233.96, "total_tokens": 20971520}
+{"current_steps": 11, "total_steps": 113, "loss": 0.7764, "lr": 4.884001916701639e-05, "epoch": 0.09713024282560706, "percentage": 9.73, "elapsed_time": "0:31:23", "remaining_time": "4:51:08", "throughput": 12245.4, "total_tokens": 23068672}
+{"current_steps": 12, "total_steps": 113, "loss": 0.7774, "lr": 4.862157403595598e-05, "epoch": 0.10596026490066225, "percentage": 10.62, "elapsed_time": "0:34:14", "remaining_time": "4:48:13", "throughput": 12248.38, "total_tokens": 25165824}
+{"current_steps": 13, "total_steps": 113, "loss": 0.7808, "lr": 4.838487214438951e-05, "epoch": 0.11479028697571744, "percentage": 11.5, "elapsed_time": "0:37:04", "remaining_time": "4:45:15", "throughput": 12253.12, "total_tokens": 27262976}
+{"current_steps": 14, "total_steps": 113, "loss": 0.8111, "lr": 4.813009643566101e-05, "epoch": 0.12362030905077263, "percentage": 12.39, "elapsed_time": "0:39:54", "remaining_time": "4:42:15", "throughput": 12259.37, "total_tokens": 29360128}
+{"current_steps": 15, "total_steps": 113, "loss": 0.7997, "lr": 4.7857443822096905e-05, "epoch": 0.13245033112582782, "percentage": 13.27, "elapsed_time": "0:42:45", "remaining_time": "4:39:21", "throughput": 12261.52, "total_tokens": 31457280}
+{"current_steps": 16, "total_steps": 113, "loss": 0.7537, "lr": 4.7567125032815394e-05, "epoch": 0.141280353200883, "percentage": 14.16, "elapsed_time": "0:45:35", "remaining_time": "4:36:21", "throughput": 12267.81, "total_tokens": 33554432}
+{"current_steps": 17, "total_steps": 113, "loss": 0.7645, "lr": 4.7259364450857096e-05, "epoch": 0.15011037527593818, "percentage": 15.04, "elapsed_time": "0:48:25", "remaining_time": "4:33:27", "throughput": 12270.65, "total_tokens": 35651584}
+{"current_steps": 18, "total_steps": 113, "loss": 0.7712, "lr": 4.6934399939762746e-05, "epoch": 0.15894039735099338, "percentage": 15.93, "elapsed_time": "0:51:15", "remaining_time": "4:30:33", "throughput": 12272.81, "total_tokens": 37748736}
+{"current_steps": 19, "total_steps": 113, "loss": 0.74, "lr": 4.659248265973205e-05, "epoch": 0.16777041942604856, "percentage": 16.81, "elapsed_time": "0:54:05", "remaining_time": "4:27:38", "throughput": 12275.77, "total_tokens": 39845888}
+{"current_steps": 20, "total_steps": 113, "loss": 0.777, "lr": 4.6233876873505694e-05, "epoch": 0.17660044150110377, "percentage": 17.7, "elapsed_time": "0:56:55", "remaining_time": "4:24:43", "throughput": 12279.48, "total_tokens": 41943040}
+{"current_steps": 21, "total_steps": 113, "loss": 0.7537, "lr": 4.585885974212068e-05, "epoch": 0.18543046357615894, "percentage": 18.58, "elapsed_time": "0:59:44", "remaining_time": "4:21:45", "throughput": 12284.78, "total_tokens": 44040192}
+{"current_steps": 22, "total_steps": 113, "loss": 0.7453, "lr": 4.5467721110696685e-05, "epoch": 0.19426048565121412, "percentage": 19.47, "elapsed_time": "1:02:34", "remaining_time": "4:18:50", "throughput": 12288.21, "total_tokens": 46137344}
+{"current_steps": 23, "total_steps": 113, "loss": 0.7573, "lr": 4.5060763284419114e-05, "epoch": 0.20309050772626933, "percentage": 20.35, "elapsed_time": "1:05:24", "remaining_time": "4:15:56", "throughput": 12290.92, "total_tokens": 48234496}
+{"current_steps": 24, "total_steps": 113, "loss": 0.7626, "lr": 4.463830079489196e-05, "epoch": 0.2119205298013245, "percentage": 21.24, "elapsed_time": "1:08:13", "remaining_time": "4:13:01", "throughput": 12294.4, "total_tokens": 50331648}
+{"current_steps": 25, "total_steps": 113, "loss": 0.7558, "lr": 4.420066015704105e-05, "epoch": 0.22075055187637968, "percentage": 22.12, "elapsed_time": "1:11:03", "remaining_time": "4:10:08", "throughput": 12296.47, "total_tokens": 52428800}
+{"current_steps": 26, "total_steps": 113, "loss": 0.7654, "lr": 4.374817961675553e-05, "epoch": 0.22958057395143489, "percentage": 23.01, "elapsed_time": "1:13:53", "remaining_time": "4:07:15", "throughput": 12298.04, "total_tokens": 54525952}
+{"current_steps": 27, "total_steps": 113, "loss": 0.7363, "lr": 4.3281208889462715e-05, "epoch": 0.23841059602649006, "percentage": 23.89, "elapsed_time": "1:16:43", "remaining_time": "4:04:22", "throughput": 12300.77, "total_tokens": 56623104}
+{"current_steps": 28, "total_steps": 113, "loss": 0.7503, "lr": 4.2800108889838244e-05, "epoch": 0.24724061810154527, "percentage": 24.78, "elapsed_time": "1:19:33", "remaining_time": "4:01:29", "throughput": 12302.45, "total_tokens": 58720256}
+{"current_steps": 29, "total_steps": 113, "loss": 0.7474, "lr": 4.230525145286057e-05, "epoch": 0.2560706401766004, "percentage": 25.66, "elapsed_time": "1:22:22", "remaining_time": "3:58:35", "throughput": 12305.37, "total_tokens": 60817408}
+{"current_steps": 30, "total_steps": 113, "loss": 0.7442, "lr": 4.1797019046425264e-05, "epoch": 0.26490066225165565, "percentage": 26.55, "elapsed_time": "1:25:12", "remaining_time": "3:55:44", "throughput": 12306.4, "total_tokens": 62914560}
+{"current_steps": 31, "total_steps": 113, "loss": 0.7492, "lr": 4.127580447574131e-05, "epoch": 0.2737306843267108, "percentage": 27.43, "elapsed_time": "1:28:01", "remaining_time": "3:52:51", "throughput": 12308.37, "total_tokens": 65011712}
+{"current_steps": 32, "total_steps": 113, "loss": 0.7601, "lr": 4.0742010579737855e-05, "epoch": 0.282560706401766, "percentage": 28.32, "elapsed_time": "1:30:51", "remaining_time": "3:49:59", "throughput": 12309.6, "total_tokens": 67108864}
+{"current_steps": 33, "total_steps": 113, "loss": 0.7381, "lr": 4.0196049919716004e-05, "epoch": 0.2913907284768212, "percentage": 29.2, "elapsed_time": "1:33:41", "remaining_time": "3:47:07", "throughput": 12311.56, "total_tokens": 69206016}
+{"current_steps": 34, "total_steps": 113, "loss": 0.7406, "lr": 3.963834446048644e-05, "epoch": 0.30022075055187636, "percentage": 30.09, "elapsed_time": "1:36:31", "remaining_time": "3:44:16", "throughput": 12312.23, "total_tokens": 71303168}
+{"current_steps": 35, "total_steps": 113, "loss": 0.7573, "lr": 3.9069325244239095e-05, "epoch": 0.3090507726269316, "percentage": 30.97, "elapsed_time": "1:39:20", "remaining_time": "3:41:23", "throughput": 12313.97, "total_tokens": 73400320}
+{"current_steps": 36, "total_steps": 113, "loss": 0.7419, "lr": 3.848943205739711e-05, "epoch": 0.31788079470198677, "percentage": 31.86, "elapsed_time": "1:42:10", "remaining_time": "3:38:32", "throughput": 12315.1, "total_tokens": 75497472}
+{"current_steps": 37, "total_steps": 113, "loss": 0.7357, "lr": 3.7899113090712526e-05, "epoch": 0.32671081677704195, "percentage": 32.74, "elapsed_time": "1:45:00", "remaining_time": "3:35:40", "throughput": 12316.24, "total_tokens": 77594624}
+{"current_steps": 38, "total_steps": 113, "loss": 0.7346, "lr": 3.729882459286632e-05, "epoch": 0.3355408388520971, "percentage": 33.63, "elapsed_time": "1:47:49", "remaining_time": "3:32:49", "throughput": 12317.13, "total_tokens": 79691776}
+{"current_steps": 39, "total_steps": 113, "loss": 0.7437, "lr": 3.66890305178407e-05, "epoch": 0.3443708609271523, "percentage": 34.51, "elapsed_time": "1:50:39", "remaining_time": "3:29:58", "throughput": 12318.14, "total_tokens": 81788928}
+{"current_steps": 40, "total_steps": 113, "loss": 0.742, "lr": 3.607020216633599e-05, "epoch": 0.35320088300220753, "percentage": 35.4, "elapsed_time": "1:53:29", "remaining_time": "3:27:08", "throughput": 12318.13, "total_tokens": 83886080}
+{"current_steps": 41, "total_steps": 113, "loss": 0.7136, "lr": 3.544281782150936e-05, "epoch": 0.3620309050772627, "percentage": 36.28, "elapsed_time": "1:56:20", "remaining_time": "3:24:17", "throughput": 12318.44, "total_tokens": 85983232}
+{"current_steps": 42, "total_steps": 113, "loss": 0.7417, "lr": 3.4807362379317025e-05, "epoch": 0.3708609271523179, "percentage": 37.17, "elapsed_time": "1:59:10", "remaining_time": "3:21:27", "throughput": 12318.8, "total_tokens": 88080384}
+{"current_steps": 43, "total_steps": 113, "loss": 0.7102, "lr": 3.416432697374533e-05, "epoch": 0.37969094922737306, "percentage": 38.05, "elapsed_time": "2:01:59", "remaining_time": "3:18:36", "throughput": 12319.37, "total_tokens": 90177536}
+{"current_steps": 44, "total_steps": 113, "loss": 0.7685, "lr": 3.3514208597220705e-05, "epoch": 0.38852097130242824, "percentage": 38.94, "elapsed_time": "2:04:49", "remaining_time": "3:15:44", "throughput": 12320.63, "total_tokens": 92274688}
+{"current_steps": 45, "total_steps": 113, "loss": 0.7332, "lr": 3.285750971649167e-05, "epoch": 0.3973509933774834, "percentage": 39.82, "elapsed_time": "2:07:40", "remaining_time": "3:12:55", "throughput": 12319.92, "total_tokens": 94371840}
+{"current_steps": 46, "total_steps": 113, "loss": 0.7387, "lr": 3.219473788427984e-05, "epoch": 0.40618101545253865, "percentage": 40.71, "elapsed_time": "2:10:30", "remaining_time": "3:10:05", "throughput": 12319.21, "total_tokens": 96468992}
+{"current_steps": 47, "total_steps": 113, "loss": 0.7096, "lr": 3.1526405346999946e-05, "epoch": 0.41501103752759383, "percentage": 41.59, "elapsed_time": "2:13:20", "remaining_time": "3:07:15", "throughput": 12319.45, "total_tokens": 98566144}
+{"current_steps": 48, "total_steps": 113, "loss": 0.7242, "lr": 3.085302864885235e-05, "epoch": 0.423841059602649, "percentage": 42.48, "elapsed_time": "2:16:10", "remaining_time": "3:04:24", "throughput": 12319.97, "total_tokens": 100663296}
+{"current_steps": 49, "total_steps": 113, "loss": 0.7338, "lr": 3.017512823259373e-05, "epoch": 0.4326710816777042, "percentage": 43.36, "elapsed_time": "2:19:00", "remaining_time": "3:01:33", "throughput": 12320.62, "total_tokens": 102760448}
+{"current_steps": 50, "total_steps": 113, "loss": 0.7494, "lr": 2.9493228037294702e-05, "epoch": 0.44150110375275936, "percentage": 44.25, "elapsed_time": "2:21:50", "remaining_time": "2:58:43", "throughput": 12321.04, "total_tokens": 104857600}
+{"current_steps": 51, "total_steps": 113, "loss": 0.7272, "lr": 2.8807855093395126e-05, "epoch": 0.4503311258278146, "percentage": 45.13, "elapsed_time": "2:24:41", "remaining_time": "2:55:53", "throughput": 12320.02, "total_tokens": 106954752}
+{"current_steps": 52, "total_steps": 113, "loss": 0.7447, "lr": 2.8119539115370218e-05, "epoch": 0.45916114790286977, "percentage": 46.02, "elapsed_time": "2:27:29", "remaining_time": "2:53:00", "throughput": 12323.37, "total_tokens": 109051904}
+{"current_steps": 53, "total_steps": 113, "loss": 0.7219, "lr": 2.742881209232215e-05, "epoch": 0.46799116997792495, "percentage": 46.9, "elapsed_time": "2:30:17", "remaining_time": "2:50:08", "throughput": 12325.84, "total_tokens": 111149056}
+{"current_steps": 54, "total_steps": 113, "loss": 0.7484, "lr": 2.6736207876813646e-05, "epoch": 0.4768211920529801, "percentage": 47.79, "elapsed_time": "2:33:05", "remaining_time": "2:47:15", "throughput": 12328.88, "total_tokens": 113246208}
+{"current_steps": 55, "total_steps": 113, "loss": 0.7352, "lr": 2.604226177226137e-05, "epoch": 0.4856512141280353, "percentage": 48.67, "elapsed_time": "2:35:54", "remaining_time": "2:44:24", "throughput": 12330.83, "total_tokens": 115343360}
+{"current_steps": 56, "total_steps": 113, "loss": 0.7254, "lr": 2.5347510119207878e-05, "epoch": 0.49448123620309054, "percentage": 49.56, "elapsed_time": "2:38:43", "remaining_time": "2:41:33", "throughput": 12331.8, "total_tokens": 117440512}
+{"current_steps": 57, "total_steps": 113, "loss": 0.724, "lr": 2.4652489880792128e-05, "epoch": 0.5033112582781457, "percentage": 50.44, "elapsed_time": "2:41:32", "remaining_time": "2:38:42", "throughput": 12333.18, "total_tokens": 119537664}
+{"current_steps": 58, "total_steps": 113, "loss": 0.747, "lr": 2.395773822773863e-05, "epoch": 0.5121412803532008, "percentage": 51.33, "elapsed_time": "2:44:19", "remaining_time": "2:35:49", "throughput": 12336.23, "total_tokens": 121634816}
+{"current_steps": 59, "total_steps": 113, "loss": 0.7246, "lr": 2.3263792123186353e-05, "epoch": 0.5209713024282561, "percentage": 52.21, "elapsed_time": "2:47:08", "remaining_time": "2:32:58", "throughput": 12338.13, "total_tokens": 123731968}
+{"current_steps": 60, "total_steps": 113, "loss": 0.7424, "lr": 2.2571187907677853e-05, "epoch": 0.5298013245033113, "percentage": 53.1, "elapsed_time": "2:49:57", "remaining_time": "2:30:07", "throughput": 12339.8, "total_tokens": 125829120}
+{"current_steps": 61, "total_steps": 113, "loss": 0.7422, "lr": 2.188046088462979e-05, "epoch": 0.5386313465783664, "percentage": 53.98, "elapsed_time": "2:52:45", "remaining_time": "2:27:16", "throughput": 12341.13, "total_tokens": 127926272}
+{"current_steps": 62, "total_steps": 113, "loss": 0.7166, "lr": 2.1192144906604876e-05, "epoch": 0.5474613686534217, "percentage": 54.87, "elapsed_time": "2:55:33", "remaining_time": "2:24:25", "throughput": 12343.22, "total_tokens": 130023424}
+{"current_steps": 63, "total_steps": 113, "loss": 0.7564, "lr": 2.0506771962705304e-05, "epoch": 0.5562913907284768, "percentage": 55.75, "elapsed_time": "2:58:21", "remaining_time": "2:21:33", "throughput": 12346.02, "total_tokens": 132120576}
+{"current_steps": 64, "total_steps": 113, "loss": 0.744, "lr": 1.982487176740627e-05, "epoch": 0.565121412803532, "percentage": 56.64, "elapsed_time": "3:01:10", "remaining_time": "2:18:42", "throughput": 12347.51, "total_tokens": 134217728}
+{"current_steps": 65, "total_steps": 113, "loss": 0.7523, "lr": 1.9146971351147655e-05, "epoch": 0.5739514348785872, "percentage": 57.52, "elapsed_time": "3:03:57", "remaining_time": "2:15:51", "throughput": 12349.65, "total_tokens": 136314880}
+{"current_steps": 66, "total_steps": 113, "loss": 0.7274, "lr": 1.847359465300006e-05, "epoch": 0.5827814569536424, "percentage": 58.41, "elapsed_time": "3:06:46", "remaining_time": "2:13:00", "throughput": 12351.39, "total_tokens": 138412032}
+{"current_steps": 67, "total_steps": 113, "loss": 0.746, "lr": 1.780526211572016e-05, "epoch": 0.5916114790286976, "percentage": 59.29, "elapsed_time": "3:09:34", "remaining_time": "2:10:09", "throughput": 12353.19, "total_tokens": 140509184}
+{"current_steps": 68, "total_steps": 113, "loss": 0.7423, "lr": 1.7142490283508324e-05, "epoch": 0.6004415011037527, "percentage": 60.18, "elapsed_time": "3:12:21", "remaining_time": "2:07:18", "throughput": 12355.43, "total_tokens": 142606336}
+{"current_steps": 69, "total_steps": 113, "loss": 0.7234, "lr": 1.648579140277931e-05, "epoch": 0.609271523178808, "percentage": 61.06, "elapsed_time": "3:15:09", "remaining_time": "2:04:26", "throughput": 12357.73, "total_tokens": 144703488}
+{"current_steps": 70, "total_steps": 113, "loss": 0.7495, "lr": 1.583567302625469e-05, "epoch": 0.6181015452538632, "percentage": 61.95, "elapsed_time": "3:17:57", "remaining_time": "2:01:35", "throughput": 12359.87, "total_tokens": 146800640}
+{"current_steps": 71, "total_steps": 113, "loss": 0.7227, "lr": 1.5192637620682981e-05, "epoch": 0.6269315673289183, "percentage": 62.83, "elapsed_time": "3:20:45", "remaining_time": "1:58:45", "throughput": 12361.65, "total_tokens": 148897792}
+{"current_steps": 72, "total_steps": 113, "loss": 0.7607, "lr": 1.4557182178490636e-05, "epoch": 0.6357615894039735, "percentage": 63.72, "elapsed_time": "3:23:32", "remaining_time": "1:55:54", "throughput": 12363.6, "total_tokens": 150994944}
+{"current_steps": 73, "total_steps": 113, "loss": 0.7296, "lr": 1.3929797833664013e-05, "epoch": 0.6445916114790287, "percentage": 64.6, "elapsed_time": "3:26:21", "remaining_time": "1:53:04", "throughput": 12364.6, "total_tokens": 153092096}
+{"current_steps": 74, "total_steps": 113, "loss": 0.7361, "lr": 1.3310969482159297e-05, "epoch": 0.6534216335540839, "percentage": 65.49, "elapsed_time": "3:29:09", "remaining_time": "1:50:13", "throughput": 12366.65, "total_tokens": 155189248}
+{"current_steps": 75, "total_steps": 113, "loss": 0.7379, "lr": 1.270117540713368e-05, "epoch": 0.6622516556291391, "percentage": 66.37, "elapsed_time": "3:31:57", "remaining_time": "1:47:23", "throughput": 12368.13, "total_tokens": 157286400}
+{"current_steps": 76, "total_steps": 113, "loss": 0.7351, "lr": 1.2100886909287478e-05, "epoch": 0.6710816777041942, "percentage": 67.26, "elapsed_time": "3:34:44", "remaining_time": "1:44:32", "throughput": 12369.85, "total_tokens": 159383552}
+{"current_steps": 77, "total_steps": 113, "loss": 0.7295, "lr": 1.151056794260289e-05, "epoch": 0.6799116997792495, "percentage": 68.14, "elapsed_time": "3:37:32", "remaining_time": "1:41:42", "throughput": 12371.69, "total_tokens": 161480704}
+{"current_steps": 78, "total_steps": 113, "loss": 0.7333, "lr": 1.0930674755760908e-05, "epoch": 0.6887417218543046, "percentage": 69.03, "elapsed_time": "3:40:20", "remaining_time": "1:38:52", "throughput": 12373.07, "total_tokens": 163577856}
+{"current_steps": 79, "total_steps": 113, "loss": 0.7334, "lr": 1.0361655539513565e-05, "epoch": 0.6975717439293598, "percentage": 69.91, "elapsed_time": "3:43:08", "remaining_time": "1:36:02", "throughput": 12374.64, "total_tokens": 165675008}
+{"current_steps": 80, "total_steps": 113, "loss": 0.7437, "lr": 9.803950080284005e-06, "epoch": 0.7064017660044151, "percentage": 70.8, "elapsed_time": "3:45:56", "remaining_time": "1:33:12", "throughput": 12375.68, "total_tokens": 167772160}
+{"current_steps": 81, "total_steps": 113, "loss": 0.7133, "lr": 9.257989420262151e-06, "epoch": 0.7152317880794702, "percentage": 71.68, "elapsed_time": "3:48:44", "remaining_time": "1:30:21", "throughput": 12377.37, "total_tokens": 169869312}
+{"current_steps": 82, "total_steps": 113, "loss": 0.7415, "lr": 8.724195524258688e-06, "epoch": 0.7240618101545254, "percentage": 72.57, "elapsed_time": "3:51:31", "remaining_time": "1:27:31", "throughput": 12379.17, "total_tokens": 171966464}
+{"current_steps": 83, "total_steps": 113, "loss": 0.7423, "lr": 8.202980953574735e-06, "epoch": 0.7328918322295805, "percentage": 73.45, "elapsed_time": "3:54:19", "remaining_time": "1:24:41", "throughput": 12380.31, "total_tokens": 174063616}
+{"current_steps": 84, "total_steps": 113, "loss": 0.7125, "lr": 7.69474854713943e-06, "epoch": 0.7417218543046358, "percentage": 74.34, "elapsed_time": "3:57:08", "remaining_time": "1:21:52", "throughput": 12381.1, "total_tokens": 176160768}
+{"current_steps": 85, "total_steps": 113, "loss": 0.7346, "lr": 7.1998911101617575e-06, "epoch": 0.7505518763796909, "percentage": 75.22, "elapsed_time": "3:59:56", "remaining_time": "1:19:02", "throughput": 12382.4, "total_tokens": 178257920}
+{"current_steps": 86, "total_steps": 113, "loss": 0.7235, "lr": 6.718791110537287e-06, "epoch": 0.7593818984547461, "percentage": 76.11, "elapsed_time": "4:02:44", "remaining_time": "1:16:12", "throughput": 12383.21, "total_tokens": 180355072}
+{"current_steps": 87, "total_steps": 113, "loss": 0.7268, "lr": 6.25182038324447e-06, "epoch": 0.7682119205298014, "percentage": 76.99, "elapsed_time": "4:05:32", "remaining_time": "1:13:22", "throughput": 12384.33, "total_tokens": 182452224}
+{"current_steps": 88, "total_steps": 113, "loss": 0.7251, "lr": 5.7993398429589506e-06, "epoch": 0.7770419426048565, "percentage": 77.88, "elapsed_time": "4:08:20", "remaining_time": "1:10:33", "throughput": 12385.44, "total_tokens": 184549376}
+{"current_steps": 89, "total_steps": 113, "loss": 0.7216, "lr": 5.361699205108042e-06, "epoch": 0.7858719646799117, "percentage": 78.76, "elapsed_time": "4:11:08", "remaining_time": "1:07:43", "throughput": 12386.47, "total_tokens": 186646528}
+{"current_steps": 90, "total_steps": 113, "loss": 0.7386, "lr": 4.939236715580884e-06, "epoch": 0.7947019867549668, "percentage": 79.65, "elapsed_time": "4:13:57", "remaining_time": "1:04:53", "throughput": 12387.18, "total_tokens": 188743680}
+{"current_steps": 91, "total_steps": 113, "loss": 0.734, "lr": 4.5322788893033155e-06, "epoch": 0.8035320088300221, "percentage": 80.53, "elapsed_time": "4:16:45", "remaining_time": "1:02:04", "throughput": 12387.88, "total_tokens": 190840832}
+{"current_steps": 92, "total_steps": 113, "loss": 0.7519, "lr": 4.14114025787932e-06, "epoch": 0.8123620309050773, "percentage": 81.42, "elapsed_time": "4:19:33", "remaining_time": "0:59:14", "throughput": 12389.08, "total_tokens": 192937984}
+{"current_steps": 93, "total_steps": 113, "loss": 0.7194, "lr": 3.7661231264943086e-06, "epoch": 0.8211920529801324, "percentage": 82.3, "elapsed_time": "4:22:21", "remaining_time": "0:56:25", "throughput": 12389.85, "total_tokens": 195035136}
+{"current_steps": 94, "total_steps": 113, "loss": 0.7263, "lr": 3.4075173402679574e-06, "epoch": 0.8300220750551877, "percentage": 83.19, "elapsed_time": "4:25:10", "remaining_time": "0:53:35", "throughput": 12390.35, "total_tokens": 197132288}
+{"current_steps": 95, "total_steps": 113, "loss": 0.7093, "lr": 3.0656000602372558e-06, "epoch": 0.8388520971302428, "percentage": 84.07, "elapsed_time": "4:27:58", "remaining_time": "0:50:46", "throughput": 12391.05, "total_tokens": 199229440}
+{"current_steps": 96, "total_steps": 113, "loss": 0.7654, "lr": 2.7406355491429086e-06, "epoch": 0.847682119205298, "percentage": 84.96, "elapsed_time": "4:30:47", "remaining_time": "0:47:57", "throughput": 12391.45, "total_tokens": 201326592}
+{"current_steps": 97, "total_steps": 113, "loss": 0.7365, "lr": 2.4328749671846116e-06, "epoch": 0.8565121412803532, "percentage": 85.84, "elapsed_time": "4:33:35", "remaining_time": "0:45:07", "throughput": 12392.15, "total_tokens": 203423744}
+{"current_steps": 98, "total_steps": 113, "loss": 0.7468, "lr": 2.142556177903096e-06, "epoch": 0.8653421633554084, "percentage": 86.73, "elapsed_time": "4:36:23", "remaining_time": "0:42:18", "throughput": 12393.07, "total_tokens": 205520896}
+{"current_steps": 99, "total_steps": 113, "loss": 0.7264, "lr": 1.8699035643389928e-06, "epoch": 0.8741721854304636, "percentage": 87.61, "elapsed_time": "4:39:11", "remaining_time": "0:39:28", "throughput": 12394.01, "total_tokens": 207618048}
+{"current_steps": 100, "total_steps": 113, "loss": 0.7392, "lr": 1.615127855610496e-06, "epoch": 0.8830022075055187, "percentage": 88.5, "elapsed_time": "4:41:59", "remaining_time": "0:36:39", "throughput": 12395.06, "total_tokens": 209715200}
+{"current_steps": 101, "total_steps": 113, "loss": 0.7378, "lr": 1.3784259640440279e-06, "epoch": 0.891832229580574, "percentage": 89.38, "elapsed_time": "4:44:47", "remaining_time": "0:33:50", "throughput": 12396.1, "total_tokens": 211812352}
+{"current_steps": 102, "total_steps": 113, "loss": 0.7384, "lr": 1.1599808329836177e-06, "epoch": 0.9006622516556292, "percentage": 90.27, "elapsed_time": "4:47:34", "remaining_time": "0:31:00", "throughput": 12397.09, "total_tokens": 213909504}
+{"current_steps": 103, "total_steps": 113, "loss": 0.7326, "lr": 9.599612953967746e-07, "epoch": 0.9094922737306843, "percentage": 91.15, "elapsed_time": "4:50:23", "remaining_time": "0:28:11", "throughput": 12397.55, "total_tokens": 216006656}
+{"current_steps": 104, "total_steps": 113, "loss": 0.7319, "lr": 7.785219433859847e-07, "epoch": 0.9183222958057395, "percentage": 92.04, "elapsed_time": "4:53:11", "remaining_time": "0:25:22", "throughput": 12398.3, "total_tokens": 218103808}
+{"current_steps": 105, "total_steps": 113, "loss": 0.7365, "lr": 6.158030087068001e-07, "epoch": 0.9271523178807947, "percentage": 92.92, "elapsed_time": "4:55:59", "remaining_time": "0:22:33", "throughput": 12399.3, "total_tokens": 220200960}
+{"current_steps": 106, "total_steps": 113, "loss": 0.75, "lr": 4.719302543848225e-07, "epoch": 0.9359823399558499, "percentage": 93.81, "elapsed_time": "4:58:46", "remaining_time": "0:19:43", "throughput": 12400.52, "total_tokens": 222298112}
+{"current_steps": 107, "total_steps": 113, "loss": 0.7468, "lr": 3.470148775153448e-07, "epoch": 0.9448123620309051, "percentage": 94.69, "elapsed_time": "5:01:34", "remaining_time": "0:16:54", "throughput": 12401.53, "total_tokens": 224395264}
+{"current_steps": 108, "total_steps": 113, "loss": 0.7425, "lr": 2.4115342332078074e-07, "epoch": 0.9536423841059603, "percentage": 95.58, "elapsed_time": "5:04:21", "remaining_time": "0:14:05", "throughput": 12402.6, "total_tokens": 226492416}
+{"current_steps": 109, "total_steps": 113, "loss": 0.7314, "lr": 1.5442771053230665e-07, "epoch": 0.9624724061810155, "percentage": 96.46, "elapsed_time": "5:07:14", "remaining_time": "0:11:16", "throughput": 12399.84, "total_tokens": 228589568}
+{"current_steps": 110, "total_steps": 113, "loss": 0.7407, "lr": 8.690476815339244e-08, "epoch": 0.9713024282560706, "percentage": 97.35, "elapsed_time": "5:10:04", "remaining_time": "0:08:27", "throughput": 12399.79, "total_tokens": 230686720}
+{"current_steps": 111, "total_steps": 113, "loss": 0.729, "lr": 3.8636783654100174e-08, "epoch": 0.9801324503311258, "percentage": 98.23, "elapsed_time": "5:12:51", "remaining_time": "0:05:38", "throughput": 12400.69, "total_tokens": 232783872}
+{"current_steps": 112, "total_steps": 113, "loss": 0.7333, "lr": 9.661062636148744e-09, "epoch": 0.9889624724061811, "percentage": 99.12, "elapsed_time": "5:15:40", "remaining_time": "0:02:49", "throughput": 12401.32, "total_tokens": 234881024}
+{"current_steps": 113, "total_steps": 113, "loss": 0.7217, "lr": 0.0, "epoch": 0.9977924944812362, "percentage": 100.0, "elapsed_time": "5:18:28", "remaining_time": "0:00:00", "throughput": 12401.67, "total_tokens": 236978176}
+{"current_steps": 113, "total_steps": 113, "epoch": 0.9977924944812362, "percentage": 100.0, "elapsed_time": "5:18:52", "remaining_time": "0:00:00", "throughput": 12385.92, "total_tokens": 236978176}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,947 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9977924944812362,
+  "eval_steps": 500,
+  "global_step": 113,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008830022075055188,
+      "grad_norm": 0.8537317514419556,
+      "learning_rate": 4.999033893736386e-05,
+      "loss": 0.9875,
+      "num_input_tokens_seen": 2097152,
+      "step": 1
+    },
+    {
+      "epoch": 0.017660044150110375,
+      "grad_norm": 0.7285566926002502,
+      "learning_rate": 4.99613632163459e-05,
+      "loss": 0.944,
+      "num_input_tokens_seen": 4194304,
+      "step": 2
+    },
+    {
+      "epoch": 0.026490066225165563,
+      "grad_norm": 0.5551783442497253,
+      "learning_rate": 4.991309523184661e-05,
+      "loss": 0.904,
+      "num_input_tokens_seen": 6291456,
+      "step": 3
+    },
+    {
+      "epoch": 0.03532008830022075,
+      "grad_norm": 0.4819079041481018,
+      "learning_rate": 4.98455722894677e-05,
+      "loss": 0.8913,
+      "num_input_tokens_seen": 8388608,
+      "step": 4
+    },
+    {
+      "epoch": 0.04415011037527594,
+      "grad_norm": 0.373820960521698,
+      "learning_rate": 4.975884657667922e-05,
+      "loss": 0.8511,
+      "num_input_tokens_seen": 10485760,
+      "step": 5
+    },
+    {
+      "epoch": 0.052980132450331126,
+      "grad_norm": 0.2974053621292114,
+      "learning_rate": 4.965298512248466e-05,
+      "loss": 0.8311,
+      "num_input_tokens_seen": 12582912,
+      "step": 6
+    },
+    {
+      "epoch": 0.06181015452538632,
+      "grad_norm": 0.25162145495414734,
+      "learning_rate": 4.952806974561518e-05,
+      "loss": 0.838,
+      "num_input_tokens_seen": 14680064,
+      "step": 7
+    },
+    {
+      "epoch": 0.0706401766004415,
+      "grad_norm": 0.20863372087478638,
+      "learning_rate": 4.9384196991293205e-05,
+      "loss": 0.8468,
+      "num_input_tokens_seen": 16777216,
+      "step": 8
+    },
+    {
+      "epoch": 0.07947019867549669,
+      "grad_norm": 0.18535731732845306,
+      "learning_rate": 4.922147805661402e-05,
+      "loss": 0.7884,
+      "num_input_tokens_seen": 18874368,
+      "step": 9
+    },
+    {
+      "epoch": 0.08830022075055188,
+      "grad_norm": 0.16335929930210114,
+      "learning_rate": 4.904003870460323e-05,
+      "loss": 0.8009,
+      "num_input_tokens_seen": 20971520,
+      "step": 10
+    },
+    {
+      "epoch": 0.09713024282560706,
+      "grad_norm": 0.1265847086906433,
+      "learning_rate": 4.884001916701639e-05,
+      "loss": 0.7764,
+      "num_input_tokens_seen": 23068672,
+      "step": 11
+    },
+    {
+      "epoch": 0.10596026490066225,
+      "grad_norm": 0.11690282076597214,
+      "learning_rate": 4.862157403595598e-05,
+      "loss": 0.7774,
+      "num_input_tokens_seen": 25165824,
+      "step": 12
+    },
+    {
+      "epoch": 0.11479028697571744,
+      "grad_norm": 0.10680913180112839,
+      "learning_rate": 4.838487214438951e-05,
+      "loss": 0.7808,
+      "num_input_tokens_seen": 27262976,
+      "step": 13
+    },
+    {
+      "epoch": 0.12362030905077263,
+      "grad_norm": 0.09127331525087357,
+      "learning_rate": 4.813009643566101e-05,
+      "loss": 0.8111,
+      "num_input_tokens_seen": 29360128,
+      "step": 14
+    },
+    {
+      "epoch": 0.13245033112582782,
+      "grad_norm": 0.08718351274728775,
+      "learning_rate": 4.7857443822096905e-05,
+      "loss": 0.7997,
+      "num_input_tokens_seen": 31457280,
+      "step": 15
+    },
+    {
+      "epoch": 0.141280353200883,
+      "grad_norm": 0.06960419565439224,
+      "learning_rate": 4.7567125032815394e-05,
+      "loss": 0.7537,
+      "num_input_tokens_seen": 33554432,
+      "step": 16
+    },
+    {
+      "epoch": 0.15011037527593818,
+      "grad_norm": 0.0721953734755516,
+      "learning_rate": 4.7259364450857096e-05,
+      "loss": 0.7645,
+      "num_input_tokens_seen": 35651584,
+      "step": 17
+    },
+    {
+      "epoch": 0.15894039735099338,
+      "grad_norm": 0.0680830106139183,
+      "learning_rate": 4.6934399939762746e-05,
+      "loss": 0.7712,
+      "num_input_tokens_seen": 37748736,
+      "step": 18
+    },
+    {
+      "epoch": 0.16777041942604856,
+      "grad_norm": 0.06572319567203522,
+      "learning_rate": 4.659248265973205e-05,
+      "loss": 0.74,
+      "num_input_tokens_seen": 39845888,
+      "step": 19
+    },
+    {
+      "epoch": 0.17660044150110377,
+      "grad_norm": 0.06348133087158203,
+      "learning_rate": 4.6233876873505694e-05,
+      "loss": 0.777,
+      "num_input_tokens_seen": 41943040,
+      "step": 20
+    },
+    {
+      "epoch": 0.18543046357615894,
+      "grad_norm": 0.06005046144127846,
+      "learning_rate": 4.585885974212068e-05,
+      "loss": 0.7537,
+      "num_input_tokens_seen": 44040192,
+      "step": 21
+    },
+    {
+      "epoch": 0.19426048565121412,
+      "grad_norm": 0.057457394897937775,
+      "learning_rate": 4.5467721110696685e-05,
+      "loss": 0.7453,
+      "num_input_tokens_seen": 46137344,
+      "step": 22
+    },
+    {
+      "epoch": 0.20309050772626933,
+      "grad_norm": 0.054359156638383865,
+      "learning_rate": 4.5060763284419114e-05,
+      "loss": 0.7573,
+      "num_input_tokens_seen": 48234496,
+      "step": 23
+    },
+    {
+      "epoch": 0.2119205298013245,
+      "grad_norm": 0.054271843284368515,
+      "learning_rate": 4.463830079489196e-05,
+      "loss": 0.7626,
+      "num_input_tokens_seen": 50331648,
+      "step": 24
+    },
+    {
+      "epoch": 0.22075055187637968,
+      "grad_norm": 0.048914600163698196,
+      "learning_rate": 4.420066015704105e-05,
+      "loss": 0.7558,
+      "num_input_tokens_seen": 52428800,
+      "step": 25
+    },
+    {
+      "epoch": 0.22958057395143489,
+      "grad_norm": 0.04995320364832878,
+      "learning_rate": 4.374817961675553e-05,
+      "loss": 0.7654,
+      "num_input_tokens_seen": 54525952,
+      "step": 26
+    },
+    {
+      "epoch": 0.23841059602649006,
+      "grad_norm": 0.04679872468113899,
+      "learning_rate": 4.3281208889462715e-05,
+      "loss": 0.7363,
+      "num_input_tokens_seen": 56623104,
+      "step": 27
+    },
+    {
+      "epoch": 0.24724061810154527,
+      "grad_norm": 0.04957474768161774,
+      "learning_rate": 4.2800108889838244e-05,
+      "loss": 0.7503,
+      "num_input_tokens_seen": 58720256,
+      "step": 28
+    },
+    {
+      "epoch": 0.2560706401766004,
+      "grad_norm": 0.04368240758776665,
+      "learning_rate": 4.230525145286057e-05,
+      "loss": 0.7474,
+      "num_input_tokens_seen": 60817408,
+      "step": 29
+    },
+    {
+      "epoch": 0.26490066225165565,
+      "grad_norm": 0.04365404695272446,
+      "learning_rate": 4.1797019046425264e-05,
+      "loss": 0.7442,
+      "num_input_tokens_seen": 62914560,
+      "step": 30
+    },
+    {
+      "epoch": 0.2737306843267108,
+      "grad_norm": 0.04533332213759422,
+      "learning_rate": 4.127580447574131e-05,
+      "loss": 0.7492,
+      "num_input_tokens_seen": 65011712,
+      "step": 31
+    },
+    {
+      "epoch": 0.282560706401766,
+      "grad_norm": 0.045380424708127975,
+      "learning_rate": 4.0742010579737855e-05,
+      "loss": 0.7601,
+      "num_input_tokens_seen": 67108864,
+      "step": 32
+    },
+    {
+      "epoch": 0.2913907284768212,
+      "grad_norm": 0.04397201910614967,
+      "learning_rate": 4.0196049919716004e-05,
+      "loss": 0.7381,
+      "num_input_tokens_seen": 69206016,
+      "step": 33
+    },
+    {
+      "epoch": 0.30022075055187636,
+      "grad_norm": 0.042593635618686676,
+      "learning_rate": 3.963834446048644e-05,
+      "loss": 0.7406,
+      "num_input_tokens_seen": 71303168,
+      "step": 34
+    },
+    {
+      "epoch": 0.3090507726269316,
+      "grad_norm": 0.04130704328417778,
+      "learning_rate": 3.9069325244239095e-05,
+      "loss": 0.7573,
+      "num_input_tokens_seen": 73400320,
+      "step": 35
+    },
+    {
+      "epoch": 0.31788079470198677,
+      "grad_norm": 0.04113290086388588,
+      "learning_rate": 3.848943205739711e-05,
+      "loss": 0.7419,
+      "num_input_tokens_seen": 75497472,
+      "step": 36
+    },
+    {
+      "epoch": 0.32671081677704195,
+      "grad_norm": 0.0397503562271595,
+      "learning_rate": 3.7899113090712526e-05,
+      "loss": 0.7357,
+      "num_input_tokens_seen": 77594624,
+      "step": 37
+    },
+    {
+      "epoch": 0.3355408388520971,
+      "grad_norm": 0.03884141892194748,
+      "learning_rate": 3.729882459286632e-05,
+      "loss": 0.7346,
+      "num_input_tokens_seen": 79691776,
+      "step": 38
+    },
+    {
+      "epoch": 0.3443708609271523,
+      "grad_norm": 0.03996223211288452,
+      "learning_rate": 3.66890305178407e-05,
+      "loss": 0.7437,
+      "num_input_tokens_seen": 81788928,
+      "step": 39
+    },
+    {
+      "epoch": 0.35320088300220753,
+      "grad_norm": 0.0395955815911293,
+      "learning_rate": 3.607020216633599e-05,
+      "loss": 0.742,
+      "num_input_tokens_seen": 83886080,
+      "step": 40
+    },
+    {
+      "epoch": 0.3620309050772627,
+      "grad_norm": 0.03895486891269684,
+      "learning_rate": 3.544281782150936e-05,
+      "loss": 0.7136,
+      "num_input_tokens_seen": 85983232,
+      "step": 41
+    },
+    {
+      "epoch": 0.3708609271523179,
+      "grad_norm": 0.03683311864733696,
+      "learning_rate": 3.4807362379317025e-05,
+      "loss": 0.7417,
+      "num_input_tokens_seen": 88080384,
+      "step": 42
+    },
+    {
+      "epoch": 0.37969094922737306,
+      "grad_norm": 0.037801578640937805,
+      "learning_rate": 3.416432697374533e-05,
+      "loss": 0.7102,
+      "num_input_tokens_seen": 90177536,
+      "step": 43
+    },
+    {
+      "epoch": 0.38852097130242824,
+      "grad_norm": 0.03917045146226883,
+      "learning_rate": 3.3514208597220705e-05,
+      "loss": 0.7685,
+      "num_input_tokens_seen": 92274688,
+      "step": 44
+    },
+    {
+      "epoch": 0.3973509933774834,
+      "grad_norm": 0.037231214344501495,
+      "learning_rate": 3.285750971649167e-05,
+      "loss": 0.7332,
+      "num_input_tokens_seen": 94371840,
+      "step": 45
+    },
+    {
+      "epoch": 0.40618101545253865,
+      "grad_norm": 0.03943084925413132,
+      "learning_rate": 3.219473788427984e-05,
+      "loss": 0.7387,
+      "num_input_tokens_seen": 96468992,
+      "step": 46
+    },
+    {
+      "epoch": 0.41501103752759383,
+      "grad_norm": 0.03614073246717453,
+      "learning_rate": 3.1526405346999946e-05,
+      "loss": 0.7096,
+      "num_input_tokens_seen": 98566144,
+      "step": 47
+    },
+    {
+      "epoch": 0.423841059602649,
+      "grad_norm": 0.03576625511050224,
+      "learning_rate": 3.085302864885235e-05,
+      "loss": 0.7242,
+      "num_input_tokens_seen": 100663296,
+      "step": 48
+    },
+    {
+      "epoch": 0.4326710816777042,
+      "grad_norm": 0.0361945666372776,
+      "learning_rate": 3.017512823259373e-05,
+      "loss": 0.7338,
+      "num_input_tokens_seen": 102760448,
+      "step": 49
+    },
+    {
+      "epoch": 0.44150110375275936,
+      "grad_norm": 0.03698369115591049,
+      "learning_rate": 2.9493228037294702e-05,
+      "loss": 0.7494,
+      "num_input_tokens_seen": 104857600,
+      "step": 50
+    },
+    {
+      "epoch": 0.4503311258278146,
+      "grad_norm": 0.03586776927113533,
+      "learning_rate": 2.8807855093395126e-05,
+      "loss": 0.7272,
+      "num_input_tokens_seen": 106954752,
+      "step": 51
+    },
+    {
+      "epoch": 0.45916114790286977,
+      "grad_norm": 0.03969455882906914,
+      "learning_rate": 2.8119539115370218e-05,
+      "loss": 0.7447,
+      "num_input_tokens_seen": 109051904,
+      "step": 52
+    },
+    {
+      "epoch": 0.46799116997792495,
+      "grad_norm": 0.037024304270744324,
+      "learning_rate": 2.742881209232215e-05,
+      "loss": 0.7219,
+      "num_input_tokens_seen": 111149056,
+      "step": 53
+    },
+    {
+      "epoch": 0.4768211920529801,
+      "grad_norm": 0.035661764442920685,
+      "learning_rate": 2.6736207876813646e-05,
+      "loss": 0.7484,
+      "num_input_tokens_seen": 113246208,
+      "step": 54
+    },
+    {
+      "epoch": 0.4856512141280353,
+      "grad_norm": 0.03690381348133087,
+      "learning_rate": 2.604226177226137e-05,
+      "loss": 0.7352,
+      "num_input_tokens_seen": 115343360,
+      "step": 55
+    },
+    {
+      "epoch": 0.49448123620309054,
+      "grad_norm": 0.03607625514268875,
+      "learning_rate": 2.5347510119207878e-05,
+      "loss": 0.7254,
+      "num_input_tokens_seen": 117440512,
+      "step": 56
+    },
+    {
+      "epoch": 0.5033112582781457,
+      "grad_norm": 0.036771535873413086,
+      "learning_rate": 2.4652489880792128e-05,
+      "loss": 0.724,
+      "num_input_tokens_seen": 119537664,
+      "step": 57
+    },
+    {
+      "epoch": 0.5121412803532008,
+      "grad_norm": 0.035380277782678604,
+      "learning_rate": 2.395773822773863e-05,
+      "loss": 0.747,
+      "num_input_tokens_seen": 121634816,
+      "step": 58
+    },
+    {
+      "epoch": 0.5209713024282561,
+      "grad_norm": 0.03334279730916023,
+      "learning_rate": 2.3263792123186353e-05,
+      "loss": 0.7246,
+      "num_input_tokens_seen": 123731968,
+      "step": 59
+    },
+    {
+      "epoch": 0.5298013245033113,
+      "grad_norm": 0.03534376993775368,
+      "learning_rate": 2.2571187907677853e-05,
+      "loss": 0.7424,
+      "num_input_tokens_seen": 125829120,
+      "step": 60
+    },
+    {
+      "epoch": 0.5386313465783664,
+      "grad_norm": 0.035675469785928726,
+      "learning_rate": 2.188046088462979e-05,
+      "loss": 0.7422,
+      "num_input_tokens_seen": 127926272,
+      "step": 61
+    },
+    {
+      "epoch": 0.5474613686534217,
+      "grad_norm": 0.03412646800279617,
+      "learning_rate": 2.1192144906604876e-05,
+      "loss": 0.7166,
+      "num_input_tokens_seen": 130023424,
+      "step": 62
+    },
+    {
+      "epoch": 0.5562913907284768,
+      "grad_norm": 0.03617184981703758,
+      "learning_rate": 2.0506771962705304e-05,
+      "loss": 0.7564,
+      "num_input_tokens_seen": 132120576,
+      "step": 63
+    },
+    {
+      "epoch": 0.565121412803532,
+      "grad_norm": 0.03402591496706009,
+      "learning_rate": 1.982487176740627e-05,
+      "loss": 0.744,
+      "num_input_tokens_seen": 134217728,
+      "step": 64
+    },
+    {
+      "epoch": 0.5739514348785872,
+      "grad_norm": 0.035137876868247986,
+      "learning_rate": 1.9146971351147655e-05,
+      "loss": 0.7523,
+      "num_input_tokens_seen": 136314880,
+      "step": 65
+    },
+    {
+      "epoch": 0.5827814569536424,
+      "grad_norm": 0.03335074707865715,
+      "learning_rate": 1.847359465300006e-05,
+      "loss": 0.7274,
+      "num_input_tokens_seen": 138412032,
+      "step": 66
+    },
+    {
+      "epoch": 0.5916114790286976,
+      "grad_norm": 0.032923776656389236,
+      "learning_rate": 1.780526211572016e-05,
+      "loss": 0.746,
+      "num_input_tokens_seen": 140509184,
+      "step": 67
+    },
+    {
+      "epoch": 0.6004415011037527,
+      "grad_norm": 0.038766708225011826,
+      "learning_rate": 1.7142490283508324e-05,
+      "loss": 0.7423,
+      "num_input_tokens_seen": 142606336,
+      "step": 68
+    },
+    {
+      "epoch": 0.609271523178808,
+      "grad_norm": 0.032216496765613556,
+      "learning_rate": 1.648579140277931e-05,
+      "loss": 0.7234,
+      "num_input_tokens_seen": 144703488,
+      "step": 69
+    },
+    {
+      "epoch": 0.6181015452538632,
+      "grad_norm": 0.03458288684487343,
+      "learning_rate": 1.583567302625469e-05,
+      "loss": 0.7495,
+      "num_input_tokens_seen": 146800640,
+      "step": 70
+    },
+    {
+      "epoch": 0.6269315673289183,
+      "grad_norm": 0.033755529671907425,
+      "learning_rate": 1.5192637620682981e-05,
+      "loss": 0.7227,
+      "num_input_tokens_seen": 148897792,
+      "step": 71
+    },
+    {
+      "epoch": 0.6357615894039735,
+      "grad_norm": 0.03497767448425293,
+      "learning_rate": 1.4557182178490636e-05,
+      "loss": 0.7607,
+      "num_input_tokens_seen": 150994944,
+      "step": 72
+    },
+    {
+      "epoch": 0.6445916114790287,
+      "grad_norm": 0.03499361500144005,
+      "learning_rate": 1.3929797833664013e-05,
+      "loss": 0.7296,
+      "num_input_tokens_seen": 153092096,
+      "step": 73
+    },
+    {
+      "epoch": 0.6534216335540839,
+      "grad_norm": 0.033749066293239594,
+      "learning_rate": 1.3310969482159297e-05,
+      "loss": 0.7361,
+      "num_input_tokens_seen": 155189248,
+      "step": 74
+    },
+    {
+      "epoch": 0.6622516556291391,
+      "grad_norm": 0.03505317121744156,
+      "learning_rate": 1.270117540713368e-05,
+      "loss": 0.7379,
+      "num_input_tokens_seen": 157286400,
+      "step": 75
+    },
+    {
+      "epoch": 0.6710816777041942,
+      "grad_norm": 0.035697367042303085,
+      "learning_rate": 1.2100886909287478e-05,
+      "loss": 0.7351,
+      "num_input_tokens_seen": 159383552,
+      "step": 76
+    },
+    {
+      "epoch": 0.6799116997792495,
+      "grad_norm": 0.035562630742788315,
+      "learning_rate": 1.151056794260289e-05,
+      "loss": 0.7295,
+      "num_input_tokens_seen": 161480704,
+      "step": 77
+    },
+    {
+      "epoch": 0.6887417218543046,
+      "grad_norm": 0.0363004133105278,
+      "learning_rate": 1.0930674755760908e-05,
+      "loss": 0.7333,
+      "num_input_tokens_seen": 163577856,
+      "step": 78
+    },
+    {
+      "epoch": 0.6975717439293598,
+      "grad_norm": 0.034324079751968384,
+      "learning_rate": 1.0361655539513565e-05,
+      "loss": 0.7334,
+      "num_input_tokens_seen": 165675008,
+      "step": 79
+    },
+    {
+      "epoch": 0.7064017660044151,
+      "grad_norm": 0.03306734561920166,
+      "learning_rate": 9.803950080284005e-06,
+      "loss": 0.7437,
+      "num_input_tokens_seen": 167772160,
+      "step": 80
+    },
+    {
+      "epoch": 0.7152317880794702,
+      "grad_norm": 0.03080984577536583,
+      "learning_rate": 9.257989420262151e-06,
+      "loss": 0.7133,
+      "num_input_tokens_seen": 169869312,
+      "step": 81
+    },
+    {
+      "epoch": 0.7240618101545254,
+      "grad_norm": 0.03477946296334267,
+      "learning_rate": 8.724195524258688e-06,
+      "loss": 0.7415,
+      "num_input_tokens_seen": 171966464,
+      "step": 82
+    },
+    {
+      "epoch": 0.7328918322295805,
+      "grad_norm": 0.03363949805498123,
+      "learning_rate": 8.202980953574735e-06,
+      "loss": 0.7423,
+      "num_input_tokens_seen": 174063616,
+      "step": 83
+    },
+    {
+      "epoch": 0.7417218543046358,
+      "grad_norm": 0.032141055911779404,
+      "learning_rate": 7.69474854713943e-06,
+      "loss": 0.7125,
+      "num_input_tokens_seen": 176160768,
+      "step": 84
+    },
+    {
+      "epoch": 0.7505518763796909,
+      "grad_norm": 0.03413500636816025,
+      "learning_rate": 7.1998911101617575e-06,
+      "loss": 0.7346,
+      "num_input_tokens_seen": 178257920,
+      "step": 85
+    },
+    {
+      "epoch": 0.7593818984547461,
+      "grad_norm": 0.032283272594213486,
+      "learning_rate": 6.718791110537287e-06,
+      "loss": 0.7235,
+      "num_input_tokens_seen": 180355072,
+      "step": 86
+    },
+    {
+      "epoch": 0.7682119205298014,
+      "grad_norm": 0.03352838382124901,
+      "learning_rate": 6.25182038324447e-06,
+      "loss": 0.7268,
+      "num_input_tokens_seen": 182452224,
+      "step": 87
+    },
+    {
+      "epoch": 0.7770419426048565,
+      "grad_norm": 0.033114444464445114,
+      "learning_rate": 5.7993398429589506e-06,
+      "loss": 0.7251,
+      "num_input_tokens_seen": 184549376,
+      "step": 88
+    },
+    {
+      "epoch": 0.7858719646799117,
+      "grad_norm": 0.03312879428267479,
+      "learning_rate": 5.361699205108042e-06,
+      "loss": 0.7216,
+      "num_input_tokens_seen": 186646528,
+      "step": 89
+    },
+    {
+      "epoch": 0.7947019867549668,
+      "grad_norm": 0.034220073372125626,
+      "learning_rate": 4.939236715580884e-06,
+      "loss": 0.7386,
+      "num_input_tokens_seen": 188743680,
+      "step": 90
+    },
+    {
+      "epoch": 0.8035320088300221,
+      "grad_norm": 0.03237886354327202,
+      "learning_rate": 4.5322788893033155e-06,
+      "loss": 0.734,
+      "num_input_tokens_seen": 190840832,
+      "step": 91
+    },
+    {
+      "epoch": 0.8123620309050773,
+      "grad_norm": 0.032567963004112244,
+      "learning_rate": 4.14114025787932e-06,
+      "loss": 0.7519,
+      "num_input_tokens_seen": 192937984,
+      "step": 92
+    },
+    {
+      "epoch": 0.8211920529801324,
+      "grad_norm": 0.032218772917985916,
+      "learning_rate": 3.7661231264943086e-06,
+      "loss": 0.7194,
+      "num_input_tokens_seen": 195035136,
+      "step": 93
+    },
+    {
+      "epoch": 0.8300220750551877,
+      "grad_norm": 0.0331319235265255,
+      "learning_rate": 3.4075173402679574e-06,
+      "loss": 0.7263,
+      "num_input_tokens_seen": 197132288,
+      "step": 94
+    },
+    {
+      "epoch": 0.8388520971302428,
+      "grad_norm": 0.03236314281821251,
+      "learning_rate": 3.0656000602372558e-06,
+      "loss": 0.7093,
+      "num_input_tokens_seen": 199229440,
+      "step": 95
+    },
+    {
+      "epoch": 0.847682119205298,
+      "grad_norm": 0.03465864434838295,
+      "learning_rate": 2.7406355491429086e-06,
+      "loss": 0.7654,
+      "num_input_tokens_seen": 201326592,
+      "step": 96
+    },
+    {
+      "epoch": 0.8565121412803532,
+      "grad_norm": 0.033873483538627625,
+      "learning_rate": 2.4328749671846116e-06,
+      "loss": 0.7365,
+      "num_input_tokens_seen": 203423744,
+      "step": 97
+    },
+    {
+      "epoch": 0.8653421633554084,
+      "grad_norm": 0.03332989290356636,
+      "learning_rate": 2.142556177903096e-06,
+      "loss": 0.7468,
+      "num_input_tokens_seen": 205520896,
+      "step": 98
+    },
+    {
+      "epoch": 0.8741721854304636,
+      "grad_norm": 0.033005475997924805,
+      "learning_rate": 1.8699035643389928e-06,
+      "loss": 0.7264,
+      "num_input_tokens_seen": 207618048,
+      "step": 99
+    },
+    {
+      "epoch": 0.8830022075055187,
+      "grad_norm": 0.034544438123703,
+      "learning_rate": 1.615127855610496e-06,
+      "loss": 0.7392,
+      "num_input_tokens_seen": 209715200,
+      "step": 100
+    },
+    {
+      "epoch": 0.891832229580574,
+      "grad_norm": 0.03568817302584648,
+      "learning_rate": 1.3784259640440279e-06,
+      "loss": 0.7378,
+      "num_input_tokens_seen": 211812352,
+      "step": 101
+    },
+    {
+      "epoch": 0.9006622516556292,
+      "grad_norm": 0.03453758731484413,
+      "learning_rate": 1.1599808329836177e-06,
+      "loss": 0.7384,
+      "num_input_tokens_seen": 213909504,
+      "step": 102
+    },
+    {
+      "epoch": 0.9094922737306843,
+      "grad_norm": 0.03320000693202019,
+      "learning_rate": 9.599612953967746e-07,
+      "loss": 0.7326,
+      "num_input_tokens_seen": 216006656,
+      "step": 103
+    },
+    {
+      "epoch": 0.9183222958057395,
+      "grad_norm": 0.033046990633010864,
+      "learning_rate": 7.785219433859847e-07,
+      "loss": 0.7319,
+      "num_input_tokens_seen": 218103808,
+      "step": 104
+    },
+    {
+      "epoch": 0.9271523178807947,
+      "grad_norm": 0.032943662256002426,
+      "learning_rate": 6.158030087068001e-07,
+      "loss": 0.7365,
+      "num_input_tokens_seen": 220200960,
+      "step": 105
+    },
+    {
+      "epoch": 0.9359823399558499,
+      "grad_norm": 0.03540504723787308,
+      "learning_rate": 4.719302543848225e-07,
+      "loss": 0.75,
+      "num_input_tokens_seen": 222298112,
+      "step": 106
+    },
+    {
+      "epoch": 0.9448123620309051,
+      "grad_norm": 0.03443380072712898,
+      "learning_rate": 3.470148775153448e-07,
+      "loss": 0.7468,
+      "num_input_tokens_seen": 224395264,
+      "step": 107
+    },
+    {
+      "epoch": 0.9536423841059603,
+      "grad_norm": 0.03330874443054199,
+      "learning_rate": 2.4115342332078074e-07,
+      "loss": 0.7425,
+      "num_input_tokens_seen": 226492416,
+      "step": 108
+    },
+    {
+      "epoch": 0.9624724061810155,
+      "grad_norm": 0.03456017002463341,
+      "learning_rate": 1.5442771053230665e-07,
+      "loss": 0.7314,
+      "num_input_tokens_seen": 228589568,
+      "step": 109
+    },
+    {
+      "epoch": 0.9713024282560706,
+      "grad_norm": 0.03423641249537468,
+      "learning_rate": 8.690476815339244e-08,
+      "loss": 0.7407,
+      "num_input_tokens_seen": 230686720,
+      "step": 110
+    },
+    {
+      "epoch": 0.9801324503311258,
+      "grad_norm": 0.03197889402508736,
+      "learning_rate": 3.8636783654100174e-08,
+      "loss": 0.729,
+      "num_input_tokens_seen": 232783872,
+      "step": 111
+    },
+    {
+      "epoch": 0.9889624724061811,
+      "grad_norm": 0.034751225262880325,
+      "learning_rate": 9.661062636148744e-09,
+      "loss": 0.7333,
+      "num_input_tokens_seen": 234881024,
+      "step": 112
+    },
+    {
+      "epoch": 0.9977924944812362,
+      "grad_norm": 0.03236055746674538,
+      "learning_rate": 0.0,
+      "loss": 0.7217,
+      "num_input_tokens_seen": 236978176,
+      "step": 113
+    },
+    {
+      "epoch": 0.9977924944812362,
+      "num_input_tokens_seen": 236978176,
+      "step": 113,
+      "total_flos": 1.0486889344470614e+19,
+      "train_loss": 0.7524756815581195,
+      "train_runtime": 19134.3763,
+      "train_samples_per_second": 3.027,
+      "train_steps_per_second": 0.006
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 113,
+  "num_input_tokens_seen": 236978176,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0486889344470614e+19,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f283812e758cb6fff59dbe2b0f45aae1e85b4a071df1fd285f6f24af74f6d93
+size 5624

training_args.yaml ADDED Viewed

	@@ -0,0 +1,39 @@

+apollo_rank: 256
+apollo_scale: 1
+apollo_target: all
+apollo_update_interval: 200
+bf16: true
+cutoff_len: 4096
+dataset: codes_330k_nsx
+dataset_dir: data
+ddp_timeout: 180000000
+do_train: true
+enable_liger_kernel: true
+finetuning_type: freeze
+flash_attn: auto
+freeze_trainable_layers: 2
+freeze_trainable_modules: all
+gradient_accumulation_steps: 8
+include_num_input_tokens_seen: true
+learning_rate: 5.0e-05
+logging_steps: 1
+lr_scheduler_type: cosine
+max_grad_norm: 1.0
+max_samples: 50000000
+model_name_or_path: infly/OpenCoder-8B-Instruct
+neat_packing: true
+num_train_epochs: 1.0
+output_dir: saves/OpenCoder-8B-Instruct/freeze/opencoder_nsx
+packing: true
+per_device_train_batch_size: 16
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+rope_scaling: llama3
+save_steps: 500
+stage: sft
+template: opencoder
+trust_remote_code: true
+use_apollo: true
+use_llama_pro: true
+warmup_steps: 0

training_loss.png ADDED Viewed