k1h0 committed 23 days ago

Commit

7bc029d

verified ·

1 Parent(s): 12ffb67

Upload folder using huggingface_hub

Browse files

Files changed (22) hide show

README.md +60 -0
added_tokens.json +44 -0
all_results.json +9 -0
config.json +36 -0
generation_config.json +6 -0
llamaboard_config.yaml +77 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +298 -0
running_log.txt +100 -0
special_tokens_map.json +34 -0
tokenization_inflm.py +292 -0
tokenizer.model +3 -0
tokenizer_config.json +396 -0
train_results.json +9 -0
trainer_log.jsonl +56 -0
trainer_state.json +483 -0
training_args.bin +3 -0
training_args.yaml +39 -0
training_loss.png +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,60 @@

+---
+library_name: transformers
+license: other
+base_model: infly/OpenCoder-8B-Instruct
+tags:
+- llama-factory
+- freeze
+- generated_from_trainer
+model-index:
+- name: opencoder-nlx-330k
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# opencoder-nlx-330k
+This model is a fine-tuned version of [infly/OpenCoder-8B-Instruct](https://huggingface.co/infly/OpenCoder-8B-Instruct) on the codes3_query_filtered_330k_nlx dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 512
+- total_eval_batch_size: 32
+- optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- num_epochs: 1.0
+### Training results
+### Framework versions
+- Transformers 4.48.2
+- Pytorch 2.5.1+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "<code_to_intermediate>": 96521,
+  "<empty_output>": 96520,
+  "<file_sep>": 96511,
+  "<fim_middle>": 96508,
+  "<fim_prefix>": 96507,
+  "<fim_suffix>": 96509,
+  "<intermediate_to_code>": 96522,
+  "<issue_closed>": 96514,
+  "<issue_comment>": 96513,
+  "<issue_start>": 96512,
+  "<jupyter_code>": 96517,
+  "<jupyter_output>": 96518,
+  "<jupyter_script>": 96519,
+  "<jupyter_start>": 96515,
+  "<jupyter_text>": 96516,
+  "<pr>": 96523,
+  "<pr_base>": 96526,
+  "<pr_base_code>": 96528,
+  "<pr_comment>": 96531,
+  "<pr_diff>": 96529,
+  "<pr_diff_hunk>": 96530,
+  "<pr_diff_hunk_comment_line>": 96538,
+  "<pr_event_id>": 96532,
+  "<pr_file>": 96527,
+  "<pr_in_reply_to_comment_id>": 96537,
+  "<pr_in_reply_to_review_id>": 96536,
+  "<pr_is_merged>": 96525,
+  "<pr_review>": 96533,
+  "<pr_review_comment>": 96535,
+  "<pr_review_state>": 96534,
+  "<pr_status>": 96524,
+  "<repo_name>": 96510,
+  "<|endoftext|>": 96506,
+  "<|end|>": 96500,
+  "<|im_end|>": 96539,
+  "<|im_start|>": 96540,
+  "<|message|>": 96501,
+  "<|pad|>": 96505,
+  "<|start|>": 96499,
+  "<|tool_end|>": 96504,
+  "<|tool_excute|>": 96503,
+  "<|tool_start|>": 96502
+}

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9865470852017937,
+    "num_input_tokens_seen": 115343360,
+    "total_flos": 5.104238176512246e+18,
+    "train_loss": 0.6637221011248502,
+    "train_runtime": 9208.1472,
+    "train_samples_per_second": 3.097,
+    "train_steps_per_second": 0.006
+}

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "infly/OpenCoder-8B-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 96540,
+  "eos_token_id": 96539,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 1.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.2",
+  "use_cache": false,
+  "vocab_size": 96640
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 96540,
+  "eos_token_id": 96539,
+  "transformers_version": "4.48.2"
+}

llamaboard_config.yaml ADDED Viewed

	@@ -0,0 +1,77 @@

+top.booster: liger_kernel
+top.checkpoint_path: null
+top.finetuning_type: freeze
+top.model_name: OpenCoder-8B-Instruct
+top.quantization_bit: none
+top.quantization_method: bitsandbytes
+top.rope_scaling: llama3
+top.template: opencoder
+train.additional_target: ''
+train.apollo_rank: 256
+train.apollo_scale: 1
+train.apollo_target: all
+train.apollo_update_interval: 200
+train.badam_mode: layer
+train.badam_switch_interval: 50
+train.badam_switch_mode: ascending
+train.badam_update_ratio: 0.05
+train.batch_size: 16
+train.compute_type: bf16
+train.create_new_adapter: false
+train.cutoff_len: 4096
+train.dataset:
+- codes3_query_filtered_330k_nlx
+train.dataset_dir: data
+train.ds_offload: false
+train.ds_stage: none
+train.extra_args: '{}'
+train.freeze_extra_modules: ''
+train.freeze_trainable_layers: 2
+train.freeze_trainable_modules: all
+train.galore_rank: 16
+train.galore_scale: 2
+train.galore_target: all
+train.galore_update_interval: 200
+train.gradient_accumulation_steps: 8
+train.learning_rate: 5e-5
+train.logging_steps: 1
+train.lora_alpha: 16
+train.lora_dropout: 0
+train.lora_rank: 8
+train.lora_target: ''
+train.loraplus_lr_ratio: 0
+train.lr_scheduler_type: cosine
+train.mask_history: false
+train.max_grad_norm: '1.0'
+train.max_samples: '50000000'
+train.neat_packing: true
+train.neftune_alpha: 0
+train.num_train_epochs: '1'
+train.packing: true
+train.ppo_score_norm: false
+train.ppo_whiten_rewards: false
+train.pref_beta: 0.1
+train.pref_ftx: 0
+train.pref_loss: sigmoid
+train.report_to:
+- none
+train.resize_vocab: false
+train.reward_model: null
+train.save_steps: 500
+train.swanlab_api_key: ''
+train.swanlab_mode: cloud
+train.swanlab_project: llamafactory
+train.swanlab_run_name: ''
+train.swanlab_workspace: ''
+train.train_on_prompt: false
+train.training_stage: Supervised Fine-Tuning
+train.use_apollo: true
+train.use_badam: false
+train.use_dora: false
+train.use_galore: false
+train.use_llama_pro: true
+train.use_pissa: false
+train.use_rslora: false
+train.use_swanlab: false
+train.val_size: 0
+train.warmup_steps: 0

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:182be20097743536eb8333475d94fdb0eeea45e2d7f20103b65a61c2771ee9bf
+size 4919027568

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f78d374b731ab44b3915afaa2345ffb30fe42ec01a5a1f600f50515ba42d471d
+size 4915915128

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d197d5eeb8b914e9d57c0db5dd3e6c28057fb738a0af2c79f69e6603f5d64fde
+size 4999819112

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5deaa08aa225bc85b0184ec247c1096add649475c24a52eceecdae65336ba8e
+size 1580246000

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,298 @@

+{
+  "metadata": {
+    "total_size": 16414973952
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}

running_log.txt ADDED Viewed

	@@ -0,0 +1,100 @@

+[INFO|2025-05-12 13:05:10] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/config.json
+[INFO|2025-05-12 13:05:10] configuration_utils.py:768 >> Model config LlamaConfig {
+  "_name_or_path": "infly/OpenCoder-8B-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 96540,
+  "eos_token_id": 96539,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.2",
+  "use_cache": true,
+  "vocab_size": 96640
+}
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2034 >> loading file ./tokenizer.model from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/./tokenizer.model
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/added_tokens.json
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/special_tokens_map.json
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/tokenizer_config.json
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at None
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
+[INFO|2025-05-12 13:05:13] tokenization_utils_base.py:2304 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|2025-05-12 13:05:14] configuration_utils.py:696 >> loading configuration file config.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/config.json
+[INFO|2025-05-12 13:05:14] configuration_utils.py:768 >> Model config LlamaConfig {
+  "_name_or_path": "infly/OpenCoder-8B-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 96540,
+  "eos_token_id": 96539,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.2",
+  "use_cache": true,
+  "vocab_size": 96640
+}
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2034 >> loading file ./tokenizer.model from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/./tokenizer.model
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2034 >> loading file added_tokens.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/added_tokens.json
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2034 >> loading file special_tokens_map.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/special_tokens_map.json
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2034 >> loading file tokenizer_config.json from cache at /home/kiho/.cache/huggingface/hub/models--infly--OpenCoder-8B-Instruct/snapshots/01badbbf10c2dfd7e2a0b5f570065ef44548576c/tokenizer_config.json
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2034 >> loading file tokenizer.json from cache at None
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2034 >> loading file chat_template.jinja from cache at None
+[INFO|2025-05-12 13:05:15] tokenization_utils_base.py:2304 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|2025-05-12 13:05:15] logging.py:157 >> Add <|im_end|> to stop words.
+[INFO|2025-05-12 13:05:15] logging.py:157 >> Loading dataset Codes3_query_filtered_330k_nlx.json...

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>"
+  ],
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenization_inflm.py ADDED Viewed

	@@ -0,0 +1,292 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for INFLMTokenizer."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+from tokenizers import pre_tokenizers,Regex,decoders
+from tokenizers.pre_tokenizers import Digits, Split, ByteLevel
+import os
+# same as gpt4 cl-base-100k
+PATTERN = Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+\s+(\S)+")
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+PRETRAINED_VOCAB_FILES_MAP = {}
+class INFLMTokenizer(PreTrainedTokenizer):
+    """
+    Construct a INFLMTokenizer tokenizer based on sentence-piece
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=False,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        spaces_between_special_tokens=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
+        self.pre_tokenizer = pre_tokenizers.Sequence([Split(pattern =PATTERN,behavior = "isolated", invert = False)])
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+        """ Initialisation"""
+    @property
+    def no_prefix_space_tokens(self):
+        if self._no_prefix_space_tokens is None:
+            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+        return self._no_prefix_space_tokens
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+    @property
+    def bos_token_id(self) -> Optional[int]:
+        return self.sp_model.bos_id()
+    @property
+    def eos_token_id(self) -> Optional[int]:
+        return self.sp_model.eos_id()
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        splits = self.pre_tokenizer.pre_tokenize_str(text)
+        texts=[]
+        for split in splits:
+            texts.extend(self.sp_model.encode(split[0], out_type=str))
+        return texts
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        if tokens and tokens[0] not in self.no_prefix_space_tokens:
+            return " " + decoded
+        else:
+            return decoded
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+        output = bos_token_ids + token_ids_0
+        if token_ids_1 is not None:
+            output = output + token_ids_1
+        if self.add_eos_token:
+            output = output + [self.eos_token_id]
+        return output
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return  ([0] * len(token_ids_0)) + eos_token_id
+        return  ([0] * len(token_ids_0)) + eos_token_id + ([0] * len(token_ids_1)) + eos_token_id
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+        Note this is only used for back compatiblity, thus list of zero is returned.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        eos = [self.eos_token_id]
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+    @property
+    def default_chat_template(self):
+        return None
+    def decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        # default spaces_between_special_tokens should be false.
+        if spaces_between_special_tokens:
+            logger.warning_once('spaces_between_special_tokens is set. \
+                                It has no effect for bos,eos,pad,unk when transformers<=4.38.')
+        return super().decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76d43d618fc0c5a7c79dc4e72579f9f29bb803b36e4a4d709d1233626fd8fe2a
+size 1535725

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,396 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96499": {
+      "content": "<|start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96500": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96501": {
+      "content": "<|message|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96502": {
+      "content": "<|tool_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96503": {
+      "content": "<|tool_excute|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96504": {
+      "content": "<|tool_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96505": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96506": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96507": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96508": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96509": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96510": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96511": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96512": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96513": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96514": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96515": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96516": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96517": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96518": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96519": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96520": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96521": {
+      "content": "<code_to_intermediate>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96522": {
+      "content": "<intermediate_to_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96523": {
+      "content": "<pr>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96524": {
+      "content": "<pr_status>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96525": {
+      "content": "<pr_is_merged>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96526": {
+      "content": "<pr_base>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96527": {
+      "content": "<pr_file>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96528": {
+      "content": "<pr_base_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96529": {
+      "content": "<pr_diff>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96530": {
+      "content": "<pr_diff_hunk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96531": {
+      "content": "<pr_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96532": {
+      "content": "<pr_event_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96533": {
+      "content": "<pr_review>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96534": {
+      "content": "<pr_review_state>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96535": {
+      "content": "<pr_review_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96536": {
+      "content": "<pr_in_reply_to_review_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96537": {
+      "content": "<pr_in_reply_to_comment_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96538": {
+      "content": "<pr_diff_hunk_comment_line>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96539": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96540": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_inflm.INFLMTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are OpenCoder, created by OpenCoder Team.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "model_max_length": 4096,
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "return_tensors": true,
+  "spaces_between_special_tokens": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "INFLMTokenizer",
+  "unk_token": "<unk>"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9865470852017937,
+    "num_input_tokens_seen": 115343360,
+    "total_flos": 5.104238176512246e+18,
+    "train_loss": 0.6637221011248502,
+    "train_runtime": 9208.1472,
+    "train_samples_per_second": 3.097,
+    "train_steps_per_second": 0.006
+}

trainer_log.jsonl ADDED Viewed

	@@ -0,0 +1,56 @@

+{"current_steps": 1, "total_steps": 55, "loss": 0.8371, "lr": 4.995922759815339e-05, "epoch": 0.017937219730941704, "percentage": 1.82, "elapsed_time": "0:02:58", "remaining_time": "2:40:59", "throughput": 11723.96, "total_tokens": 2097152}
+{"current_steps": 2, "total_steps": 55, "loss": 0.7804, "lr": 4.9837043383713753e-05, "epoch": 0.03587443946188341, "percentage": 3.64, "elapsed_time": "0:05:45", "remaining_time": "2:32:35", "throughput": 12140.47, "total_tokens": 4194304}
+{"current_steps": 3, "total_steps": 55, "loss": 0.7695, "lr": 4.963384589619233e-05, "epoch": 0.053811659192825115, "percentage": 5.45, "elapsed_time": "0:08:32", "remaining_time": "2:27:59", "throughput": 12280.9, "total_tokens": 6291456}
+{"current_steps": 4, "total_steps": 55, "loss": 0.7419, "lr": 4.935029792355834e-05, "epoch": 0.07174887892376682, "percentage": 7.27, "elapsed_time": "0:11:18", "remaining_time": "2:24:10", "throughput": 12363.46, "total_tokens": 8388608}
+{"current_steps": 5, "total_steps": 55, "loss": 0.7166, "lr": 4.898732434036244e-05, "epoch": 0.08968609865470852, "percentage": 9.09, "elapsed_time": "0:14:04", "remaining_time": "2:20:47", "throughput": 12412.88, "total_tokens": 10485760}
+{"current_steps": 6, "total_steps": 55, "loss": 0.7194, "lr": 4.854610909098812e-05, "epoch": 0.10762331838565023, "percentage": 10.91, "elapsed_time": "0:16:51", "remaining_time": "2:17:39", "throughput": 12441.93, "total_tokens": 12582912}
+{"current_steps": 7, "total_steps": 55, "loss": 0.6975, "lr": 4.802809132787125e-05, "epoch": 0.12556053811659193, "percentage": 12.73, "elapsed_time": "0:19:37", "remaining_time": "2:14:33", "throughput": 12468.67, "total_tokens": 14680064}
+{"current_steps": 8, "total_steps": 55, "loss": 0.7168, "lr": 4.743496071728396e-05, "epoch": 0.14349775784753363, "percentage": 14.55, "elapsed_time": "0:22:23", "remaining_time": "2:11:33", "throughput": 12487.26, "total_tokens": 16777216}
+{"current_steps": 9, "total_steps": 55, "loss": 0.6707, "lr": 4.6768651927994434e-05, "epoch": 0.16143497757847533, "percentage": 16.36, "elapsed_time": "0:25:10", "remaining_time": "2:08:38", "throughput": 12498.39, "total_tokens": 18874368}
+{"current_steps": 10, "total_steps": 55, "loss": 0.6769, "lr": 4.6031338320779534e-05, "epoch": 0.17937219730941703, "percentage": 18.18, "elapsed_time": "0:27:56", "remaining_time": "2:05:43", "throughput": 12510.8, "total_tokens": 20971520}
+{"current_steps": 11, "total_steps": 55, "loss": 0.6873, "lr": 4.522542485937369e-05, "epoch": 0.19730941704035873, "percentage": 20.0, "elapsed_time": "0:30:42", "remaining_time": "2:02:49", "throughput": 12521.66, "total_tokens": 23068672}
+{"current_steps": 12, "total_steps": 55, "loss": 0.6643, "lr": 4.4353540265977064e-05, "epoch": 0.21524663677130046, "percentage": 21.82, "elapsed_time": "0:33:28", "remaining_time": "1:59:55", "throughput": 12532.39, "total_tokens": 25165824}
+{"current_steps": 13, "total_steps": 55, "loss": 0.6849, "lr": 4.341852844691012e-05, "epoch": 0.23318385650224216, "percentage": 23.64, "elapsed_time": "0:36:14", "remaining_time": "1:57:05", "throughput": 12536.36, "total_tokens": 27262976}
+{"current_steps": 14, "total_steps": 55, "loss": 0.6461, "lr": 4.242343921638234e-05, "epoch": 0.25112107623318386, "percentage": 25.45, "elapsed_time": "0:39:02", "remaining_time": "1:54:18", "throughput": 12536.26, "total_tokens": 29360128}
+{"current_steps": 15, "total_steps": 55, "loss": 0.6623, "lr": 4.137151834863213e-05, "epoch": 0.26905829596412556, "percentage": 27.27, "elapsed_time": "0:41:50", "remaining_time": "1:51:33", "throughput": 12532.13, "total_tokens": 31457280}
+{"current_steps": 16, "total_steps": 55, "loss": 0.6751, "lr": 4.0266196990885955e-05, "epoch": 0.28699551569506726, "percentage": 29.09, "elapsed_time": "0:44:37", "remaining_time": "1:48:47", "throughput": 12530.23, "total_tokens": 33554432}
+{"current_steps": 17, "total_steps": 55, "loss": 0.6472, "lr": 3.911108047166924e-05, "epoch": 0.30493273542600896, "percentage": 30.91, "elapsed_time": "0:47:24", "remaining_time": "1:45:58", "throughput": 12533.91, "total_tokens": 35651584}
+{"current_steps": 18, "total_steps": 55, "loss": 0.6728, "lr": 3.790993654097405e-05, "epoch": 0.32286995515695066, "percentage": 32.73, "elapsed_time": "0:50:10", "remaining_time": "1:43:08", "throughput": 12538.59, "total_tokens": 37748736}
+{"current_steps": 19, "total_steps": 55, "loss": 0.7017, "lr": 3.6666683080641846e-05, "epoch": 0.34080717488789236, "percentage": 34.55, "elapsed_time": "0:52:56", "remaining_time": "1:40:19", "throughput": 12542.68, "total_tokens": 39845888}
+{"current_steps": 20, "total_steps": 55, "loss": 0.6502, "lr": 3.5385375325047166e-05, "epoch": 0.35874439461883406, "percentage": 36.36, "elapsed_time": "0:55:43", "remaining_time": "1:37:30", "throughput": 12545.22, "total_tokens": 41943040}
+{"current_steps": 21, "total_steps": 55, "loss": 0.6476, "lr": 3.4070192633766025e-05, "epoch": 0.37668161434977576, "percentage": 38.18, "elapsed_time": "0:58:29", "remaining_time": "1:34:42", "throughput": 12548.26, "total_tokens": 44040192}
+{"current_steps": 22, "total_steps": 55, "loss": 0.6411, "lr": 3.272542485937369e-05, "epoch": 0.39461883408071746, "percentage": 40.0, "elapsed_time": "1:01:16", "remaining_time": "1:31:54", "throughput": 12550.81, "total_tokens": 46137344}
+{"current_steps": 23, "total_steps": 55, "loss": 0.6428, "lr": 3.135545835483718e-05, "epoch": 0.4125560538116592, "percentage": 41.82, "elapsed_time": "1:04:02", "remaining_time": "1:29:05", "throughput": 12553.59, "total_tokens": 48234496}
+{"current_steps": 24, "total_steps": 55, "loss": 0.6661, "lr": 2.996476166614364e-05, "epoch": 0.4304932735426009, "percentage": 43.64, "elapsed_time": "1:06:49", "remaining_time": "1:26:18", "throughput": 12553.66, "total_tokens": 50331648}
+{"current_steps": 25, "total_steps": 55, "loss": 0.6378, "lr": 2.8557870956832132e-05, "epoch": 0.4484304932735426, "percentage": 45.45, "elapsed_time": "1:09:35", "remaining_time": "1:23:30", "throughput": 12556.55, "total_tokens": 52428800}
+{"current_steps": 26, "total_steps": 55, "loss": 0.6532, "lr": 2.7139375211970996e-05, "epoch": 0.4663677130044843, "percentage": 47.27, "elapsed_time": "1:12:21", "remaining_time": "1:20:42", "throughput": 12559.06, "total_tokens": 54525952}
+{"current_steps": 27, "total_steps": 55, "loss": 0.6403, "lr": 2.5713901269842404e-05, "epoch": 0.484304932735426, "percentage": 49.09, "elapsed_time": "1:15:08", "remaining_time": "1:17:55", "throughput": 12558.03, "total_tokens": 56623104}
+{"current_steps": 28, "total_steps": 55, "loss": 0.6248, "lr": 2.42860987301576e-05, "epoch": 0.5022421524663677, "percentage": 50.91, "elapsed_time": "1:17:56", "remaining_time": "1:15:09", "throughput": 12555.75, "total_tokens": 58720256}
+{"current_steps": 29, "total_steps": 55, "loss": 0.6583, "lr": 2.2860624788029013e-05, "epoch": 0.5201793721973094, "percentage": 52.73, "elapsed_time": "1:20:43", "remaining_time": "1:12:22", "throughput": 12557.32, "total_tokens": 60817408}
+{"current_steps": 30, "total_steps": 55, "loss": 0.6579, "lr": 2.1442129043167874e-05, "epoch": 0.5381165919282511, "percentage": 54.55, "elapsed_time": "1:23:29", "remaining_time": "1:09:34", "throughput": 12558.75, "total_tokens": 62914560}
+{"current_steps": 31, "total_steps": 55, "loss": 0.6659, "lr": 2.003523833385637e-05, "epoch": 0.5560538116591929, "percentage": 56.36, "elapsed_time": "1:26:16", "remaining_time": "1:06:47", "throughput": 12560.18, "total_tokens": 65011712}
+{"current_steps": 32, "total_steps": 55, "loss": 0.6423, "lr": 1.8644541645162834e-05, "epoch": 0.5739910313901345, "percentage": 58.18, "elapsed_time": "1:29:02", "remaining_time": "1:04:00", "throughput": 12560.33, "total_tokens": 67108864}
+{"current_steps": 33, "total_steps": 55, "loss": 0.6509, "lr": 1.7274575140626318e-05, "epoch": 0.5919282511210763, "percentage": 60.0, "elapsed_time": "1:31:49", "remaining_time": "1:01:13", "throughput": 12560.22, "total_tokens": 69206016}
+{"current_steps": 34, "total_steps": 55, "loss": 0.6551, "lr": 1.5929807366233977e-05, "epoch": 0.6098654708520179, "percentage": 61.82, "elapsed_time": "1:34:37", "remaining_time": "0:58:26", "throughput": 12559.4, "total_tokens": 71303168}
+{"current_steps": 35, "total_steps": 55, "loss": 0.6232, "lr": 1.4614624674952842e-05, "epoch": 0.6278026905829597, "percentage": 63.64, "elapsed_time": "1:37:23", "remaining_time": "0:55:39", "throughput": 12560.48, "total_tokens": 73400320}
+{"current_steps": 36, "total_steps": 55, "loss": 0.6137, "lr": 1.3333316919358157e-05, "epoch": 0.6457399103139013, "percentage": 65.45, "elapsed_time": "1:40:10", "remaining_time": "0:52:52", "throughput": 12560.55, "total_tokens": 75497472}
+{"current_steps": 37, "total_steps": 55, "loss": 0.6426, "lr": 1.2090063459025955e-05, "epoch": 0.6636771300448431, "percentage": 67.27, "elapsed_time": "1:42:57", "remaining_time": "0:50:05", "throughput": 12560.88, "total_tokens": 77594624}
+{"current_steps": 38, "total_steps": 55, "loss": 0.6512, "lr": 1.0888919528330777e-05, "epoch": 0.6816143497757847, "percentage": 69.09, "elapsed_time": "1:45:44", "remaining_time": "0:47:18", "throughput": 12560.79, "total_tokens": 79691776}
+{"current_steps": 39, "total_steps": 55, "loss": 0.6269, "lr": 9.733803009114045e-06, "epoch": 0.6995515695067265, "percentage": 70.91, "elapsed_time": "1:48:31", "remaining_time": "0:44:31", "throughput": 12561.62, "total_tokens": 81788928}
+{"current_steps": 40, "total_steps": 55, "loss": 0.6201, "lr": 8.628481651367876e-06, "epoch": 0.7174887892376681, "percentage": 72.73, "elapsed_time": "1:51:17", "remaining_time": "0:41:43", "throughput": 12563.27, "total_tokens": 83886080}
+{"current_steps": 41, "total_steps": 55, "loss": 0.642, "lr": 7.576560783617668e-06, "epoch": 0.7354260089686099, "percentage": 74.55, "elapsed_time": "1:54:03", "remaining_time": "0:38:56", "throughput": 12563.83, "total_tokens": 85983232}
+{"current_steps": 42, "total_steps": 55, "loss": 0.648, "lr": 6.5814715530898745e-06, "epoch": 0.7533632286995515, "percentage": 76.36, "elapsed_time": "1:56:50", "remaining_time": "0:36:09", "throughput": 12564.32, "total_tokens": 88080384}
+{"current_steps": 43, "total_steps": 55, "loss": 0.6442, "lr": 5.646459734022938e-06, "epoch": 0.7713004484304933, "percentage": 78.18, "elapsed_time": "1:59:37", "remaining_time": "0:33:22", "throughput": 12564.19, "total_tokens": 90177536}
+{"current_steps": 44, "total_steps": 55, "loss": 0.6488, "lr": 4.7745751406263165e-06, "epoch": 0.7892376681614349, "percentage": 80.0, "elapsed_time": "2:02:24", "remaining_time": "0:30:36", "throughput": 12563.71, "total_tokens": 92274688}
+{"current_steps": 45, "total_steps": 55, "loss": 0.65, "lr": 3.968661679220468e-06, "epoch": 0.8071748878923767, "percentage": 81.82, "elapsed_time": "2:05:11", "remaining_time": "0:27:49", "throughput": 12563.43, "total_tokens": 94371840}
+{"current_steps": 46, "total_steps": 55, "loss": 0.6584, "lr": 3.2313480720055745e-06, "epoch": 0.8251121076233184, "percentage": 83.64, "elapsed_time": "2:07:58", "remaining_time": "0:25:02", "throughput": 12563.12, "total_tokens": 96468992}
+{"current_steps": 47, "total_steps": 55, "loss": 0.6392, "lr": 2.565039282716045e-06, "epoch": 0.8430493273542601, "percentage": 85.45, "elapsed_time": "2:10:45", "remaining_time": "0:22:15", "throughput": 12562.87, "total_tokens": 98566144}
+{"current_steps": 48, "total_steps": 55, "loss": 0.6524, "lr": 1.97190867212875e-06, "epoch": 0.8609865470852018, "percentage": 87.27, "elapsed_time": "2:13:32", "remaining_time": "0:19:28", "throughput": 12562.77, "total_tokens": 100663296}
+{"current_steps": 49, "total_steps": 55, "loss": 0.6276, "lr": 1.4538909090118846e-06, "epoch": 0.8789237668161435, "percentage": 89.09, "elapsed_time": "2:16:19", "remaining_time": "0:16:41", "throughput": 12562.47, "total_tokens": 102760448}
+{"current_steps": 50, "total_steps": 55, "loss": 0.6282, "lr": 1.0126756596375686e-06, "epoch": 0.8968609865470852, "percentage": 90.91, "elapsed_time": "2:19:07", "remaining_time": "0:13:54", "throughput": 12562.14, "total_tokens": 104857600}
+{"current_steps": 51, "total_steps": 55, "loss": 0.6344, "lr": 6.497020764416633e-07, "epoch": 0.9147982062780269, "percentage": 92.73, "elapsed_time": "2:21:54", "remaining_time": "0:11:07", "throughput": 12561.89, "total_tokens": 106954752}
+{"current_steps": 52, "total_steps": 55, "loss": 0.6464, "lr": 3.6615410380767544e-07, "epoch": 0.9327354260089686, "percentage": 94.55, "elapsed_time": "2:24:41", "remaining_time": "0:08:20", "throughput": 12561.72, "total_tokens": 109051904}
+{"current_steps": 53, "total_steps": 55, "loss": 0.6253, "lr": 1.6295661628624447e-07, "epoch": 0.9506726457399103, "percentage": 96.36, "elapsed_time": "2:27:28", "remaining_time": "0:05:33", "throughput": 12561.53, "total_tokens": 111149056}
+{"current_steps": 54, "total_steps": 55, "loss": 0.6375, "lr": 4.07724018466088e-08, "epoch": 0.968609865470852, "percentage": 98.18, "elapsed_time": "2:30:15", "remaining_time": "0:02:46", "throughput": 12561.27, "total_tokens": 113246208}
+{"current_steps": 55, "total_steps": 55, "loss": 0.6419, "lr": 0.0, "epoch": 0.9865470852017937, "percentage": 100.0, "elapsed_time": "2:33:02", "remaining_time": "0:00:00", "throughput": 12561.25, "total_tokens": 115343360}
+{"current_steps": 55, "total_steps": 55, "epoch": 0.9865470852017937, "percentage": 100.0, "elapsed_time": "2:33:27", "remaining_time": "0:00:00", "throughput": 12527.75, "total_tokens": 115343360}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,483 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9865470852017937,
+  "eval_steps": 500,
+  "global_step": 55,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.017937219730941704,
+      "grad_norm": 0.6536183953285217,
+      "learning_rate": 4.995922759815339e-05,
+      "loss": 0.8371,
+      "num_input_tokens_seen": 2097152,
+      "step": 1
+    },
+    {
+      "epoch": 0.03587443946188341,
+      "grad_norm": 0.517680823802948,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.7804,
+      "num_input_tokens_seen": 4194304,
+      "step": 2
+    },
+    {
+      "epoch": 0.053811659192825115,
+      "grad_norm": 0.4423481225967407,
+      "learning_rate": 4.963384589619233e-05,
+      "loss": 0.7695,
+      "num_input_tokens_seen": 6291456,
+      "step": 3
+    },
+    {
+      "epoch": 0.07174887892376682,
+      "grad_norm": 0.39828750491142273,
+      "learning_rate": 4.935029792355834e-05,
+      "loss": 0.7419,
+      "num_input_tokens_seen": 8388608,
+      "step": 4
+    },
+    {
+      "epoch": 0.08968609865470852,
+      "grad_norm": 0.31201115250587463,
+      "learning_rate": 4.898732434036244e-05,
+      "loss": 0.7166,
+      "num_input_tokens_seen": 10485760,
+      "step": 5
+    },
+    {
+      "epoch": 0.10762331838565023,
+      "grad_norm": 0.2536958158016205,
+      "learning_rate": 4.854610909098812e-05,
+      "loss": 0.7194,
+      "num_input_tokens_seen": 12582912,
+      "step": 6
+    },
+    {
+      "epoch": 0.12556053811659193,
+      "grad_norm": 0.2193588763475418,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.6975,
+      "num_input_tokens_seen": 14680064,
+      "step": 7
+    },
+    {
+      "epoch": 0.14349775784753363,
+      "grad_norm": 0.18916621804237366,
+      "learning_rate": 4.743496071728396e-05,
+      "loss": 0.7168,
+      "num_input_tokens_seen": 16777216,
+      "step": 8
+    },
+    {
+      "epoch": 0.16143497757847533,
+      "grad_norm": 0.1561172604560852,
+      "learning_rate": 4.6768651927994434e-05,
+      "loss": 0.6707,
+      "num_input_tokens_seen": 18874368,
+      "step": 9
+    },
+    {
+      "epoch": 0.17937219730941703,
+      "grad_norm": 0.12857139110565186,
+      "learning_rate": 4.6031338320779534e-05,
+      "loss": 0.6769,
+      "num_input_tokens_seen": 20971520,
+      "step": 10
+    },
+    {
+      "epoch": 0.19730941704035873,
+      "grad_norm": 0.11340289562940598,
+      "learning_rate": 4.522542485937369e-05,
+      "loss": 0.6873,
+      "num_input_tokens_seen": 23068672,
+      "step": 11
+    },
+    {
+      "epoch": 0.21524663677130046,
+      "grad_norm": 0.10658581554889679,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.6643,
+      "num_input_tokens_seen": 25165824,
+      "step": 12
+    },
+    {
+      "epoch": 0.23318385650224216,
+      "grad_norm": 0.08937722444534302,
+      "learning_rate": 4.341852844691012e-05,
+      "loss": 0.6849,
+      "num_input_tokens_seen": 27262976,
+      "step": 13
+    },
+    {
+      "epoch": 0.25112107623318386,
+      "grad_norm": 0.07756289094686508,
+      "learning_rate": 4.242343921638234e-05,
+      "loss": 0.6461,
+      "num_input_tokens_seen": 29360128,
+      "step": 14
+    },
+    {
+      "epoch": 0.26905829596412556,
+      "grad_norm": 0.07581546157598495,
+      "learning_rate": 4.137151834863213e-05,
+      "loss": 0.6623,
+      "num_input_tokens_seen": 31457280,
+      "step": 15
+    },
+    {
+      "epoch": 0.28699551569506726,
+      "grad_norm": 0.07386067509651184,
+      "learning_rate": 4.0266196990885955e-05,
+      "loss": 0.6751,
+      "num_input_tokens_seen": 33554432,
+      "step": 16
+    },
+    {
+      "epoch": 0.30493273542600896,
+      "grad_norm": 0.06293580681085587,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.6472,
+      "num_input_tokens_seen": 35651584,
+      "step": 17
+    },
+    {
+      "epoch": 0.32286995515695066,
+      "grad_norm": 0.06199085712432861,
+      "learning_rate": 3.790993654097405e-05,
+      "loss": 0.6728,
+      "num_input_tokens_seen": 37748736,
+      "step": 18
+    },
+    {
+      "epoch": 0.34080717488789236,
+      "grad_norm": 0.060734592378139496,
+      "learning_rate": 3.6666683080641846e-05,
+      "loss": 0.7017,
+      "num_input_tokens_seen": 39845888,
+      "step": 19
+    },
+    {
+      "epoch": 0.35874439461883406,
+      "grad_norm": 0.05623164027929306,
+      "learning_rate": 3.5385375325047166e-05,
+      "loss": 0.6502,
+      "num_input_tokens_seen": 41943040,
+      "step": 20
+    },
+    {
+      "epoch": 0.37668161434977576,
+      "grad_norm": 0.0574677549302578,
+      "learning_rate": 3.4070192633766025e-05,
+      "loss": 0.6476,
+      "num_input_tokens_seen": 44040192,
+      "step": 21
+    },
+    {
+      "epoch": 0.39461883408071746,
+      "grad_norm": 0.05185185372829437,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.6411,
+      "num_input_tokens_seen": 46137344,
+      "step": 22
+    },
+    {
+      "epoch": 0.4125560538116592,
+      "grad_norm": 0.05139186978340149,
+      "learning_rate": 3.135545835483718e-05,
+      "loss": 0.6428,
+      "num_input_tokens_seen": 48234496,
+      "step": 23
+    },
+    {
+      "epoch": 0.4304932735426009,
+      "grad_norm": 0.050159115344285965,
+      "learning_rate": 2.996476166614364e-05,
+      "loss": 0.6661,
+      "num_input_tokens_seen": 50331648,
+      "step": 24
+    },
+    {
+      "epoch": 0.4484304932735426,
+      "grad_norm": 0.04851464927196503,
+      "learning_rate": 2.8557870956832132e-05,
+      "loss": 0.6378,
+      "num_input_tokens_seen": 52428800,
+      "step": 25
+    },
+    {
+      "epoch": 0.4663677130044843,
+      "grad_norm": 0.04896726831793785,
+      "learning_rate": 2.7139375211970996e-05,
+      "loss": 0.6532,
+      "num_input_tokens_seen": 54525952,
+      "step": 26
+    },
+    {
+      "epoch": 0.484304932735426,
+      "grad_norm": 0.04698600620031357,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.6403,
+      "num_input_tokens_seen": 56623104,
+      "step": 27
+    },
+    {
+      "epoch": 0.5022421524663677,
+      "grad_norm": 0.048034097999334335,
+      "learning_rate": 2.42860987301576e-05,
+      "loss": 0.6248,
+      "num_input_tokens_seen": 58720256,
+      "step": 28
+    },
+    {
+      "epoch": 0.5201793721973094,
+      "grad_norm": 0.044828303158283234,
+      "learning_rate": 2.2860624788029013e-05,
+      "loss": 0.6583,
+      "num_input_tokens_seen": 60817408,
+      "step": 29
+    },
+    {
+      "epoch": 0.5381165919282511,
+      "grad_norm": 0.04563640430569649,
+      "learning_rate": 2.1442129043167874e-05,
+      "loss": 0.6579,
+      "num_input_tokens_seen": 62914560,
+      "step": 30
+    },
+    {
+      "epoch": 0.5560538116591929,
+      "grad_norm": 0.044318560510873795,
+      "learning_rate": 2.003523833385637e-05,
+      "loss": 0.6659,
+      "num_input_tokens_seen": 65011712,
+      "step": 31
+    },
+    {
+      "epoch": 0.5739910313901345,
+      "grad_norm": 0.04331167787313461,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.6423,
+      "num_input_tokens_seen": 67108864,
+      "step": 32
+    },
+    {
+      "epoch": 0.5919282511210763,
+      "grad_norm": 0.04475367069244385,
+      "learning_rate": 1.7274575140626318e-05,
+      "loss": 0.6509,
+      "num_input_tokens_seen": 69206016,
+      "step": 33
+    },
+    {
+      "epoch": 0.6098654708520179,
+      "grad_norm": 0.045547887682914734,
+      "learning_rate": 1.5929807366233977e-05,
+      "loss": 0.6551,
+      "num_input_tokens_seen": 71303168,
+      "step": 34
+    },
+    {
+      "epoch": 0.6278026905829597,
+      "grad_norm": 0.043985530734062195,
+      "learning_rate": 1.4614624674952842e-05,
+      "loss": 0.6232,
+      "num_input_tokens_seen": 73400320,
+      "step": 35
+    },
+    {
+      "epoch": 0.6457399103139013,
+      "grad_norm": 0.0414094403386116,
+      "learning_rate": 1.3333316919358157e-05,
+      "loss": 0.6137,
+      "num_input_tokens_seen": 75497472,
+      "step": 36
+    },
+    {
+      "epoch": 0.6636771300448431,
+      "grad_norm": 0.041019294410943985,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.6426,
+      "num_input_tokens_seen": 77594624,
+      "step": 37
+    },
+    {
+      "epoch": 0.6816143497757847,
+      "grad_norm": 0.04383592680096626,
+      "learning_rate": 1.0888919528330777e-05,
+      "loss": 0.6512,
+      "num_input_tokens_seen": 79691776,
+      "step": 38
+    },
+    {
+      "epoch": 0.6995515695067265,
+      "grad_norm": 0.040539514273405075,
+      "learning_rate": 9.733803009114045e-06,
+      "loss": 0.6269,
+      "num_input_tokens_seen": 81788928,
+      "step": 39
+    },
+    {
+      "epoch": 0.7174887892376681,
+      "grad_norm": 0.04238974675536156,
+      "learning_rate": 8.628481651367876e-06,
+      "loss": 0.6201,
+      "num_input_tokens_seen": 83886080,
+      "step": 40
+    },
+    {
+      "epoch": 0.7354260089686099,
+      "grad_norm": 0.04115669056773186,
+      "learning_rate": 7.576560783617668e-06,
+      "loss": 0.642,
+      "num_input_tokens_seen": 85983232,
+      "step": 41
+    },
+    {
+      "epoch": 0.7533632286995515,
+      "grad_norm": 0.04178008437156677,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.648,
+      "num_input_tokens_seen": 88080384,
+      "step": 42
+    },
+    {
+      "epoch": 0.7713004484304933,
+      "grad_norm": 0.04329155012965202,
+      "learning_rate": 5.646459734022938e-06,
+      "loss": 0.6442,
+      "num_input_tokens_seen": 90177536,
+      "step": 43
+    },
+    {
+      "epoch": 0.7892376681614349,
+      "grad_norm": 0.043740272521972656,
+      "learning_rate": 4.7745751406263165e-06,
+      "loss": 0.6488,
+      "num_input_tokens_seen": 92274688,
+      "step": 44
+    },
+    {
+      "epoch": 0.8071748878923767,
+      "grad_norm": 0.04263562709093094,
+      "learning_rate": 3.968661679220468e-06,
+      "loss": 0.65,
+      "num_input_tokens_seen": 94371840,
+      "step": 45
+    },
+    {
+      "epoch": 0.8251121076233184,
+      "grad_norm": 0.041693028062582016,
+      "learning_rate": 3.2313480720055745e-06,
+      "loss": 0.6584,
+      "num_input_tokens_seen": 96468992,
+      "step": 46
+    },
+    {
+      "epoch": 0.8430493273542601,
+      "grad_norm": 0.04151754826307297,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.6392,
+      "num_input_tokens_seen": 98566144,
+      "step": 47
+    },
+    {
+      "epoch": 0.8609865470852018,
+      "grad_norm": 0.04260968416929245,
+      "learning_rate": 1.97190867212875e-06,
+      "loss": 0.6524,
+      "num_input_tokens_seen": 100663296,
+      "step": 48
+    },
+    {
+      "epoch": 0.8789237668161435,
+      "grad_norm": 0.04022514820098877,
+      "learning_rate": 1.4538909090118846e-06,
+      "loss": 0.6276,
+      "num_input_tokens_seen": 102760448,
+      "step": 49
+    },
+    {
+      "epoch": 0.8968609865470852,
+      "grad_norm": 0.039072513580322266,
+      "learning_rate": 1.0126756596375686e-06,
+      "loss": 0.6282,
+      "num_input_tokens_seen": 104857600,
+      "step": 50
+    },
+    {
+      "epoch": 0.9147982062780269,
+      "grad_norm": 0.03952722251415253,
+      "learning_rate": 6.497020764416633e-07,
+      "loss": 0.6344,
+      "num_input_tokens_seen": 106954752,
+      "step": 51
+    },
+    {
+      "epoch": 0.9327354260089686,
+      "grad_norm": 0.04045777767896652,
+      "learning_rate": 3.6615410380767544e-07,
+      "loss": 0.6464,
+      "num_input_tokens_seen": 109051904,
+      "step": 52
+    },
+    {
+      "epoch": 0.9506726457399103,
+      "grad_norm": 0.03984501212835312,
+      "learning_rate": 1.6295661628624447e-07,
+      "loss": 0.6253,
+      "num_input_tokens_seen": 111149056,
+      "step": 53
+    },
+    {
+      "epoch": 0.968609865470852,
+      "grad_norm": 0.040761884301900864,
+      "learning_rate": 4.07724018466088e-08,
+      "loss": 0.6375,
+      "num_input_tokens_seen": 113246208,
+      "step": 54
+    },
+    {
+      "epoch": 0.9865470852017937,
+      "grad_norm": 0.04142209142446518,
+      "learning_rate": 0.0,
+      "loss": 0.6419,
+      "num_input_tokens_seen": 115343360,
+      "step": 55
+    },
+    {
+      "epoch": 0.9865470852017937,
+      "num_input_tokens_seen": 115343360,
+      "step": 55,
+      "total_flos": 5.104238176512246e+18,
+      "train_loss": 0.6637221011248502,
+      "train_runtime": 9208.1472,
+      "train_samples_per_second": 3.097,
+      "train_steps_per_second": 0.006
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 55,
+  "num_input_tokens_seen": 115343360,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.104238176512246e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e164dab6761d5ccf7f25c775d976260379deb4b28f82ec691fefd0f00aae4acd
+size 5688

training_args.yaml ADDED Viewed

	@@ -0,0 +1,39 @@

+apollo_rank: 256
+apollo_scale: 1
+apollo_target: all
+apollo_update_interval: 200
+bf16: true
+cutoff_len: 4096
+dataset: codes3_query_filtered_330k_nlx
+dataset_dir: data
+ddp_timeout: 180000000
+do_train: true
+enable_liger_kernel: true
+finetuning_type: freeze
+flash_attn: auto
+freeze_trainable_layers: 2
+freeze_trainable_modules: all
+gradient_accumulation_steps: 8
+include_num_input_tokens_seen: true
+learning_rate: 5.0e-05
+logging_steps: 1
+lr_scheduler_type: cosine
+max_grad_norm: 1.0
+max_samples: 50000000
+model_name_or_path: infly/OpenCoder-8B-Instruct
+neat_packing: true
+num_train_epochs: 1.0
+output_dir: saves/OpenCoder-8B-Instruct/freeze/opencoder-nlx-330k
+packing: true
+per_device_train_batch_size: 16
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+rope_scaling: llama3
+save_steps: 500
+stage: sft
+template: opencoder
+trust_remote_code: true
+use_apollo: true
+use_llama_pro: true
+warmup_steps: 0

training_loss.png ADDED Viewed