SystemAdmin123 committed on Feb 2

Commit

4660693

verified ·

1 Parent(s): 13c8326

Training in progress, step 160, checkpoint

Browse files

Files changed (21) hide show

last-checkpoint/added_tokens.json +4 -0
last-checkpoint/config.json +36 -0
last-checkpoint/generation_config.json +5 -0
last-checkpoint/merges.txt +0 -0
last-checkpoint/model.safetensors +3 -0
last-checkpoint/optimizer.pt +3 -0
last-checkpoint/rng_state_0.pth +3 -0
last-checkpoint/rng_state_1.pth +3 -0
last-checkpoint/rng_state_2.pth +3 -0
last-checkpoint/rng_state_3.pth +3 -0
last-checkpoint/rng_state_4.pth +3 -0
last-checkpoint/rng_state_5.pth +3 -0
last-checkpoint/rng_state_6.pth +3 -0
last-checkpoint/rng_state_7.pth +3 -0
last-checkpoint/scheduler.pt +3 -0
last-checkpoint/special_tokens_map.json +30 -0
last-checkpoint/tokenizer.json +0 -0
last-checkpoint/tokenizer_config.json +207 -0
last-checkpoint/trainer_state.json +217 -0
last-checkpoint/training_args.bin +3 -0
last-checkpoint/vocab.json +0 -0

last-checkpoint/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<|im_end|>": 100279,
+  "<|im_start|>": 100278
+}

last-checkpoint/config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "katuni4ka/tiny-random-dbrx",
+  "architectures": [
+    "DbrxForCausalLM"
+  ],
+  "attn_config": {
+    "clip_qkv": 8,
+    "kv_n_heads": 2,
+    "model_type": "",
+    "rope_theta": 500000
+  },
+  "d_model": 8,
+  "emb_pdrop": 0.0,
+  "ffn_config": {
+    "ffn_hidden_size": 8,
+    "model_type": "",
+    "moe_jitter_eps": 0,
+    "moe_loss_weight": 0.05,
+    "moe_num_experts": 16,
+    "moe_top_k": 4
+  },
+  "initializer_range": 0.02,
+  "max_seq_len": 32768,
+  "model_type": "dbrx",
+  "n_heads": 4,
+  "n_layers": 2,
+  "num_key_value_heads": 2,
+  "output_router_logits": false,
+  "resid_pdrop": 0.0,
+  "router_aux_loss_coef": 0.05,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.1",
+  "use_cache": false,
+  "vocab_size": 100280
+}

last-checkpoint/generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "do_sample": true,
+  "transformers_version": "4.48.1"
+}

last-checkpoint/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

last-checkpoint/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e3c84de2c2dd4c70b6daa9dd9a6276698a4a3c5f16ffb3e5c9e12e4bfa9c796
+size 3224728

last-checkpoint/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18b540f097f038f52f8ea589eb6d1c7362509ff28117e7aa32d6c1909c96b585
+size 3328626

last-checkpoint/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13c9ad11f70cb5d30587ede3f71f44d6bd22de048e7dea94626b103172f1a451
+size 15984

last-checkpoint/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4db4ceaf3fc99f5683e1e54a0ce72b34e8a39029b46d2d3a4f3467f7123b4cbc
+size 15984

last-checkpoint/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:794e4e0063bacc6479402dbf5a29a9ee7054a6d28bf769c2f8ef9992c3fb0728
+size 15984

last-checkpoint/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4e8f6076b3b73998e368cd6a3855393db7b4f440f465de797edfe91a65c0702
+size 15984

last-checkpoint/rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bbc8c6e30345a43def1fa8fe35027e62b6748496e4f328814c4fb13285a49364
+size 15984

last-checkpoint/rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19ddf1325b3ee86d34158861780505e1a1100feb6601d24cc30e33f3abf94ace
+size 15984

last-checkpoint/rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b413e4fea4da9a1f1386e7176b0bbb9ac2047d78224fa955d6e2b7332f1869dd
+size 15984

last-checkpoint/rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3dfadee9a2b306408da618c8f99a7ad319a5ceefd44df0c0558c60a1ad8e9d89
+size 15984

last-checkpoint/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f0c7ff54b85ffef1dcbffa5d8d256d7b7e02cf6f2a611b338e53d605c1ee098
+size 1064

last-checkpoint/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

last-checkpoint/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

last-checkpoint/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "100256": {
+      "content": "<||_unused_0_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100257": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100258": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100259": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100260": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100261": {
+      "content": "<||_unused_1_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100262": {
+      "content": "<||_unused_2_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100263": {
+      "content": "<||_unused_3_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100264": {
+      "content": "<||_unused_4_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100265": {
+      "content": "<||_unused_5_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100266": {
+      "content": "<||_unused_6_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100267": {
+      "content": "<||_unused_7_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100268": {
+      "content": "<||_unused_8_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100269": {
+      "content": "<||_unused_9_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100270": {
+      "content": "<||_unused_10_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100271": {
+      "content": "<||_unused_11_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100272": {
+      "content": "<||_unused_12_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100273": {
+      "content": "<||_unused_13_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100274": {
+      "content": "<||_unused_14_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100275": {
+      "content": "<||_unused_15_||>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100276": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100277": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100278": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100279": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|pad|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "use_fast": true
+}

last-checkpoint/trainer_state.json ADDED Viewed

	@@ -0,0 +1,217 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1893491124260355,
+  "eval_steps": 20,
+  "global_step": 160,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.001183431952662722,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3956,
+      "eval_samples_per_second": 442.336,
+      "eval_steps_per_second": 27.683,
+      "step": 1
+    },
+    {
+      "epoch": 0.011834319526627219,
+      "grad_norm": 0.004608154296875,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 11.5,
+      "step": 10
+    },
+    {
+      "epoch": 0.023668639053254437,
+      "grad_norm": 0.0023193359375,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 11.5,
+      "step": 20
+    },
+    {
+      "epoch": 0.023668639053254437,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3351,
+      "eval_samples_per_second": 450.363,
+      "eval_steps_per_second": 28.185,
+      "step": 20
+    },
+    {
+      "epoch": 0.03550295857988166,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 4.8e-05,
+      "loss": 11.5,
+      "step": 30
+    },
+    {
+      "epoch": 0.047337278106508875,
+      "grad_norm": 0.0032501220703125,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 11.5,
+      "step": 40
+    },
+    {
+      "epoch": 0.047337278106508875,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3126,
+      "eval_samples_per_second": 453.421,
+      "eval_steps_per_second": 28.377,
+      "step": 40
+    },
+    {
+      "epoch": 0.05917159763313609,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 8e-05,
+      "loss": 11.5,
+      "step": 50
+    },
+    {
+      "epoch": 0.07100591715976332,
+      "grad_norm": 0.004241943359375,
+      "learning_rate": 9.6e-05,
+      "loss": 11.5,
+      "step": 60
+    },
+    {
+      "epoch": 0.07100591715976332,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3352,
+      "eval_samples_per_second": 450.349,
+      "eval_steps_per_second": 28.184,
+      "step": 60
+    },
+    {
+      "epoch": 0.08284023668639054,
+      "grad_norm": 0.00238037109375,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 11.5,
+      "step": 70
+    },
+    {
+      "epoch": 0.09467455621301775,
+      "grad_norm": 0.006256103515625,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 11.5,
+      "step": 80
+    },
+    {
+      "epoch": 0.09467455621301775,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3585,
+      "eval_samples_per_second": 447.222,
+      "eval_steps_per_second": 27.989,
+      "step": 80
+    },
+    {
+      "epoch": 0.10650887573964497,
+      "grad_norm": 0.003936767578125,
+      "learning_rate": 0.000144,
+      "loss": 11.5,
+      "step": 90
+    },
+    {
+      "epoch": 0.11834319526627218,
+      "grad_norm": 0.0123291015625,
+      "learning_rate": 0.00016,
+      "loss": 11.5,
+      "step": 100
+    },
+    {
+      "epoch": 0.11834319526627218,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3221,
+      "eval_samples_per_second": 452.122,
+      "eval_steps_per_second": 28.295,
+      "step": 100
+    },
+    {
+      "epoch": 0.1301775147928994,
+      "grad_norm": 0.00634765625,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 11.5,
+      "step": 110
+    },
+    {
+      "epoch": 0.14201183431952663,
+      "grad_norm": 0.005706787109375,
+      "learning_rate": 0.000192,
+      "loss": 11.5,
+      "step": 120
+    },
+    {
+      "epoch": 0.14201183431952663,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.328,
+      "eval_samples_per_second": 451.328,
+      "eval_steps_per_second": 28.246,
+      "step": 120
+    },
+    {
+      "epoch": 0.15384615384615385,
+      "grad_norm": 0.01129150390625,
+      "learning_rate": 0.0001999978128380225,
+      "loss": 11.5,
+      "step": 130
+    },
+    {
+      "epoch": 0.16568047337278108,
+      "grad_norm": 0.0162353515625,
+      "learning_rate": 0.0001999803161162393,
+      "loss": 11.5,
+      "step": 140
+    },
+    {
+      "epoch": 0.16568047337278108,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.3864,
+      "eval_samples_per_second": 443.539,
+      "eval_steps_per_second": 27.758,
+      "step": 140
+    },
+    {
+      "epoch": 0.17751479289940827,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.00019994532573409262,
+      "loss": 11.5,
+      "step": 150
+    },
+    {
+      "epoch": 0.1893491124260355,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.00019989284781388617,
+      "loss": 11.5,
+      "step": 160
+    },
+    {
+      "epoch": 0.1893491124260355,
+      "eval_loss": 11.5,
+      "eval_runtime": 3.35,
+      "eval_samples_per_second": 448.364,
+      "eval_steps_per_second": 28.06,
+      "step": 160
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 40,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 25470836408320.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

last-checkpoint/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4193abd9d3c8f06a0b72e6601d6500a541caf61561918ea14e1ccfe5bac411cf
+size 8312

last-checkpoint/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff