iSolver-AI committed on
Commit
a507972
·
verified ·
1 Parent(s): 22de4f1

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +63 -53
config.json CHANGED
@@ -1,66 +1,76 @@
1
  {
2
  "architectures": [
3
- "DeepseekV32ForCausalLM"
4
  ],
5
- "attention_bias": false,
6
  "attention_dropout": 0.0,
7
- "bos_token_id": 0,
8
- "eos_token_id": 1,
9
- "ep_size": 1,
10
- "first_k_dense_replace": 3,
11
  "hidden_act": "silu",
12
- "hidden_size": 7168,
13
- "index_head_dim": 128,
14
- "index_n_heads": 64,
15
- "index_topk": 2048,
16
  "initializer_range": 0.02,
17
- "intermediate_size": 18432,
18
- "kv_lora_rank": 512,
19
- "max_position_embeddings": 163840,
20
- "model_type": "deepseek_v32",
21
- "moe_intermediate_size": 2048,
22
- "moe_layer_freq": 1,
23
- "n_group": 8,
24
- "n_routed_experts": 256,
25
- "n_shared_experts": 1,
26
- "norm_topk_prob": true,
27
- "num_attention_heads": 128,
28
- "num_experts_per_tok": 8,
29
- "num_hidden_layers": 61,
30
- "num_key_value_heads": 128,
31
- "num_nextn_predict_layers": 1,
32
- "q_lora_rank": 1536,
33
- "qk_nope_head_dim": 128,
34
- "qk_rope_head_dim": 64,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  "quantization_config": {
36
- "activation_scheme": "dynamic",
37
- "fmt": "e4m3",
38
- "quant_method": "fp16",
39
- "scale_fmt": "ue8m0",
40
- "weight_block_size": [
41
- 128,
42
- 128
43
- ]
44
  },
45
- "rms_norm_eps": 1e-06,
46
  "rope_scaling": {
47
- "beta_fast": 32,
48
- "beta_slow": 1,
49
- "factor": 40,
50
- "mscale": 1.0,
51
- "mscale_all_dim": 1.0,
52
  "original_max_position_embeddings": 4096,
53
- "type": "yarn"
 
54
  },
55
- "rope_theta": 10000,
56
- "routed_scaling_factor": 2.5,
57
- "scoring_func": "sigmoid",
 
58
  "tie_word_embeddings": false,
59
- "topk_group": 4,
60
- "topk_method": "noaux_tc",
61
- "torch_dtype": "bfloat16",
62
- "transformers_version": "4.44.2",
63
  "use_cache": true,
64
- "v_head_dim": 128,
65
- "vocab_size": 129280
66
- }
 
1
  {
2
  "architectures": [
3
+ "GptOssForCausalLM"
4
  ],
5
+ "attention_bias": true,
6
  "attention_dropout": 0.0,
7
+ "eos_token_id": 200002,
8
+ "experts_per_token": 4,
9
+ "head_dim": 64,
 
10
  "hidden_act": "silu",
11
+ "hidden_size": 2880,
12
+ "initial_context_length": 4096,
 
 
13
  "initializer_range": 0.02,
14
+ "intermediate_size": 2880,
15
+ "layer_types": [
16
+ "sliding_attention",
17
+ "full_attention",
18
+ "sliding_attention",
19
+ "full_attention",
20
+ "sliding_attention",
21
+ "full_attention",
22
+ "sliding_attention",
23
+ "full_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "full_attention",
28
+ "sliding_attention",
29
+ "full_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "full_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "full_attention"
40
+ ],
41
+ "max_position_embeddings": 131072,
42
+ "model_type": "gpt_oss",
43
+ "num_attention_heads": 64,
44
+ "num_experts_per_tok": 4,
45
+ "num_hidden_layers": 24,
46
+ "num_key_value_heads": 8,
47
+ "num_local_experts": 32,
48
+ "output_router_logits": false,
49
+ "pad_token_id": 199999,
50
  "quantization_config": {
51
+ "modules_to_not_convert": [
52
+ "model.layers.*.self_attn",
53
+ "model.layers.*.mlp.router",
54
+ "model.embed_tokens",
55
+ "lm_head"
56
+ ],
57
+ "quant_method": "mxfp4"
 
58
  },
59
+ "rms_norm_eps": 1e-05,
60
  "rope_scaling": {
61
+ "beta_fast": 32.0,
62
+ "beta_slow": 1.0,
63
+ "factor": 32.0,
 
 
64
  "original_max_position_embeddings": 4096,
65
+ "rope_type": "yarn",
66
+ "truncate": false
67
  },
68
+ "rope_theta": 150000,
69
+ "router_aux_loss_coef": 0.9,
70
+ "sliding_window": 128,
71
+ "swiglu_limit": 7.0,
72
  "tie_word_embeddings": false,
73
+ "transformers_version": "4.55.0.dev0",
 
 
 
74
  "use_cache": true,
75
+ "vocab_size": 201088
76
+ }