JingzeShi committed
Commit ba181b7 · verified · 1 Parent(s): a9efcf2

Update configuration_doge.py

Files changed (1)
  1. configuration_doge.py +52 -38
configuration_doge.py CHANGED
@@ -5,10 +5,9 @@
 # modular_doge.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # coding=utf-8
-# Copyright 2024 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on the Wonderful Matrices paper implementation.
-# The Doge family of small language models is trained by Jingze Shi.
+# The Doge family of small language models is trained by SmallDoge Team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,22 +27,20 @@ from transformers.modeling_rope_utils import rope_config_validation
 class DogeConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
-    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-20M](https://huggingface.co/SmallDoge/Doge-20M).
+    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32768):
-            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
+            Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 2048):
             Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
-        hidden_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use bias in the hidden layers.
         hidden_dropout (`float`, *optional*, defaults to 0.0):
             Dropout probability for each sequence transformation and state transformation module.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
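Aside (not part of the diff): a minimal sketch of the defaults documented in the hunk above, assuming `DogeConfig` is importable as in the file's own docstring example further down.

```python
from transformers import DogeConfig

# Sketch only: with no arguments, the documented defaults apply
# (vocab_size=32768, hidden_size=1024, intermediate_size=2048, num_hidden_layers=32).
config = DogeConfig()
print(config.vocab_size, config.hidden_size, config.intermediate_size, config.num_hidden_layers)
```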
@@ -55,14 +52,8 @@ class DogeConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        bos_token_id (`int`, *optional*, defaults to 0):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 1):
-            End of stream token id.
-        pad_token_id (`int`, *optional*, defaults to 2):
-            Padding token id.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
+            Whether the model's input and output word embeddings should be tied.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
             The maximum sequence length that this model might ever be used with.
         rope_theta (`float`, *optional*, defaults to 10000.0):
@@ -109,18 +100,29 @@
             When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
             For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf).
             If it is not specified, will default to `num_attention_heads`.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `None`.
         keep_window_size (`int`, *optional*, defaults to 2048):
             The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
-        dynamic_mask_ratio (`float`, *optional*, defaults to 0.0):
-            The ratio to control the proportion of the dynamic mask filled with the minimum value. For more details checkout [this paper](https://arxiv.org/pdf/2412.11834).
         is_moe (`bool`, *optional*, defaults to `False`):
-            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize. For more details checkout [this paper](https://arxiv.org/pdf/2412.11834).
-        num_experts (`int`, *optional*, defaults to 2048):
-            Number of Experts for the Cross Domain Mixture of Experts.
-        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
+        num_experts (`int`, *optional*, defaults to 16384):
+            Number of routed experts in the model. This is only used when `is_moe=True`.
+        num_experts_per_tok (`int`, *optional*, defaults to 64):
             Number of selected experts to route per-token.
+        norm_topk_prob (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.

     ```python
     >>> from transformers import DogeConfig, DogeModel
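Aside (not part of the diff): a hedged sketch exercising the arguments introduced in the hunk above (`attention_bias`, `mlp_bias`, `sliding_window`, and the MoE/router options), assuming `DogeConfig` is importable as in the docstring example. Values are illustrative, not recommended settings.

```python
from transformers import DogeConfig

# Illustrative values only; the documented defaults above are attention_bias=False,
# mlp_bias=False, sliding_window=None, num_experts=16384, num_experts_per_tok=64.
config = DogeConfig(
    attention_bias=False,        # bias in the q/k/v/o projection layers
    mlp_bias=False,              # bias in gate_proj/up_proj/down_proj
    sliding_window=None,         # optional sliding-window attention size
    keep_window_size=2048,
    is_moe=True,                 # enable the Cross Domain Mixture of Experts
    num_experts=16384,
    num_experts_per_tok=64,
    norm_topk_prob=False,
    output_router_logits=True,   # also surfaces load-balancing and router z-loss terms
    router_aux_loss_coef=0.001,
)
print(config.is_moe, config.num_experts, config.num_experts_per_tok)
```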
@@ -144,12 +146,22 @@ class DogeConfig(PretrainedConfig):
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.dt_proj": "rowwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.feed_forward.gate_proj": "colwise",
-        "layers.*.feed_forward.up_proj": "colwise",
-        "layers.*.feed_forward.down_proj": "rowwise",
-        "layers.*.feed_forward.router_gate": "colwise",
-        "layers.*.feed_forward.down_embed": "rowwise",
-        "layers.*.feed_forward.up_embed": "rowwise",
+        "layers.*.input_layernorm.weight": "sequence_parallel",
+        "layers.*.input_residual.weight": "sequence_parallel",
+        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
+        "layers.*.post_attention_residual.weight": "sequence_parallel",
+        "norm.weight": "sequence_parallel",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.router_gate": "colwise_rep",
+        "layers.*.mlp.down_embed": "rowwise_rep",
+        "layers.*.mlp.up_embed": "rowwise_rep",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
     }

     def __init__(
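Aside on the hunk above: `base_model_tp_plan` maps glob-style module-name patterns to tensor-parallel partition styles, and the new `base_model_pp_plan` names the inputs and outputs of each pipeline stage. The matcher below is a hypothetical illustration of how such patterns resolve against concrete module names, not the resolution logic Transformers uses internally; the plan entries are copied from the diff.

```python
import fnmatch

# A few entries from the updated plan above.
base_model_tp_plan = {
    "layers.*.self_attn.v_proj": "colwise",
    "layers.*.mlp.down_proj": "rowwise",
    "layers.*.mlp.router_gate": "colwise_rep",
    "norm.weight": "sequence_parallel",
}

def partition_style(module_name: str, plan: dict) -> str | None:
    # Return the style of the first pattern that matches the concrete module name.
    for pattern, style in plan.items():
        if fnmatch.fnmatch(module_name, pattern):
            return style
    return None

print(partition_style("layers.3.mlp.down_proj", base_model_tp_plan))     # rowwise
print(partition_style("layers.0.self_attn.v_proj", base_model_tp_plan))  # colwise
```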
@@ -158,27 +170,28 @@ class DogeConfig(PretrainedConfig):
         hidden_size=1024,
         intermediate_size=2048,
         num_hidden_layers=32,
-        hidden_bias=False,
         hidden_dropout=0.0,
         hidden_act="silu",
         initializer_range=0.02,
         rms_norm_eps=1e-06,
         use_cache=True,
-        bos_token_id=0,
-        eos_token_id=1,
-        pad_token_id=2,
         tie_word_embeddings=False,
         max_position_embeddings=2048,
         rope_theta=10000.0,
         rope_scaling=None,
         num_attention_heads=8,
         num_key_value_heads=None,
+        attention_bias=False,
         attention_dropout=0.0,
+        mlp_bias=False,
+        sliding_window=None,
         keep_window_size=2048,
-        dynamic_mask_ratio=0.0,
         is_moe=False,
-        num_experts=2048,
-        num_experts_per_tok=8,
+        num_experts=16384,
+        num_experts_per_tok=64,
+        norm_topk_prob=False,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -186,7 +199,6 @@ class DogeConfig(PretrainedConfig):
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers

-        self.hidden_bias = hidden_bias
         self.hidden_dropout = hidden_dropout
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
@@ -198,12 +210,17 @@ class DogeConfig(PretrainedConfig):
         self.rope_scaling = rope_scaling
         self.num_attention_heads = num_attention_heads
         self.num_key_value_heads = num_key_value_heads
+        self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.sliding_window = sliding_window
         self.keep_window_size = keep_window_size
-        self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
         self.num_experts = num_experts
         self.num_experts_per_tok = num_experts_per_tok
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef

         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, copy it it to 'rope_type'.
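Aside: the context lines above feed into `rope_config_validation`. A hedged sketch of passing a `rope_scaling` dict; the accepted fields depend on the chosen `rope_type`, and, as the comment in the hunk notes, a legacy `type` key is copied to `rope_type` for backward compatibility.

```python
from transformers import DogeConfig

# "linear" scaling requires a "factor"; values here are illustrative.
config = DogeConfig(
    max_position_embeddings=8192,
    rope_theta=10000.0,
    rope_scaling={"rope_type": "linear", "factor": 4.0},
)
print(config.rope_scaling)
```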
@@ -216,9 +233,6 @@ class DogeConfig(PretrainedConfig):
             self.num_key_value_heads = num_attention_heads

         super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            pad_token_id=pad_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
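Aside: with `bos_token_id`, `eos_token_id`, and `pad_token_id` gone from the explicit signature and from the `super().__init__` call, they now travel through `**kwargs` into `PretrainedConfig.__init__`, which still recognizes them. A minimal sketch with illustrative ids:

```python
from transformers import DogeConfig

# The parent PretrainedConfig picks these up from kwargs,
# so callers can keep passing them explicitly.
config = DogeConfig(bos_token_id=0, eos_token_id=1, pad_token_id=2)
print(config.bos_token_id, config.eos_token_id, config.pad_token_id)
```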
 