nuojohnchen committed
Commit 53edf96 · verified · 1 Parent(s): 57a6f5c

Delete configuration_upcycling_qwen2_moe.py

Files changed (1)
  1. configuration_upcycling_qwen2_moe.py +0 -180
configuration_upcycling_qwen2_moe.py DELETED
@@ -1,180 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Qwen2MoE model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
- import torch
-
-
- logger = logging.get_logger(__name__)
-
-
- class Qwen2Config(PretrainedConfig):
-     def __init__(
-         self,
-         vocab_size=151936,
-         hidden_size=4096,
-         intermediate_size=22016,
-         num_hidden_layers=32,
-         num_attention_heads=32,
-         num_key_value_heads=32,
-         hidden_act="silu",
-         max_position_embeddings=32768,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         tie_word_embeddings=False,
-         rope_theta=10000.0,
-         use_sliding_window=False,
-         sliding_window=4096,
-         max_window_layers=28,
-         attention_dropout=0.0,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.use_sliding_window = use_sliding_window
-         self.sliding_window = sliding_window
-         self.max_window_layers = max_window_layers
-
-         # for backward compatibility
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.attention_dropout = attention_dropout
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-
- class Qwen2MoeConfig(PretrainedConfig):
-
-     model_type = "qwen2_moe"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     def __init__(
-         self,
-         vocab_size=151936,
-         hidden_size=2048,
-         intermediate_size=5632,
-         num_hidden_layers=24,
-         num_attention_heads=16,
-         num_key_value_heads=16,
-         hidden_act="silu",
-         max_position_embeddings=32768,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         tie_word_embeddings=False,
-         rope_theta=10000.0,
-         use_sliding_window=False,
-         sliding_window=4096,
-         max_window_layers=28,
-         attention_dropout=0.0,
-
-         decoder_sparse_step=1,
-         moe_intermediate_size=1408,
-         shared_expert_intermediate_size=5632,
-         num_experts_per_tok=4,
-         num_experts=60,
-         norm_topk_prob=False,
-         output_router_logits=False,
-         router_aux_loss_coef=0.001,
-         mlp_only_layers=None,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.use_sliding_window = use_sliding_window
-         self.sliding_window = sliding_window
-         self.max_window_layers = max_window_layers
-
-         self.num_key_value_heads = num_key_value_heads
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.attention_dropout = attention_dropout
-
-         # MoE arguments
-         self.decoder_sparse_step = decoder_sparse_step
-         self.moe_intermediate_size = moe_intermediate_size
-         self.shared_expert_intermediate_size = shared_expert_intermediate_size
-         self.num_experts_per_tok = num_experts_per_tok
-         self.num_experts = num_experts
-         self.norm_topk_prob = norm_topk_prob
-         self.output_router_logits = output_router_logits
-         self.router_aux_loss_coef = router_aux_loss_coef
-         self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-
- class UpcyclingQwen2MoeConfig(Qwen2Config):
-     model_type = "upcycling-qwen2-moe"
-     # upcycled from Qwen2-1_5B
-     def __init__(
-         self,
-         decoder_sparse_step=1,
-         num_experts_per_tok=2,
-         num_experts=7,
-         norm_topk_prob=False,
-         output_router_logits=False,
-         router_aux_loss_coef=0.000,
-         mlp_only_layers=None,  # MoE only in the last 2 layers (recomputed below)
-         share_flag=False,
-         attn_init_change=False,
-         language_gate=True,
-         **kwargs
-     ):
-         super().__init__(**kwargs)
-         # MoE arguments
-         self.decoder_sparse_step = decoder_sparse_step
-         self.moe_intermediate_size = self.intermediate_size
-         self.shared_expert_intermediate_size = self.intermediate_size
-         self.norm_topk_prob = norm_topk_prob
-         self.output_router_logits = output_router_logits
-         self.router_aux_loss_coef = router_aux_loss_coef
-         # self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
-         self.mlp_only_layers = torch.arange(self.num_hidden_layers).tolist()[:-2]
-         self.share_flag = share_flag
-         self.num_experts_per_tok = num_experts_per_tok
-         self.num_experts = num_experts
-         self.attn_init_change = attn_init_change
-         self.language_gate = language_gate
-
-
-
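For reference, the removed UpcyclingQwen2MoeConfig derived its MoE fields from the inherited Qwen2Config values rather than taking them all as constructor arguments. The sketch below is illustrative only: it assumes a local copy of the deleted module (the import path is hypothetical now that the file is gone from this repo) and simply restates the defaults visible in the diff above.

from configuration_upcycling_qwen2_moe import UpcyclingQwen2MoeConfig

# Instantiating with no arguments picks up the Qwen2Config defaults shown above
# (num_hidden_layers=32, intermediate_size=22016).
config = UpcyclingQwen2MoeConfig()

# Expert FFN widths are tied to the inherited dense intermediate_size.
assert config.moe_intermediate_size == config.intermediate_size
assert config.shared_expert_intermediate_size == config.intermediate_size

# mlp_only_layers is recomputed in __init__, overriding the constructor argument:
# every layer except the last two keeps a dense MLP, so only the final two
# decoder layers are sparse MoE layers.
assert config.mlp_only_layers == list(range(config.num_hidden_layers - 2))

print(config.model_type, config.num_experts, config.num_experts_per_tok)
# upcycling-qwen2-moe 7 2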