Upload folder using huggingface_hub
Browse files- README.md +63 -3
- config.json +7 -5
- mergekit_config.yml +36 -0
- model-00001-of-00031.safetensors +2 -2
- model-00002-of-00031.safetensors +2 -2
- model-00003-of-00031.safetensors +2 -2
- model-00004-of-00031.safetensors +2 -2
- model-00005-of-00031.safetensors +2 -2
- model-00006-of-00031.safetensors +2 -2
- model-00007-of-00031.safetensors +2 -2
- model-00008-of-00031.safetensors +1 -1
- model-00009-of-00031.safetensors +2 -2
- model-00010-of-00031.safetensors +2 -2
- model-00011-of-00031.safetensors +2 -2
- model-00012-of-00031.safetensors +1 -1
- model-00013-of-00031.safetensors +2 -2
- model-00014-of-00031.safetensors +2 -2
- model-00015-of-00031.safetensors +2 -2
- model-00016-of-00031.safetensors +1 -1
- model-00017-of-00031.safetensors +2 -2
- model-00018-of-00031.safetensors +2 -2
- model-00019-of-00031.safetensors +2 -2
- model-00020-of-00031.safetensors +1 -1
- model-00021-of-00031.safetensors +2 -2
- model-00022-of-00031.safetensors +2 -2
- model-00023-of-00031.safetensors +2 -2
- model-00024-of-00031.safetensors +1 -1
- model-00025-of-00031.safetensors +2 -2
- model-00026-of-00031.safetensors +2 -2
- model-00027-of-00031.safetensors +2 -2
- model-00028-of-00031.safetensors +1 -1
- model-00029-of-00031.safetensors +2 -2
- model-00030-of-00031.safetensors +2 -2
- model-00031-of-00031.safetensors +2 -2
- model.safetensors.index.json +0 -0
- nobias.py +55 -0
- phase1.py +138 -0
- phase2.yaml +36 -0
- prepare_donor.py +67 -0
- prepare_donor_v2.py +76 -0
- prepare_donor_v3.py +92 -0
- prepare_donor_v5.py +74 -0
- prepare_donor_v6.py +80 -0
README.md
CHANGED
@@ -1,5 +1,65 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
|
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: []
|
3 |
+
library_name: transformers
|
4 |
+
tags:
|
5 |
+
- mergekit
|
6 |
+
- merge
|
7 |
|
8 |
+
---
|
9 |
+
# Qwen3-72B-Instruct
|
10 |
|
11 |
+
This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
|
12 |
+
|
13 |
+
## Merge Details
|
14 |
+
### Merge Method
|
15 |
+
|
16 |
+
This model was merged using the [Linear](https://arxiv.org/abs/2203.05482) merge method using ./Qwen3-32B-Upscaled as a base.
|
17 |
+
|
18 |
+
### Models Merged
|
19 |
+
|
20 |
+
The following models were included in the merge:
|
21 |
+
* ./Qwen2.5-72B-Instruct-Aligned
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
The following YAML configuration was used to produce this model:
|
26 |
+
|
27 |
+
```yaml
|
28 |
+
merge_method: linear
|
29 |
+
|
30 |
+
base_model: ./Qwen3-32B-Upscaled
|
31 |
+
dtype: bfloat16
|
32 |
+
slices:
|
33 |
+
- merge_method: linear
|
34 |
+
sources:
|
35 |
+
- model: ./Qwen3-32B-Upscaled
|
36 |
+
layer_range: [0, 32]
|
37 |
+
parameters:
|
38 |
+
weight: 0.5
|
39 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
40 |
+
layer_range: [0, 32]
|
41 |
+
parameters:
|
42 |
+
weight: 0.5
|
43 |
+
- merge_method: linear
|
44 |
+
sources:
|
45 |
+
- model: ./Qwen3-32B-Upscaled
|
46 |
+
layer_range: [32, 48]
|
47 |
+
parameters:
|
48 |
+
weight: 0.0
|
49 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
50 |
+
layer_range: [32, 48]
|
51 |
+
parameters:
|
52 |
+
weight: 1.0
|
53 |
+
- merge_method: linear
|
54 |
+
sources:
|
55 |
+
- model: ./Qwen3-32B-Upscaled
|
56 |
+
layer_range: [32, 64]
|
57 |
+
parameters:
|
58 |
+
weight: 0.5
|
59 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
60 |
+
layer_range: [48, 80]
|
61 |
+
parameters:
|
62 |
+
weight: 0.5
|
63 |
+
tokenizer_source: ./Qwen3-32B-Upscaled
|
64 |
+
|
65 |
+
```
|
config.json
CHANGED
@@ -4,13 +4,15 @@
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
|
|
|
|
7 |
"head_dim": 128,
|
8 |
"hidden_act": "silu",
|
9 |
"hidden_size": 8192,
|
10 |
"initializer_range": 0.02,
|
11 |
"intermediate_size": 29568,
|
12 |
-
"max_position_embeddings":
|
13 |
-
"max_window_layers":
|
14 |
"model_type": "qwen3",
|
15 |
"num_attention_heads": 64,
|
16 |
"num_hidden_layers": 80,
|
@@ -18,11 +20,11 @@
|
|
18 |
"rms_norm_eps": 1e-06,
|
19 |
"rope_scaling": null,
|
20 |
"rope_theta": 1000000,
|
21 |
-
"sliding_window":
|
22 |
-
"tie_word_embeddings":
|
23 |
"torch_dtype": "bfloat16",
|
24 |
"transformers_version": "4.51.3",
|
25 |
"use_cache": true,
|
26 |
"use_sliding_window": false,
|
27 |
-
"vocab_size":
|
28 |
}
|
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 151643,
|
8 |
+
"eos_token_id": 151645,
|
9 |
"head_dim": 128,
|
10 |
"hidden_act": "silu",
|
11 |
"hidden_size": 8192,
|
12 |
"initializer_range": 0.02,
|
13 |
"intermediate_size": 29568,
|
14 |
+
"max_position_embeddings": 40960,
|
15 |
+
"max_window_layers": 64,
|
16 |
"model_type": "qwen3",
|
17 |
"num_attention_heads": 64,
|
18 |
"num_hidden_layers": 80,
|
|
|
20 |
"rms_norm_eps": 1e-06,
|
21 |
"rope_scaling": null,
|
22 |
"rope_theta": 1000000,
|
23 |
+
"sliding_window": null,
|
24 |
+
"tie_word_embeddings": false,
|
25 |
"torch_dtype": "bfloat16",
|
26 |
"transformers_version": "4.51.3",
|
27 |
"use_cache": true,
|
28 |
"use_sliding_window": false,
|
29 |
+
"vocab_size": 151669
|
30 |
}
|
mergekit_config.yml
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
merge_method: linear
|
2 |
+
|
3 |
+
base_model: ./Qwen3-32B-Upscaled
|
4 |
+
dtype: bfloat16
|
5 |
+
slices:
|
6 |
+
- merge_method: linear
|
7 |
+
sources:
|
8 |
+
- model: ./Qwen3-32B-Upscaled
|
9 |
+
layer_range: [0, 32]
|
10 |
+
parameters:
|
11 |
+
weight: 0.5
|
12 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
13 |
+
layer_range: [0, 32]
|
14 |
+
parameters:
|
15 |
+
weight: 0.5
|
16 |
+
- merge_method: linear
|
17 |
+
sources:
|
18 |
+
- model: ./Qwen3-32B-Upscaled
|
19 |
+
layer_range: [32, 48]
|
20 |
+
parameters:
|
21 |
+
weight: 0.0
|
22 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
23 |
+
layer_range: [32, 48]
|
24 |
+
parameters:
|
25 |
+
weight: 1.0
|
26 |
+
- merge_method: linear
|
27 |
+
sources:
|
28 |
+
- model: ./Qwen3-32B-Upscaled
|
29 |
+
layer_range: [32, 64]
|
30 |
+
parameters:
|
31 |
+
weight: 0.5
|
32 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
33 |
+
layer_range: [48, 80]
|
34 |
+
parameters:
|
35 |
+
weight: 0.5
|
36 |
+
tokenizer_source: ./Qwen3-32B-Upscaled
|
model-00001-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35fbeb0f8b3cc326f8c6e7f4f1747c84381c709ea16de9e7e67fb1136fd57baa
|
3 |
+
size 4969906520
|
model-00002-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b85466d7bc425ed4e9295b56d5c11a9f42750c9ba40b68a11e923397303518b2
|
3 |
+
size 4980822456
|
model-00003-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a05b876c8eba435b5fe793df62beba4048c806a5dbfb74225adb98542e8da9c
|
3 |
+
size 4764815928
|
model-00004-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d8b129f358836e1449ef24ef2d1a0d05cc368ff20650246d3328f5d3517b024
|
3 |
+
size 4781610128
|
model-00005-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57a2d8351dba42247ffe685803eddba72fa392732a21afb5a81d88a21e901207
|
3 |
+
size 4781610136
|
model-00006-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4fc7d5d413012b176cf9884cf32889ecb7e358995d6b8353176ff7adf625c9fe
|
3 |
+
size 4980822464
|
model-00007-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99d7fc43aa7b55c000c5df01672bce4e05eba43e6f9a80cad47fef843663ef46
|
3 |
+
size 4764815928
|
model-00008-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6ef65e8e5f342e8747cad3e210c936469b381f1fb3215dea036ac4a083be857d
|
3 |
size 4781610128
|
model-00009-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4989ca0f424300e14cf500c4e1f22d83e99778b250ef6e70165a9f1d86d6cb8c
|
3 |
+
size 4781610136
|
model-00010-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3faf79baba0d399730c353d423776e1033687a8eccde788c5065f407ed3515a0
|
3 |
+
size 4980822464
|
model-00011-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53105aa13fa33194ca4c95b1ba4d12d05a47018d5e44aafa292571bba166ade9
|
3 |
+
size 4764815928
|
model-00012-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ffe444539746c03a2a4ad5bf8d2d3eb67a3f46309e6789e2fa654ec0f0f8436
|
3 |
size 4781610128
|
model-00013-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:480be5925f312b1704aeec6d09328360f50a5cb495ee8a8231677f0a76d20a22
|
3 |
+
size 4781610136
|
model-00014-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7cc5b212b3a9fae6e8548e22807366fe41d8c5a3f04278e1f12bc2043c9eb63
|
3 |
+
size 4980822464
|
model-00015-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6cc80add3c83efda7d085964764fd22d5099bc51368c8833440e04ea429ca7d
|
3 |
+
size 4764815928
|
model-00016-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee8fbe0b58d99f6654e80f11d69f8239c60e8e4ee760b4601fa5d2692ec5f46f
|
3 |
size 4781610128
|
model-00017-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c163c04228a439b75e0b469b803079d43a108ad8bb81e35506765c5b252175f
|
3 |
+
size 4781610136
|
model-00018-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b61add16c1758adefb8191307e55469f0cf1158b3c3f8841f96698a6b94b777b
|
3 |
+
size 4980822464
|
model-00019-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c386662afbb04595217e683f31a62be86a8c52680c592e81d0f8cb5be7b503f6
|
3 |
+
size 4764815928
|
model-00020-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f79268be94b6db9da1ec22a1eb230c0894f2ae7dba79860af5a4795f2fa5e4a7
|
3 |
size 4781610128
|
model-00021-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a5312b78c7d05fbebd78dc5f7a10d7639fe3729f38f8a97de5b07706db19bd2
|
3 |
+
size 4781610136
|
model-00022-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3193ab90c1d8350b500e83bd2e7948c29a68dcbeb54e12bb50ca5c610ebbaa62
|
3 |
+
size 4980822464
|
model-00023-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac46a9298e7f5a0578217c6eccff85fa354965506a22908c6c8aa50cd38cd8e1
|
3 |
+
size 4764815928
|
model-00024-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e37ff960808f0c0753925e5d6cece3cbeeca5aebec1cd110e1c134eae63dc665
|
3 |
size 4781610128
|
model-00025-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4b20bee0cbc6f893d75c6beb4b82411361150025050047e0f9f09ddb446aada
|
3 |
+
size 4781610136
|
model-00026-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:467f58dafdd9fa9dc3c51be48e6f097714c0d87866cc6b85a2d7ce5c820178a8
|
3 |
+
size 4980822464
|
model-00027-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:791e4dec823afbb3fbc4228704df1c15ed89c91b5e82e2526578ac9144293e74
|
3 |
+
size 4764815928
|
model-00028-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6216bab7883e6bd27a487443253f8bcec6de9c46c73f65101a1c1b8ac541ee85
|
3 |
size 4781610128
|
model-00029-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0481518f9a06b33a06cac2cea65b01afda9c7393278713754d1bcd6b7f017c2e
|
3 |
+
size 4781610136
|
model-00030-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd8d12ed2fe337c6963b5c5f3229fb3b3b9a8c90701185c645649fd96dd6442f
|
3 |
+
size 4980822456
|
model-00031-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:289f1c97a620d890d159d0cd21c44db07345bb47a0df6b35ee1e9f4159deb536
|
3 |
+
size 285229888
|
model.safetensors.index.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
nobias.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# remove_biases.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
7 |
+
|
8 |
+
def main(source_model_id, output_path):
|
9 |
+
"""
|
10 |
+
Loads a model, removes all tensors ending in '.bias', and saves the result.
|
11 |
+
"""
|
12 |
+
print(f"Loading source donor model: {source_model_id}")
|
13 |
+
# Load on CPU to avoid using VRAM
|
14 |
+
model = AutoModelForCausalLM.from_pretrained(
|
15 |
+
source_model_id,
|
16 |
+
torch_dtype=torch.bfloat16,
|
17 |
+
device_map="cpu",
|
18 |
+
trust_remote_code=True
|
19 |
+
)
|
20 |
+
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
|
21 |
+
|
22 |
+
source_state_dict = model.state_dict()
|
23 |
+
new_state_dict = {}
|
24 |
+
|
25 |
+
print("Removing all '.bias' tensors...")
|
26 |
+
removed_count = 0
|
27 |
+
for name, tensor in tqdm(source_state_dict.items(), desc="Processing Tensors"):
|
28 |
+
if name.endswith(".bias"):
|
29 |
+
removed_count += 1
|
30 |
+
continue # Skip this tensor
|
31 |
+
new_state_dict[name] = tensor
|
32 |
+
|
33 |
+
print(f"Removed {removed_count} bias tensors.")
|
34 |
+
|
35 |
+
# We don't need to create a new model from config, as the architecture is
|
36 |
+
# a subset of the original. We can load the new state dict with strict=False.
|
37 |
+
print("Loading the no-bias state dict back into the model...")
|
38 |
+
model.load_state_dict(new_state_dict, strict=False)
|
39 |
+
|
40 |
+
print(f"Saving the no-bias model and tokenizer to: {output_path}")
|
41 |
+
os.makedirs(output_path, exist_ok=True)
|
42 |
+
model.save_pretrained(output_path)
|
43 |
+
tokenizer.save_pretrained(output_path)
|
44 |
+
|
45 |
+
print("\nPhase 1b (No-Bias Donor Creation) Complete!")
|
46 |
+
print(f"The no-bias donor is ready at '{output_path}'.")
|
47 |
+
|
48 |
+
if __name__ == "__main__":
|
49 |
+
parser = argparse.ArgumentParser(description="Remove bias tensors from a model.")
|
50 |
+
parser.add_argument("--source_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The Hugging Face model ID of the source model.")
|
51 |
+
parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the no-bias model.")
|
52 |
+
args = parser.parse_args()
|
53 |
+
|
54 |
+
# Example: python remove_biases.py --output_path ./Qwen2.5-72B-Instruct-NoBias
|
55 |
+
main(args.source_model, args.output_path)
|
phase1.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import argparse
|
4 |
+
from tqdm import tqdm
|
5 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
6 |
+
from accelerate import init_empty_weights # <-- IMPORT THE FIX
|
7 |
+
|
8 |
+
# --- Configuration ---
|
9 |
+
SRC_HIDDEN_SIZE = 5120
|
10 |
+
SRC_INTERMEDIATE_SIZE = 25600
|
11 |
+
TGT_HIDDEN_SIZE = 8192
|
12 |
+
TGT_INTERMEDIATE_SIZE = 29568
|
13 |
+
|
14 |
+
DELTA_HIDDEN = TGT_HIDDEN_SIZE - SRC_HIDDEN_SIZE
|
15 |
+
DELTA_INTERMEDIATE = TGT_INTERMEDIATE_SIZE - SRC_INTERMEDIATE_SIZE
|
16 |
+
|
17 |
+
# --- Interpolation Functions ---
|
18 |
+
def linear_interpolation(block1, block2, weight=0.5):
|
19 |
+
return (1 - weight) * block1 + weight * block2
|
20 |
+
|
21 |
+
def upscale_tensor(tensor: torch.Tensor, name: str) -> torch.Tensor:
|
22 |
+
# This function is correct from the previous version, simplified for brevity
|
23 |
+
if tensor.ndim == 1:
|
24 |
+
if tensor.shape[0] == SRC_HIDDEN_SIZE:
|
25 |
+
block1, block2 = tensor[:DELTA_HIDDEN], tensor[-DELTA_HIDDEN:]
|
26 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=0)
|
27 |
+
elif tensor.ndim == 2:
|
28 |
+
if "embed_tokens" in name or "lm_head" in name:
|
29 |
+
if tensor.shape[1] == SRC_HIDDEN_SIZE:
|
30 |
+
block1, block2 = tensor[:, :DELTA_HIDDEN], tensor[:, -DELTA_HIDDEN:]
|
31 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=1)
|
32 |
+
elif "self_attn" in name:
|
33 |
+
if "q_proj.weight" in name or "k_proj.weight" in name or "v_proj.weight" in name:
|
34 |
+
block1, block2 = tensor[:, :DELTA_HIDDEN], tensor[:, -DELTA_HIDDEN:]
|
35 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=1)
|
36 |
+
elif "o_proj.weight" in name:
|
37 |
+
block1, block2 = tensor[:DELTA_HIDDEN, :], tensor[-DELTA_HIDDEN:, :]
|
38 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=0)
|
39 |
+
elif "mlp" in name:
|
40 |
+
if "gate_proj.weight" in name or "up_proj.weight" in name:
|
41 |
+
row_block1, row_block2 = tensor[:DELTA_INTERMEDIATE, :], tensor[-DELTA_INTERMEDIATE:, :]
|
42 |
+
upscaled_rows = torch.cat([tensor, linear_interpolation(row_block1, row_block2)], dim=0)
|
43 |
+
col_block1, col_block2 = upscaled_rows[:, :DELTA_HIDDEN], upscaled_rows[:, -DELTA_HIDDEN:]
|
44 |
+
return torch.cat([upscaled_rows, linear_interpolation(col_block1, col_block2)], dim=1)
|
45 |
+
elif "down_proj.weight" in name:
|
46 |
+
row_block1, row_block2 = tensor[:DELTA_HIDDEN, :], tensor[-DELTA_HIDDEN:, :]
|
47 |
+
upscaled_rows = torch.cat([tensor, linear_interpolation(row_block1, row_block2)], dim=0)
|
48 |
+
col_block1, col_block2 = upscaled_rows[:, :DELTA_INTERMEDIATE], upscaled_rows[:, -DELTA_INTERMEDIATE:]
|
49 |
+
return torch.cat([upscaled_rows, linear_interpolation(col_block1, col_block2)], dim=1)
|
50 |
+
return tensor
|
51 |
+
|
52 |
+
def run_test_inference(model_path, prompt):
|
53 |
+
print("\n" + "="*50)
|
54 |
+
print("Running test inference...")
|
55 |
+
print("="*50)
|
56 |
+
|
57 |
+
# Load the newly saved model to ensure it works from disk
|
58 |
+
print(f"Loading model from disk: {model_path}")
|
59 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
|
60 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
61 |
+
|
62 |
+
print(f"Prompt: \"{prompt}\"")
|
63 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
64 |
+
print(f"Using device: {device}")
|
65 |
+
model.to(device)
|
66 |
+
|
67 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(device)
|
68 |
+
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
|
69 |
+
|
70 |
+
result_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
|
71 |
+
print("\n--- Generated Text ---")
|
72 |
+
print(result_text)
|
73 |
+
print("----------------------")
|
74 |
+
print("\nTest inference complete.")
|
75 |
+
|
76 |
+
|
77 |
+
def main(source_model_id, output_path):
|
78 |
+
print(f"Loading source model: {source_model_id}")
|
79 |
+
# Load on CPU to save VRAM
|
80 |
+
source_model = AutoModelForCausalLM.from_pretrained(
|
81 |
+
source_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
|
82 |
+
)
|
83 |
+
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
|
84 |
+
|
85 |
+
source_state_dict = source_model.state_dict()
|
86 |
+
new_state_dict = {}
|
87 |
+
|
88 |
+
print("Up-scaling tensors using self-interpolation...")
|
89 |
+
for name, tensor in tqdm(source_state_dict.items(), desc="Processing Tensors"):
|
90 |
+
new_state_dict[name] = upscale_tensor(tensor.clone(), name)
|
91 |
+
|
92 |
+
# Free up memory from the source model before creating the new one
|
93 |
+
del source_model
|
94 |
+
del source_state_dict
|
95 |
+
|
96 |
+
print("\nCreating new model with target architecture...")
|
97 |
+
config = AutoConfig.from_pretrained(source_model_id, trust_remote_code=True)
|
98 |
+
config.hidden_size = TGT_HIDDEN_SIZE
|
99 |
+
config.intermediate_size = TGT_INTERMEDIATE_SIZE
|
100 |
+
config.torch_dtype = torch.bfloat16 # Ensure config specifies the correct dtype
|
101 |
+
|
102 |
+
# --- THIS IS THE FIX ---
|
103 |
+
# Initialize a 'skeleton' model without allocating memory. This is instant.
|
104 |
+
print("Step 1: Initializing empty 'skeleton' model on meta device (should be instantaneous)...")
|
105 |
+
with init_empty_weights():
|
106 |
+
new_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
|
107 |
+
print("Empty model created successfully.")
|
108 |
+
# ---------------------
|
109 |
+
|
110 |
+
# The model is now on the 'meta' device. We need to tie the weights to the CPU.
|
111 |
+
new_model.tie_weights()
|
112 |
+
|
113 |
+
print("\nStep 2: Loading up-scaled weights into the new model (this may take time and RAM)...")
|
114 |
+
# This step will materialize the weights on the CPU, consuming memory.
|
115 |
+
new_model.load_state_dict(new_state_dict, assign=True)
|
116 |
+
print("State dict loaded successfully.")
|
117 |
+
|
118 |
+
print(f"\nStep 3: Saving up-scaled model and tokenizer to: {output_path}")
|
119 |
+
os.makedirs(output_path, exist_ok=True)
|
120 |
+
new_model.save_pretrained(output_path)
|
121 |
+
tokenizer.save_pretrained(output_path)
|
122 |
+
|
123 |
+
print("\nPhase 1 (Self-Interpolation) Complete!")
|
124 |
+
print(f"The up-scaled model is ready at '{output_path}' for use with MergeKit.")
|
125 |
+
|
126 |
+
# --- Run Test Inference ---
|
127 |
+
# We pass the path instead of the model object to test loading from disk
|
128 |
+
run_test_inference(output_path, "The rain in Maine ")
|
129 |
+
|
130 |
+
if __name__ == "__main__":
|
131 |
+
parser = argparse.ArgumentParser(description="Upscale Qwen3-32B to hypothetical Qwen3-72B dimensions via self-interpolation.")
|
132 |
+
parser.add_argument("--source_model", type=str, default="Qwen/Qwen3-32B", help="The Hugging Face model ID of the source model.")
|
133 |
+
parser.add_argument(
|
134 |
+
"--output_path", type=str, default="./Qwen3-32B-Upscaled",
|
135 |
+
help="The local directory path to save the up-scaled model. (default: ./Qwen3-32B-Upscaled)"
|
136 |
+
)
|
137 |
+
args = parser.parse_args()
|
138 |
+
main(args.source_model, args.output_path)
|
phase2.yaml
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
merge_method: linear
|
2 |
+
|
3 |
+
base_model: ./Qwen3-32B-Upscaled
|
4 |
+
dtype: bfloat16
|
5 |
+
slices:
|
6 |
+
- merge_method: linear
|
7 |
+
sources:
|
8 |
+
- model: ./Qwen3-32B-Upscaled
|
9 |
+
layer_range: [0, 32]
|
10 |
+
parameters:
|
11 |
+
weight: 0.5
|
12 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
13 |
+
layer_range: [0, 32]
|
14 |
+
parameters:
|
15 |
+
weight: 0.5
|
16 |
+
- merge_method: linear
|
17 |
+
sources:
|
18 |
+
- model: ./Qwen3-32B-Upscaled
|
19 |
+
layer_range: [32, 48]
|
20 |
+
parameters:
|
21 |
+
weight: 0.0
|
22 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
23 |
+
layer_range: [32, 48]
|
24 |
+
parameters:
|
25 |
+
weight: 1.0
|
26 |
+
- merge_method: linear
|
27 |
+
sources:
|
28 |
+
- model: ./Qwen3-32B-Upscaled
|
29 |
+
layer_range: [32, 64]
|
30 |
+
parameters:
|
31 |
+
weight: 0.5
|
32 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
33 |
+
layer_range: [48, 80]
|
34 |
+
parameters:
|
35 |
+
weight: 0.5
|
36 |
+
tokenizer_source: ./Qwen3-32B-Upscaled
|
prepare_donor.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
|
8 |
+
def main(source_model_id, output_path):
|
9 |
+
"""
|
10 |
+
Loads a Qwen2.5 model, removes all '.bias' tensors, adds placeholder
|
11 |
+
'q_norm.weight' and 'k_norm.weight' tensors, and saves the result.
|
12 |
+
This creates an architecturally compatible donor for a Qwen3 merge.
|
13 |
+
"""
|
14 |
+
print(f"Loading source donor model: {source_model_id}")
|
15 |
+
# Load on CPU to save VRAM
|
16 |
+
model = AutoModelForCausalLM.from_pretrained(
|
17 |
+
source_model_id,
|
18 |
+
torch_dtype=torch.bfloat16,
|
19 |
+
device_map="cpu",
|
20 |
+
trust_remote_code=True
|
21 |
+
)
|
22 |
+
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
|
23 |
+
config = model.config
|
24 |
+
|
25 |
+
source_state_dict = model.state_dict()
|
26 |
+
new_state_dict = {}
|
27 |
+
|
28 |
+
# --- Part 1: Remove '.bias' tensors ---
|
29 |
+
print("Removing all '.bias' tensors...")
|
30 |
+
for name, tensor in tqdm(source_state_dict.items(), desc="Filtering Tensors"):
|
31 |
+
if not name.endswith(".bias"):
|
32 |
+
new_state_dict[name] = tensor
|
33 |
+
|
34 |
+
# --- Part 2: Add placeholder 'q_norm' and 'k_norm' tensors ---
|
35 |
+
print("Adding placeholder 'q_norm' and 'k_norm' tensors...")
|
36 |
+
# These norms are 1D vectors of size `head_dim` (128)
|
37 |
+
# A value of 1.0 is a standard, neutral initialization for a norm weight.
|
38 |
+
norm_dim = config.hidden_size // config.num_attention_heads # Should be 128 for this model
|
39 |
+
placeholder_norm = torch.ones(norm_dim, dtype=torch.bfloat16)
|
40 |
+
|
41 |
+
for i in tqdm(range(config.num_hidden_layers), desc="Adding Norm Tensors"):
|
42 |
+
q_norm_name = f"model.layers.{i}.self_attn.q_norm.weight"
|
43 |
+
k_norm_name = f"model.layers.{i}.self_attn.k_norm.weight"
|
44 |
+
new_state_dict[q_norm_name] = placeholder_norm.clone()
|
45 |
+
new_state_dict[k_norm_name] = placeholder_norm.clone()
|
46 |
+
|
47 |
+
# The original model is a fine container, we just need to load the modified state dict.
|
48 |
+
# strict=False is crucial because we have removed and added keys.
|
49 |
+
print("Loading the new state dict back into the model shell...")
|
50 |
+
model.load_state_dict(new_state_dict, strict=False, assign=True)
|
51 |
+
|
52 |
+
print(f"Saving the architecturally aligned model to: {output_path}")
|
53 |
+
os.makedirs(output_path, exist_ok=True)
|
54 |
+
model.save_pretrained(output_path)
|
55 |
+
tokenizer.save_pretrained(output_path)
|
56 |
+
|
57 |
+
print("\nDonor preparation complete!")
|
58 |
+
print(f"The aligned donor is ready at '{output_path}'.")
|
59 |
+
|
60 |
+
if __name__ == "__main__":
|
61 |
+
parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
|
62 |
+
parser.add_argument("--source_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The Hugging Face model ID of the source model.")
|
63 |
+
parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
|
64 |
+
args = parser.parse_args()
|
65 |
+
|
66 |
+
# Example: python prepare_donor.py --output_path ./Qwen2.5-72B-Instruct-Aligned
|
67 |
+
main(args.source_model, args.output_path)
|
prepare_donor_v2.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v2.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Create an 'Aligned' donor: a Qwen3-shaped shell filled with Qwen2.5 weights.

    1. Defines a target Qwen3 80-layer architecture from the foundation config.
    2. Creates an empty (meta-device) model 'shell' with this pure Qwen3 architecture.
    3. Fills the shell with the donor's weights, materializing real placeholder
       tensors for every parameter the donor does not provide.

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights (the Qwen2.5 donor).
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    # Load the CONFIG from the Qwen3 foundation to get the correct blueprint.
    foundation_config = AutoConfig.from_pretrained(
        foundation_model_id, trust_remote_code=True
    )

    # Reshape the config to the target 72B geometry with 80 layers.
    foundation_config.num_hidden_layers = 80
    foundation_config.hidden_size = 8192
    foundation_config.intermediate_size = 29568
    foundation_config.torch_dtype = torch.bfloat16

    # Create an empty 'shell' of the final model. This is instant and
    # memory-efficient; its config.json will be a pure Qwen3 config.
    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(
            foundation_config, trust_remote_code=True
        )
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    # Load the donor model on CPU to get its state dict.
    donor_model = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    donor_state_dict = donor_model.state_dict()
    # NOTE: the state-dict tensors keep the underlying storages alive, so this
    # only releases the module wrapper, not the weight memory itself.
    del donor_model

    # BUG FIX: previously this relied on load_state_dict(strict=False) to skip
    # shell-only parameters (q_norm/k_norm). Because the shell was created
    # under init_empty_weights(), those parameters stayed on the meta device
    # (no data) and save_pretrained() would fail. Materialize an explicit
    # placeholder — ones, the neutral RMSNorm scale — for every parameter the
    # donor lacks.
    print("Materializing placeholders for parameters missing from the donor...")
    for name, target_tensor in aligned_model.state_dict().items():
        if name not in donor_state_dict:
            donor_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    # strict=False still tolerates donor-only keys (e.g. '.bias' tensors the
    # Qwen3 shell doesn't have); assign=True swaps the meta tensors for real ones.
    # NOTE(review): if the donor and foundation vocab sizes differ,
    # load_state_dict will still raise a size-mismatch error — see
    # prepare_donor_v3.py for the truncation handling.
    print("Loading donor state_dict into the Qwen3 shell (strict=False)...")
    aligned_model.load_state_dict(donor_state_dict, strict=False, assign=True)

    # Tie weights only after real tensors are in place (tying meta tensors
    # before loading created shared data-less parameters).
    aligned_model.tie_weights()

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
69 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|
prepare_donor_v3.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v3.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Create the definitive 'Aligned' donor model, handling all architectural mismatches.

    1. Defines a target Qwen3 80-layer architecture.
    2. Creates an empty Qwen3 model 'shell' (on the meta device).
    3. Manually copies weights from the Qwen2.5 donor, truncating
       vocabulary-related tensors to fit the Qwen3 architecture and
       materializing real placeholders for shell-only tensors.

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights.
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

    # Target architecture: 80 layers, 72B dimensions, and Qwen3's vocab size.
    target_config = foundation_config
    target_config.num_hidden_layers = 80
    target_config.hidden_size = 8192
    target_config.intermediate_size = 29568
    target_config.vocab_size = 151936  # Explicitly set Qwen3 vocab size
    target_config.torch_dtype = torch.bfloat16

    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_model = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    donor_state_dict = donor_model.state_dict()
    del donor_model

    # Get the state dict of our target shell to know the correct names/shapes.
    target_state_dict = aligned_model.state_dict()
    new_state_dict = {}

    print("Copying and aligning tensors one-by-one...")
    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
        if name in donor_state_dict:
            donor_tensor = donor_state_dict[name]
            # If shapes match, copy directly.
            if donor_tensor.shape == target_tensor.shape:
                new_state_dict[name] = donor_tensor.clone()
            # If shapes mismatch, handle the known vocabulary size difference.
            else:
                print(f" - Resolving shape mismatch for {name}:")
                print(f"   Donor shape: {donor_tensor.shape}, Target shape: {target_tensor.shape}")
                # The known mismatch is on the vocab dimension (dim 0): truncate
                # the donor tensor to the target length. Slicing only dim 0 also
                # works for 1-D tensors, unlike the 2-D-only '[:v, :]' form.
                vocab_dim = target_tensor.shape[0]
                new_state_dict[name] = donor_tensor[:vocab_dim].clone()
        else:
            # BUG FIX: the shell was built under init_empty_weights(), so
            # target_tensor lives on the meta device and .clone() would yield
            # another data-less meta tensor; save_pretrained() would then fail.
            # Materialize a real placeholder instead — ones is the neutral
            # scale for the missing q_norm/k_norm RMSNorm weights.
            print(f" - Materializing placeholder tensor for {name} (not in donor)")
            new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    print("Loading the fully aligned state_dict into the Qwen3 shell...")
    # This load succeeds because every tensor now has the correct shape and data.
    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

    # Tie weights only after all unique tensors are loaded (tying the meta
    # shell up front created shared, data-less parameters).
    aligned_model.tie_weights()

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
85 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|
prepare_donor_v5.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v5.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Build an 80-layer Qwen3-architecture shell and fill it with Qwen2.5 donor
    weights, truncating vocab-dimension mismatches and materializing real
    placeholders for shell-only tensors (q_norm/k_norm).

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights.
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

    target_config = foundation_config
    target_config.num_hidden_layers = 80
    target_config.hidden_size = 8192
    target_config.intermediate_size = 29568
    target_config.vocab_size = 151936
    target_config.torch_dtype = torch.bfloat16

    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
    # Don't tie weights yet — tying the meta shell would create shared,
    # data-less parameters.
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_state_dict = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    ).state_dict()

    target_state_dict = aligned_model.state_dict()
    new_state_dict = {}

    print("Copying and aligning tensors one-by-one...")
    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
        if name in donor_state_dict:
            donor_tensor = donor_state_dict[name]
            if donor_tensor.shape == target_tensor.shape:
                new_state_dict[name] = donor_tensor.clone()
            else:
                # Known vocab-size mismatch: truncate along dim 0. Slicing only
                # dim 0 also handles 1-D tensors, unlike the 2-D-only '[:v, :]'.
                vocab_dim = target_tensor.shape[0]
                new_state_dict[name] = donor_tensor[:vocab_dim].clone()
        else:
            # BUG FIX: target_tensor is a meta tensor (shell built under
            # init_empty_weights), so .clone() only produces another data-less
            # meta tensor and save_pretrained() would fail. Create a real,
            # unique placeholder instead (ones == neutral RMSNorm scale).
            new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    print("Loading the fully aligned state_dict into the Qwen3 shell...")
    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

    # Tie the weights *after* loading the unique tensors.
    aligned_model.tie_weights()

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
67 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|
prepare_donor_v6.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v6.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Create the definitive 'Aligned' donor model by manually handling all
    architectural mismatches.

    This version MANUALLY INSTANTIATES new tensors for shell-only parameters,
    so no data-less meta tensors (from init_empty_weights) survive into the
    saved checkpoint.

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights.
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

    target_config = foundation_config
    target_config.num_hidden_layers = 80
    target_config.hidden_size = 8192
    target_config.intermediate_size = 29568
    target_config.vocab_size = 151936
    target_config.torch_dtype = torch.bfloat16

    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_state_dict = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    ).state_dict()

    target_state_dict = aligned_model.state_dict()
    new_state_dict = {}

    print("Copying and aligning tensors one-by-one...")
    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
        if name in donor_state_dict:
            # This logic is for tensors that exist in the donor.
            donor_tensor = donor_state_dict[name]
            if donor_tensor.shape == target_tensor.shape:
                new_state_dict[name] = donor_tensor.clone()
            else:  # Vocab mismatch case
                # Truncate along dim 0 (the vocab dimension). Slicing only
                # dim 0 also handles 1-D tensors, unlike the 2-D-only '[:v, :]'.
                vocab_dim = target_tensor.shape[0]
                new_state_dict[name] = donor_tensor[:vocab_dim].clone()
        else:
            # Tensors NOT in the donor (q_norm, k_norm): do not reuse the
            # shared meta `target_tensor` — create a new, unique tensor of the
            # correct shape and value (ones == neutral RMSNorm scale) to defeat
            # memory-sharing optimizations.
            new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    print("Loading the fully aligned state_dict into the Qwen3 shell...")
    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

    # Tie weights *after* all unique tensors are loaded.
    aligned_model.tie_weights()

    # Every kept tensor was cloned above, so the donor copy can be released
    # before serialization to reduce peak memory.
    del donor_state_dict, target_state_dict

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
|
73 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|