ehartford committed on
Commit 7e1725c · verified · 1 Parent(s): ea3f081

Upload folder using huggingface_hub

Files changed (43)
  1. README.md +63 -3
  2. config.json +7 -5
  3. mergekit_config.yml +36 -0
  4. model-00001-of-00031.safetensors +2 -2
  5. model-00002-of-00031.safetensors +2 -2
  6. model-00003-of-00031.safetensors +2 -2
  7. model-00004-of-00031.safetensors +2 -2
  8. model-00005-of-00031.safetensors +2 -2
  9. model-00006-of-00031.safetensors +2 -2
  10. model-00007-of-00031.safetensors +2 -2
  11. model-00008-of-00031.safetensors +1 -1
  12. model-00009-of-00031.safetensors +2 -2
  13. model-00010-of-00031.safetensors +2 -2
  14. model-00011-of-00031.safetensors +2 -2
  15. model-00012-of-00031.safetensors +1 -1
  16. model-00013-of-00031.safetensors +2 -2
  17. model-00014-of-00031.safetensors +2 -2
  18. model-00015-of-00031.safetensors +2 -2
  19. model-00016-of-00031.safetensors +1 -1
  20. model-00017-of-00031.safetensors +2 -2
  21. model-00018-of-00031.safetensors +2 -2
  22. model-00019-of-00031.safetensors +2 -2
  23. model-00020-of-00031.safetensors +1 -1
  24. model-00021-of-00031.safetensors +2 -2
  25. model-00022-of-00031.safetensors +2 -2
  26. model-00023-of-00031.safetensors +2 -2
  27. model-00024-of-00031.safetensors +1 -1
  28. model-00025-of-00031.safetensors +2 -2
  29. model-00026-of-00031.safetensors +2 -2
  30. model-00027-of-00031.safetensors +2 -2
  31. model-00028-of-00031.safetensors +1 -1
  32. model-00029-of-00031.safetensors +2 -2
  33. model-00030-of-00031.safetensors +2 -2
  34. model-00031-of-00031.safetensors +2 -2
  35. model.safetensors.index.json +0 -0
  36. nobias.py +55 -0
  37. phase1.py +138 -0
  38. phase2.yaml +36 -0
  39. prepare_donor.py +67 -0
  40. prepare_donor_v2.py +76 -0
  41. prepare_donor_v3.py +92 -0
  42. prepare_donor_v5.py +74 -0
  43. prepare_donor_v6.py +80 -0
README.md CHANGED
@@ -1,5 +1,65 @@
- dont get excited
-
- its not working yet
-
- still working on it
+ ---
+ base_model: []
+ library_name: transformers
+ tags:
+ - mergekit
+ - merge
+
+ ---
+ # Qwen3-72B-Instruct
+
+ This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
+
+ ## Merge Details
+ ### Merge Method
+
+ This model was merged using the [Linear](https://arxiv.org/abs/2203.05482) merge method, with ./Qwen3-32B-Upscaled as the base.
+
+ ### Models Merged
+
+ The following models were included in the merge:
+ * ./Qwen2.5-72B-Instruct-Aligned
+
+ ### Configuration
+
+ The following YAML configuration was used to produce this model:
+
+ ```yaml
+ merge_method: linear
+
+ base_model: ./Qwen3-32B-Upscaled
+ dtype: bfloat16
+ slices:
+ - merge_method: linear
+   sources:
+   - model: ./Qwen3-32B-Upscaled
+     layer_range: [0, 32]
+     parameters:
+       weight: 0.5
+   - model: ./Qwen2.5-72B-Instruct-Aligned
+     layer_range: [0, 32]
+     parameters:
+       weight: 0.5
+ - merge_method: linear
+   sources:
+   - model: ./Qwen3-32B-Upscaled
+     layer_range: [32, 48]
+     parameters:
+       weight: 0.0
+   - model: ./Qwen2.5-72B-Instruct-Aligned
+     layer_range: [32, 48]
+     parameters:
+       weight: 1.0
+ - merge_method: linear
+   sources:
+   - model: ./Qwen3-32B-Upscaled
+     layer_range: [32, 64]
+     parameters:
+       weight: 0.5
+   - model: ./Qwen2.5-72B-Instruct-Aligned
+     layer_range: [48, 80]
+     parameters:
+       weight: 0.5
+ tokenizer_source: ./Qwen3-32B-Upscaled
+
+ ```
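For anyone reproducing this merge, the config above (also committed as phase2.yaml / mergekit_config.yml) can be executed with mergekit. A minimal sketch of mergekit's documented Python entry point follows; the output path is illustrative, and the two input model directories are assumed to have been produced by the scripts in this repo:

```python
# Sketch: run the merge config above with mergekit (pip install mergekit).
# Equivalent CLI: mergekit-yaml phase2.yaml ./Qwen3-72B-Instruct
import yaml

from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

with open("phase2.yaml", "r", encoding="utf-8") as f:
    merge_config = MergeConfiguration.model_validate(yaml.safe_load(f))

run_merge(
    merge_config,
    out_path="./Qwen3-72B-Instruct",  # illustrative output directory
    options=MergeOptions(copy_tokenizer=True, lazy_unpickle=True),
)
```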
config.json CHANGED
@@ -4,13 +4,15 @@
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
   "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 8192,
   "initializer_range": 0.02,
   "intermediate_size": 29568,
-  "max_position_embeddings": 32768,
-  "max_window_layers": 70,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 64,
   "model_type": "qwen3",
   "num_attention_heads": 64,
   "num_hidden_layers": 80,
@@ -18,11 +20,11 @@
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000,
-  "sliding_window": 131072,
-  "tie_word_embeddings": true,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.51.3",
   "use_cache": true,
   "use_sliding_window": false,
-  "vocab_size": 151936
+  "vocab_size": 151669
   }
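A quick way to sanity-check the new config values after cloning this repo; a minimal sketch (local path assumed):

```python
# Sketch: confirm the config fields changed in this commit.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./Qwen3-72B-Instruct")  # local clone of this repo
assert cfg.model_type == "qwen3"
assert cfg.vocab_size == 151669           # was 151936 before this commit
assert cfg.tie_word_embeddings is False   # embeddings untied in this commit
print(cfg.max_position_embeddings)        # 40960
```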
mergekit_config.yml ADDED
@@ -0,0 +1,36 @@
+ merge_method: linear
+
+ base_model: ./Qwen3-32B-Upscaled
+ dtype: bfloat16
+ slices:
+ - merge_method: linear
+   sources:
+   - model: ./Qwen3-32B-Upscaled
+     layer_range: [0, 32]
+     parameters:
+       weight: 0.5
+   - model: ./Qwen2.5-72B-Instruct-Aligned
+     layer_range: [0, 32]
+     parameters:
+       weight: 0.5
+ - merge_method: linear
+   sources:
+   - model: ./Qwen3-32B-Upscaled
+     layer_range: [32, 48]
+     parameters:
+       weight: 0.0
+   - model: ./Qwen2.5-72B-Instruct-Aligned
+     layer_range: [32, 48]
+     parameters:
+       weight: 1.0
+ - merge_method: linear
+   sources:
+   - model: ./Qwen3-32B-Upscaled
+     layer_range: [32, 64]
+     parameters:
+       weight: 0.5
+   - model: ./Qwen2.5-72B-Instruct-Aligned
+     layer_range: [48, 80]
+     parameters:
+       weight: 0.5
+ tokenizer_source: ./Qwen3-32B-Upscaled
model-00001-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6ebe119ba44e78c9050dbab69114c8f4834181422f92407f335be99b3bfed912
- size 4546661424
+ oid sha256:35fbeb0f8b3cc326f8c6e7f4f1747c84381c709ea16de9e7e67fb1136fd57baa
+ size 4969906520
model-00002-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4ddddaf2fe9205022e521f60e4cfa6d23c7dbd2e4c3b1f013922eee3b4402a3f
- size 4964061232
+ oid sha256:b85466d7bc425ed4e9295b56d5c11a9f42750c9ba40b68a11e923397303518b2
+ size 4980822456
model-00003-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0bc9ec6b8cc2f15b94365c6824047bcac3538e70c814fb3c74f314ed49dc2095
- size 4781577096
+ oid sha256:3a05b876c8eba435b5fe793df62beba4048c806a5dbfb74225adb98542e8da9c
+ size 4764815928
model-00004-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5e0088c1b4df8a82118246d2d5d7847551e3990513b25ecabcea9a944cb1516d
- size 4781610096
+ oid sha256:0d8b129f358836e1449ef24ef2d1a0d05cc368ff20650246d3328f5d3517b024
+ size 4781610128
model-00005-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db62306ab03ca84de2789e5c1db2f954ce1a6b13784940305a593d7c0203a598
- size 4781610128
+ oid sha256:57a2d8351dba42247ffe685803eddba72fa392732a21afb5a81d88a21e901207
+ size 4781610136
model-00006-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:720b31a361a12aa6fea6449fb90f312b4156b97c7ac63afcdb39c21584a39867
- size 4964061264
+ oid sha256:4fc7d5d413012b176cf9884cf32889ecb7e358995d6b8353176ff7adf625c9fe
+ size 4980822464
model-00007-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:97545eafa4a41817faea3a9471e73c5be245ea29e10cae7fd11736d749c2e285
- size 4781577128
+ oid sha256:99d7fc43aa7b55c000c5df01672bce4e05eba43e6f9a80cad47fef843663ef46
+ size 4764815928
model-00008-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1a6d96fa787d0fb1eba7e84af0ae6f78301552b4676c79f010ba59c8c9667d30
+ oid sha256:6ef65e8e5f342e8747cad3e210c936469b381f1fb3215dea036ac4a083be857d
  size 4781610128
model-00009-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:502e80229011cefa3fe391e2b8b500656fae9692aa841846759ee258565c7508
- size 4781610128
+ oid sha256:4989ca0f424300e14cf500c4e1f22d83e99778b250ef6e70165a9f1d86d6cb8c
+ size 4781610136
model-00010-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:753bca5be6b1f6e0042b5f42a33caff3936ad4d435b89bb9894635f65629c208
- size 4964061264
+ oid sha256:3faf79baba0d399730c353d423776e1033687a8eccde788c5065f407ed3515a0
+ size 4980822464
model-00011-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f428d399a792b9c179d8845bef256441696ed64a8c0e20880f4d62f4faf2515b
- size 4781577128
+ oid sha256:53105aa13fa33194ca4c95b1ba4d12d05a47018d5e44aafa292571bba166ade9
+ size 4764815928
model-00012-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:78197a393251db752c94812c92f781c338ead23cce2a971b79a5e18139c95196
+ oid sha256:8ffe444539746c03a2a4ad5bf8d2d3eb67a3f46309e6789e2fa654ec0f0f8436
  size 4781610128
model-00013-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:86330a1824a777c2105e6fe19c81b11e17e63446726f2c138b34dc1de5999493
- size 4781610128
+ oid sha256:480be5925f312b1704aeec6d09328360f50a5cb495ee8a8231677f0a76d20a22
+ size 4781610136
model-00014-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3362dc363af862d300f443002d16e29b3a84ac794e1c711f86e2fe25ca973d69
- size 4964061264
+ oid sha256:a7cc5b212b3a9fae6e8548e22807366fe41d8c5a3f04278e1f12bc2043c9eb63
+ size 4980822464
model-00015-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d3e4bd9b0f0b87fd3d8c71dcbf357aaf7a656d9d9c851a1e2d93f9542567a957
- size 4781577128
+ oid sha256:b6cc80add3c83efda7d085964764fd22d5099bc51368c8833440e04ea429ca7d
+ size 4764815928
model-00016-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8b446fa6f25cdb304c18d060653a75638b2030dae18e6b705c9365beb260a271
+ oid sha256:ee8fbe0b58d99f6654e80f11d69f8239c60e8e4ee760b4601fa5d2692ec5f46f
  size 4781610128
model-00017-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:281a9dcbb30b975a56b26d5d37fbfaf82a6d57fad6be5362c26d8a5f9759572e
- size 4781610128
+ oid sha256:2c163c04228a439b75e0b469b803079d43a108ad8bb81e35506765c5b252175f
+ size 4781610136
model-00018-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5c79648f7404a3ec396dc0f3a318b9910ae97b64735a53505a2232be960fef15
- size 4964061264
+ oid sha256:b61add16c1758adefb8191307e55469f0cf1158b3c3f8841f96698a6b94b777b
+ size 4980822464
model-00019-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4141b7e5b83b531706dc7850ba99ea92ede5e45835b9b82f911c39a88bee4c30
- size 4781577128
+ oid sha256:c386662afbb04595217e683f31a62be86a8c52680c592e81d0f8cb5be7b503f6
+ size 4764815928
model-00020-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:549c338b690fac9f20dbd51166e9d7ea4637c949f79ea213217668a9bbfa34ab
+ oid sha256:f79268be94b6db9da1ec22a1eb230c0894f2ae7dba79860af5a4795f2fa5e4a7
  size 4781610128
model-00021-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9fe307829e356fc6ecead37cdad243cf3fdd832f92219e1c2d77053b4b975d26
- size 4781610128
+ oid sha256:1a5312b78c7d05fbebd78dc5f7a10d7639fe3729f38f8a97de5b07706db19bd2
+ size 4781610136
model-00022-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:33ba2a5a0633406992b0931fa97848ac04b0e744110b21e75da9422cd4e51ec8
- size 4964061264
+ oid sha256:3193ab90c1d8350b500e83bd2e7948c29a68dcbeb54e12bb50ca5c610ebbaa62
+ size 4980822464
model-00023-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c338036e51aac0f2aa8810ffab9392dbba87e9d5d56b1ac641e2379cb1046b97
- size 4781577128
+ oid sha256:ac46a9298e7f5a0578217c6eccff85fa354965506a22908c6c8aa50cd38cd8e1
+ size 4764815928
model-00024-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d68a1e351fbc3074f58b0bb6014f485a76796487400aae5b6540b5c24daf326a
+ oid sha256:e37ff960808f0c0753925e5d6cece3cbeeca5aebec1cd110e1c134eae63dc665
  size 4781610128
model-00025-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c50e4f2b171237132b0349359974f26d9c6f3d03361046f5c260d22162ae2ac7
- size 4781610128
+ oid sha256:c4b20bee0cbc6f893d75c6beb4b82411361150025050047e0f9f09ddb446aada
+ size 4781610136
model-00026-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc179221ee6a2038c1def8fd8bc521bd5b8a8f42fae45c18d84fcab6075a976e
- size 4964061264
+ oid sha256:467f58dafdd9fa9dc3c51be48e6f097714c0d87866cc6b85a2d7ce5c820178a8
+ size 4980822464
model-00027-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0fcd6b1e73f0311bc965e0c2be822891a59c4ddc9979cb25edfcbb6753045e4b
- size 4781577128
+ oid sha256:791e4dec823afbb3fbc4228704df1c15ed89c91b5e82e2526578ac9144293e74
+ size 4764815928
model-00028-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1fcfb6e484c433364cb232a8755a6b37c83b8cd6e9a29f35af8686895cd349ba
+ oid sha256:6216bab7883e6bd27a487443253f8bcec6de9c46c73f65101a1c1b8ac541ee85
  size 4781610128
model-00029-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e947e91e2494dd122a5cac3a3dbe4c5ac4bcb82ddd36e8cc99c3c69e95891e2f
- size 4781610128
+ oid sha256:0481518f9a06b33a06cac2cea65b01afda9c7393278713754d1bcd6b7f017c2e
+ size 4781610136
model-00030-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0e4db59843cff2e9ec0b2b70a5ec1a3e2ccb9c9380d24e0e0bbb9592a437b067
- size 3208726960
+ oid sha256:dd8d12ed2fe337c6963b5c5f3229fb3b3b9a8c90701185c645649fd96dd6442f
+ size 4980822456
model-00031-of-00031.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5ac450e7a68b764065b7dde03e3e7e54d64a5fb972567881ff144ce4a7c64220
- size 2489319552
+ oid sha256:289f1c97a620d890d159d0cd21c44db07345bb47a0df6b35ee1e9f4159deb536
+ size 285229888
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
nobias.py ADDED
@@ -0,0 +1,55 @@
+ # remove_biases.py
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ def main(source_model_id, output_path):
+     """
+     Loads a model, removes all tensors ending in '.bias', and saves the result.
+     """
+     print(f"Loading source donor model: {source_model_id}")
+     # Load on CPU to avoid using VRAM
+     model = AutoModelForCausalLM.from_pretrained(
+         source_model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="cpu",
+         trust_remote_code=True
+     )
+     tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
+
+     source_state_dict = model.state_dict()
+     new_state_dict = {}
+
+     print("Removing all '.bias' tensors...")
+     removed_count = 0
+     for name, tensor in tqdm(source_state_dict.items(), desc="Processing Tensors"):
+         if name.endswith(".bias"):
+             removed_count += 1
+             continue  # Skip this tensor
+         new_state_dict[name] = tensor
+
+     print(f"Removed {removed_count} bias tensors.")
+
+     # We don't need to create a new model from config, as the architecture is
+     # a subset of the original. We can load the new state dict with strict=False.
+     print("Loading the no-bias state dict back into the model...")
+     model.load_state_dict(new_state_dict, strict=False)
+
+     print(f"Saving the no-bias model and tokenizer to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nPhase 1b (No-Bias Donor Creation) Complete!")
+     print(f"The no-bias donor is ready at '{output_path}'.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Remove bias tensors from a model.")
+     parser.add_argument("--source_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The Hugging Face model ID of the source model.")
+     parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the no-bias model.")
+     args = parser.parse_args()
+
+     # Example: python remove_biases.py --output_path ./Qwen2.5-72B-Instruct-NoBias
+     main(args.source_model, args.output_path)
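One caveat worth checking: `load_state_dict(strict=False)` updates matching parameters but does not delete the module's own bias parameters, so it pays to scan the saved shards afterwards. A sketch, using the example output path from the comment above:

```python
# Sketch: report any '.bias' tensors that survived in the saved checkpoint.
import glob

from safetensors import safe_open

for shard in glob.glob("./Qwen2.5-72B-Instruct-NoBias/*.safetensors"):
    with safe_open(shard, framework="pt") as f:
        leftover = [k for k in f.keys() if k.endswith(".bias")]
        if leftover:
            print(f"{shard}: {len(leftover)} bias tensors remain")
```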
phase1.py ADDED
@@ -0,0 +1,138 @@
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from accelerate import init_empty_weights  # <-- IMPORT THE FIX
+
+ # --- Configuration ---
+ SRC_HIDDEN_SIZE = 5120
+ SRC_INTERMEDIATE_SIZE = 25600
+ TGT_HIDDEN_SIZE = 8192
+ TGT_INTERMEDIATE_SIZE = 29568
+
+ DELTA_HIDDEN = TGT_HIDDEN_SIZE - SRC_HIDDEN_SIZE
+ DELTA_INTERMEDIATE = TGT_INTERMEDIATE_SIZE - SRC_INTERMEDIATE_SIZE
+
+ # --- Interpolation Functions ---
+ def linear_interpolation(block1, block2, weight=0.5):
+     return (1 - weight) * block1 + weight * block2
+
+ def upscale_tensor(tensor: torch.Tensor, name: str) -> torch.Tensor:
+     # This function is correct from the previous version, simplified for brevity
+     if tensor.ndim == 1:
+         if tensor.shape[0] == SRC_HIDDEN_SIZE:
+             block1, block2 = tensor[:DELTA_HIDDEN], tensor[-DELTA_HIDDEN:]
+             return torch.cat([tensor, linear_interpolation(block1, block2)], dim=0)
+     elif tensor.ndim == 2:
+         if "embed_tokens" in name or "lm_head" in name:
+             if tensor.shape[1] == SRC_HIDDEN_SIZE:
+                 block1, block2 = tensor[:, :DELTA_HIDDEN], tensor[:, -DELTA_HIDDEN:]
+                 return torch.cat([tensor, linear_interpolation(block1, block2)], dim=1)
+         elif "self_attn" in name:
+             if "q_proj.weight" in name or "k_proj.weight" in name or "v_proj.weight" in name:
+                 block1, block2 = tensor[:, :DELTA_HIDDEN], tensor[:, -DELTA_HIDDEN:]
+                 return torch.cat([tensor, linear_interpolation(block1, block2)], dim=1)
+             elif "o_proj.weight" in name:
+                 block1, block2 = tensor[:DELTA_HIDDEN, :], tensor[-DELTA_HIDDEN:, :]
+                 return torch.cat([tensor, linear_interpolation(block1, block2)], dim=0)
+         elif "mlp" in name:
+             if "gate_proj.weight" in name or "up_proj.weight" in name:
+                 row_block1, row_block2 = tensor[:DELTA_INTERMEDIATE, :], tensor[-DELTA_INTERMEDIATE:, :]
+                 upscaled_rows = torch.cat([tensor, linear_interpolation(row_block1, row_block2)], dim=0)
+                 col_block1, col_block2 = upscaled_rows[:, :DELTA_HIDDEN], upscaled_rows[:, -DELTA_HIDDEN:]
+                 return torch.cat([upscaled_rows, linear_interpolation(col_block1, col_block2)], dim=1)
+             elif "down_proj.weight" in name:
+                 row_block1, row_block2 = tensor[:DELTA_HIDDEN, :], tensor[-DELTA_HIDDEN:, :]
+                 upscaled_rows = torch.cat([tensor, linear_interpolation(row_block1, row_block2)], dim=0)
+                 col_block1, col_block2 = upscaled_rows[:, :DELTA_INTERMEDIATE], upscaled_rows[:, -DELTA_INTERMEDIATE:]
+                 return torch.cat([upscaled_rows, linear_interpolation(col_block1, col_block2)], dim=1)
+     return tensor
+
+ def run_test_inference(model_path, prompt):
+     print("\n" + "="*50)
+     print("Running test inference...")
+     print("="*50)
+
+     # Load the newly saved model to ensure it works from disk
+     print(f"Loading model from disk: {model_path}")
+     model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
+     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+     print(f"Prompt: \"{prompt}\"")
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"Using device: {device}")
+     model.to(device)
+
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+
+     result_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+     print("\n--- Generated Text ---")
+     print(result_text)
+     print("----------------------")
+     print("\nTest inference complete.")
+
+
+ def main(source_model_id, output_path):
+     print(f"Loading source model: {source_model_id}")
+     # Load on CPU to save VRAM
+     source_model = AutoModelForCausalLM.from_pretrained(
+         source_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
+     )
+     tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
+
+     source_state_dict = source_model.state_dict()
+     new_state_dict = {}
+
+     print("Up-scaling tensors using self-interpolation...")
+     for name, tensor in tqdm(source_state_dict.items(), desc="Processing Tensors"):
+         new_state_dict[name] = upscale_tensor(tensor.clone(), name)
+
+     # Free up memory from the source model before creating the new one
+     del source_model
+     del source_state_dict
+
+     print("\nCreating new model with target architecture...")
+     config = AutoConfig.from_pretrained(source_model_id, trust_remote_code=True)
+     config.hidden_size = TGT_HIDDEN_SIZE
+     config.intermediate_size = TGT_INTERMEDIATE_SIZE
+     config.torch_dtype = torch.bfloat16  # Ensure config specifies the correct dtype
+
+     # --- THIS IS THE FIX ---
+     # Initialize a 'skeleton' model without allocating memory. This is instant.
+     print("Step 1: Initializing empty 'skeleton' model on meta device (should be instantaneous)...")
+     with init_empty_weights():
+         new_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+     print("Empty model created successfully.")
+     # ---------------------
+
+     # The model is now on the 'meta' device. We need to tie the weights to the CPU.
+     new_model.tie_weights()
+
+     print("\nStep 2: Loading up-scaled weights into the new model (this may take time and RAM)...")
+     # This step will materialize the weights on the CPU, consuming memory.
+     new_model.load_state_dict(new_state_dict, assign=True)
+     print("State dict loaded successfully.")
+
+     print(f"\nStep 3: Saving up-scaled model and tokenizer to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     new_model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nPhase 1 (Self-Interpolation) Complete!")
+     print(f"The up-scaled model is ready at '{output_path}' for use with MergeKit.")
+
+     # --- Run Test Inference ---
+     # We pass the path instead of the model object to test loading from disk
+     run_test_inference(output_path, "The rain in Maine ")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Upscale Qwen3-32B to hypothetical Qwen3-72B dimensions via self-interpolation.")
+     parser.add_argument("--source_model", type=str, default="Qwen/Qwen3-32B", help="The Hugging Face model ID of the source model.")
+     parser.add_argument(
+         "--output_path", type=str, default="./Qwen3-32B-Upscaled",
+         help="The local directory path to save the up-scaled model. (default: ./Qwen3-32B-Upscaled)"
+     )
+     args = parser.parse_args()
+     main(args.source_model, args.output_path)
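The block-interpolation scheme in `upscale_tensor` is easier to see at toy scale. A sketch with shrunken dimensions (4 → 6 standing in for 5120 → 8192):

```python
# Toy illustration of the self-interpolation above (dimensions shrunk for clarity).
import torch

SRC, TGT = 4, 6
DELTA = TGT - SRC  # 2 new entries to synthesize

v = torch.arange(float(SRC))            # stands in for a hidden-size-5120 vector
block1, block2 = v[:DELTA], v[-DELTA:]  # first and last DELTA entries
upscaled = torch.cat([v, 0.5 * block1 + 0.5 * block2], dim=0)

print(v)         # tensor([0., 1., 2., 3.])
print(upscaled)  # tensor([0., 1., 2., 3., 1., 2.]) -- new tail is the mean of head and tail blocks
```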
phase2.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ merge_method: linear
2
+
3
+ base_model: ./Qwen3-32B-Upscaled
4
+ dtype: bfloat16
5
+ slices:
6
+ - merge_method: linear
7
+ sources:
8
+ - model: ./Qwen3-32B-Upscaled
9
+ layer_range: [0, 32]
10
+ parameters:
11
+ weight: 0.5
12
+ - model: ./Qwen2.5-72B-Instruct-Aligned
13
+ layer_range: [0, 32]
14
+ parameters:
15
+ weight: 0.5
16
+ - merge_method: linear
17
+ sources:
18
+ - model: ./Qwen3-32B-Upscaled
19
+ layer_range: [32, 48]
20
+ parameters:
21
+ weight: 0.0
22
+ - model: ./Qwen2.5-72B-Instruct-Aligned
23
+ layer_range: [32, 48]
24
+ parameters:
25
+ weight: 1.0
26
+ - merge_method: linear
27
+ sources:
28
+ - model: ./Qwen3-32B-Upscaled
29
+ layer_range: [32, 64]
30
+ parameters:
31
+ weight: 0.5
32
+ - model: ./Qwen2.5-72B-Instruct-Aligned
33
+ layer_range: [48, 80]
34
+ parameters:
35
+ weight: 0.5
36
+ tokenizer_source: ./Qwen3-32B-Upscaled
prepare_donor.py ADDED
@@ -0,0 +1,67 @@
+ # prepare_donor.py
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+ def main(source_model_id, output_path):
+     """
+     Loads a Qwen2.5 model, removes all '.bias' tensors, adds placeholder
+     'q_norm.weight' and 'k_norm.weight' tensors, and saves the result.
+     This creates an architecturally compatible donor for a Qwen3 merge.
+     """
+     print(f"Loading source donor model: {source_model_id}")
+     # Load on CPU to save VRAM
+     model = AutoModelForCausalLM.from_pretrained(
+         source_model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="cpu",
+         trust_remote_code=True
+     )
+     tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
+     config = model.config
+
+     source_state_dict = model.state_dict()
+     new_state_dict = {}
+
+     # --- Part 1: Remove '.bias' tensors ---
+     print("Removing all '.bias' tensors...")
+     for name, tensor in tqdm(source_state_dict.items(), desc="Filtering Tensors"):
+         if not name.endswith(".bias"):
+             new_state_dict[name] = tensor
+
+     # --- Part 2: Add placeholder 'q_norm' and 'k_norm' tensors ---
+     print("Adding placeholder 'q_norm' and 'k_norm' tensors...")
+     # These norms are 1D vectors of size `head_dim` (128).
+     # A value of 1.0 is a standard, neutral initialization for a norm weight.
+     norm_dim = config.hidden_size // config.num_attention_heads  # Should be 128 for this model
+     placeholder_norm = torch.ones(norm_dim, dtype=torch.bfloat16)
+
+     for i in tqdm(range(config.num_hidden_layers), desc="Adding Norm Tensors"):
+         q_norm_name = f"model.layers.{i}.self_attn.q_norm.weight"
+         k_norm_name = f"model.layers.{i}.self_attn.k_norm.weight"
+         new_state_dict[q_norm_name] = placeholder_norm.clone()
+         new_state_dict[k_norm_name] = placeholder_norm.clone()
+
+     # The original model is a fine container; we just need to load the modified state dict.
+     # strict=False is crucial because we have removed and added keys.
+     print("Loading the new state dict back into the model shell...")
+     model.load_state_dict(new_state_dict, strict=False, assign=True)
+
+     print(f"Saving the architecturally aligned model to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nDonor preparation complete!")
+     print(f"The aligned donor is ready at '{output_path}'.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
+     parser.add_argument("--source_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The Hugging Face model ID of the source model.")
+     parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
+     args = parser.parse_args()
+
+     # Example: python prepare_donor.py --output_path ./Qwen2.5-72B-Instruct-Aligned
+     main(args.source_model, args.output_path)
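A sketch for inspecting the resulting donor's tensor names, using the output path from the example comment above; the q_norm key should now be present, and the bias list should be empty if the filtering took effect in the saved shards:

```python
# Sketch: list tensor names in the saved aligned donor (path assumed).
import glob

from safetensors import safe_open

keys = set()
for shard in glob.glob("./Qwen2.5-72B-Instruct-Aligned/*.safetensors"):
    with safe_open(shard, framework="pt") as f:
        keys.update(f.keys())

print("model.layers.0.self_attn.q_norm.weight" in keys)     # expect True
print(sorted(k for k in keys if k.endswith(".bias"))[:5])   # expect [] if biases were dropped
```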
prepare_donor_v2.py ADDED
@@ -0,0 +1,76 @@
+ # prepare_donor_v2.py
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from accelerate import init_empty_weights
+
+ def main(foundation_model_id, donor_model_id, output_path):
+     """
+     Creates a new 'Aligned' donor model.
+     1. Defines a target Qwen3 80-layer architecture using the foundation config.
+     2. Creates an empty model 'shell' with this pure Qwen3 architecture.
+     3. Fills the shell with weights from the Qwen2.5 donor, discarding incompatible tensors.
+     """
+     print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")
+
+     # Load the CONFIG from our Qwen3 foundation to get the correct blueprint
+     foundation_config = AutoConfig.from_pretrained(
+         foundation_model_id, trust_remote_code=True
+     )
+
+     # Modify the config to match the target 72B size and 80 layers
+     foundation_config.num_hidden_layers = 80
+     foundation_config.hidden_size = 8192
+     foundation_config.intermediate_size = 29568
+     foundation_config.torch_dtype = torch.bfloat16
+
+     # Create an empty 'shell' of the final model. This is instant and memory-efficient.
+     # Its config.json will be a pure Qwen3 config.
+     print("Creating empty Qwen3 80-layer model shell...")
+     with init_empty_weights():
+         aligned_model = AutoModelForCausalLM.from_config(
+             foundation_config, trust_remote_code=True
+         )
+     aligned_model.tie_weights()
+     print("Empty shell created successfully.")
+
+     print("\n--- Phase 2: Loading Donor Weights ---")
+     print(f"Loading weights from donor: {donor_model_id}")
+
+     # Load the donor model on CPU to get its state dict
+     donor_model = AutoModelForCausalLM.from_pretrained(
+         donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
+     )
+     donor_state_dict = donor_model.state_dict()
+     del donor_model  # Free memory
+
+     # We will now load the donor weights into our pure Qwen3 shell.
+     # strict=False is essential because the donor has '.bias' tensors that our
+     # shell doesn't, and our shell has '.norm' tensors the donor doesn't.
+     # This will load all matching weights and ignore the rest.
+     print("Loading donor state_dict into the Qwen3 shell (strict=False)...")
+     aligned_model.load_state_dict(donor_state_dict, strict=False, assign=True)
+
+     # The '.norm' weights in the shell will keep their default initialization (1.0),
+     # which is exactly what we want for a neutral placeholder.
+
+     print("\n--- Phase 3: Saving the Aligned Donor ---")
+     tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     print(f"Saving the architecturally aligned model to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     aligned_model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nDonor preparation complete! This is the definitive donor model.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
+     parser.add_argument("--foundation_model", type=str, default="Qwen/Qwen3-32B", help="Model to use for the Qwen3 architecture blueprint.")
+     parser.add_argument("--donor_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The donor model providing the weights.")
+     parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
+     args = parser.parse_args()
+
+     main(args.foundation_model, args.donor_model, args.output_path)
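The shell trick above relies on accelerate's `init_empty_weights`; a minimal illustration of what it does:

```python
# Minimal illustration: modules created under init_empty_weights get parameters on
# the 'meta' device, so an 80-layer shell costs no real memory until weights land.
import torch
from accelerate import init_empty_weights

with init_empty_weights():
    layer = torch.nn.Linear(8192, 29568)

print(layer.weight.device)   # meta
print(layer.weight.numel())  # shape is known, but no storage is allocated
```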
prepare_donor_v3.py ADDED
@@ -0,0 +1,92 @@
+ # prepare_donor_v3.py
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from accelerate import init_empty_weights
+
+ def main(foundation_model_id, donor_model_id, output_path):
+     """
+     Creates the definitive 'Aligned' donor model by manually handling all architectural mismatches.
+     1. Defines a target Qwen3 80-layer architecture.
+     2. Creates an empty Qwen3 model 'shell'.
+     3. Manually copies weights from the Qwen2.5 donor, truncating the vocabulary-related
+        tensors to fit the Qwen3 architecture.
+     """
+     print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")
+
+     foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     # Target architecture: 80 layers, 72B dimensions, and Qwen3's vocab size
+     target_config = foundation_config
+     target_config.num_hidden_layers = 80
+     target_config.hidden_size = 8192
+     target_config.intermediate_size = 29568
+     target_config.vocab_size = 151936  # Explicitly set Qwen3 vocab size
+     target_config.torch_dtype = torch.bfloat16
+
+     print("Creating empty Qwen3 80-layer model shell...")
+     with init_empty_weights():
+         aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
+     aligned_model.tie_weights()
+     print("Empty shell created successfully.")
+
+     print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
+     print(f"Loading weights from donor: {donor_model_id}")
+
+     donor_model = AutoModelForCausalLM.from_pretrained(
+         donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
+     )
+     donor_state_dict = donor_model.state_dict()
+     del donor_model
+
+     # Get the state dict of our target shell to know the correct shapes
+     target_state_dict = aligned_model.state_dict()
+     new_state_dict = {}
+
+     print("Copying and aligning tensors one-by-one...")
+     for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
+         if name in donor_state_dict:
+             donor_tensor = donor_state_dict[name]
+
+             # --- THIS IS THE FIX ---
+             # If shapes match, copy directly.
+             if donor_tensor.shape == target_tensor.shape:
+                 new_state_dict[name] = donor_tensor.clone()
+             # If shapes mismatch, handle the known vocabulary size difference.
+             else:
+                 print(f"  - Resolving shape mismatch for {name}:")
+                 print(f"    Donor shape: {donor_tensor.shape}, Target shape: {target_tensor.shape}")
+                 # We know the mismatch is on the vocab dimension (dim 0).
+                 # Truncate the donor tensor to fit the target shape.
+                 vocab_dim = target_tensor.shape[0]
+                 new_state_dict[name] = donor_tensor[:vocab_dim, :].clone()
+         else:
+             # This handles tensors that are in the Qwen3 shell but not the Qwen2.5 donor
+             # (i.e., q_norm.weight and k_norm.weight). We just keep the initialized value.
+             print(f"  - Keeping initialized tensor for {name} (not in donor)")
+             new_state_dict[name] = target_tensor.clone()
+
+     print("Loading the fully aligned state_dict into the Qwen3 shell...")
+     # This load will now succeed because every tensor has the correct shape.
+     aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)
+
+     print("\n--- Phase 3: Saving the Aligned Donor ---")
+     tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     print(f"Saving the architecturally aligned model to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     aligned_model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nDonor preparation complete! This is the definitive donor model.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
+     parser.add_argument("--foundation_model", type=str, default="Qwen/Qwen3-32B", help="Model to use for the Qwen3 architecture blueprint.")
+     parser.add_argument("--donor_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The donor model providing the weights.")
+     parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
+     args = parser.parse_args()
+
+     main(args.foundation_model, args.donor_model, args.output_path)
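The vocab-dimension truncation is the only shape surgery v3 performs; a toy sketch of the same slice (real case: the donor's larger embedding truncated to the 151936 rows set in target_config):

```python
# Toy version of the vocabulary truncation above.
import torch

donor_embed = torch.randn(10, 4)  # stands in for the donor's embed_tokens.weight
target_rows = 8                   # stands in for the Qwen3 vocab size
aligned = donor_embed[:target_rows, :].clone()
print(aligned.shape)              # torch.Size([8, 4])
```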
prepare_donor_v5.py ADDED
@@ -0,0 +1,74 @@
+ # prepare_donor_v5.py
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from accelerate import init_empty_weights
+
+ def main(foundation_model_id, donor_model_id, output_path):
+     print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")
+
+     foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     target_config = foundation_config
+     target_config.num_hidden_layers = 80
+     target_config.hidden_size = 8192
+     target_config.intermediate_size = 29568
+     target_config.vocab_size = 151936
+     target_config.torch_dtype = torch.bfloat16
+
+     print("Creating empty Qwen3 80-layer model shell...")
+     with init_empty_weights():
+         aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
+         # Don't tie weights yet; it can cause sharing issues.
+         # aligned_model.tie_weights()
+     print("Empty shell created successfully.")
+
+     print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
+     print(f"Loading weights from donor: {donor_model_id}")
+
+     donor_state_dict = AutoModelForCausalLM.from_pretrained(
+         donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
+     ).state_dict()
+
+     target_state_dict = aligned_model.state_dict()
+     new_state_dict = {}
+
+     print("Copying and aligning tensors one-by-one...")
+     for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
+         if name in donor_state_dict:
+             donor_tensor = donor_state_dict[name]
+             if donor_tensor.shape == target_tensor.shape:
+                 new_state_dict[name] = donor_tensor.clone()
+             else:
+                 vocab_dim = target_tensor.shape[0]
+                 new_state_dict[name] = donor_tensor[:vocab_dim, :].clone()
+         else:
+             # Force a unique copy for every tensor not in the donor.
+             new_state_dict[name] = target_tensor.clone()
+
+     print("Loading the fully aligned state_dict into the Qwen3 shell...")
+     aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)
+
+     # Tie the weights *after* loading the unique tensors.
+     aligned_model.tie_weights()
+
+     print("\n--- Phase 3: Saving the Aligned Donor ---")
+     tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     print(f"Saving the architecturally aligned model to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     aligned_model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nDonor preparation complete! This is the definitive donor model.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
+     parser.add_argument("--foundation_model", type=str, default="Qwen/Qwen3-32B", help="Model to use for the Qwen3 architecture blueprint.")
+     parser.add_argument("--donor_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The donor model providing the weights.")
+     parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
+     args = parser.parse_args()
+
+     main(args.foundation_model, args.donor_model, args.output_path)
prepare_donor_v6.py ADDED
@@ -0,0 +1,80 @@
+ # prepare_donor_v6.py
+ import torch
+ import os
+ import argparse
+ from tqdm import tqdm
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from accelerate import init_empty_weights
+
+ def main(foundation_model_id, donor_model_id, output_path):
+     """
+     Creates the definitive 'Aligned' donor model by manually handling all architectural mismatches.
+     This version MANUALLY INSTANTIATES new tensors to defeat memory sharing optimizations.
+     """
+     print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")
+
+     foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     target_config = foundation_config
+     target_config.num_hidden_layers = 80
+     target_config.hidden_size = 8192
+     target_config.intermediate_size = 29568
+     target_config.vocab_size = 151936
+     target_config.torch_dtype = torch.bfloat16
+
+     print("Creating empty Qwen3 80-layer model shell...")
+     with init_empty_weights():
+         aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
+     print("Empty shell created successfully.")
+
+     print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
+     print(f"Loading weights from donor: {donor_model_id}")
+
+     donor_state_dict = AutoModelForCausalLM.from_pretrained(
+         donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
+     ).state_dict()
+
+     target_state_dict = aligned_model.state_dict()
+     new_state_dict = {}
+
+     print("Copying and aligning tensors one-by-one...")
+     for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
+         if name in donor_state_dict:
+             # This logic is for tensors that exist in the donor.
+             donor_tensor = donor_state_dict[name]
+             if donor_tensor.shape == target_tensor.shape:
+                 new_state_dict[name] = donor_tensor.clone()
+             else:  # Vocab mismatch case
+                 vocab_dim = target_tensor.shape[0]
+                 new_state_dict[name] = donor_tensor[:vocab_dim, :].clone()
+         else:
+             # --- THIS IS THE FINAL FIX ---
+             # This logic is for tensors NOT in the donor (q_norm, k_norm).
+             # We will not use the shared `target_tensor`. Instead, we create a new
+             # unique tensor of the correct shape and value for each one.
+             new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)
+
+     print("Loading the fully aligned state_dict into the Qwen3 shell...")
+     aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)
+
+     # Tie weights *after* all unique tensors are loaded.
+     aligned_model.tie_weights()
+
+     print("\n--- Phase 3: Saving the Aligned Donor ---")
+     tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)
+
+     print(f"Saving the architecturally aligned model to: {output_path}")
+     os.makedirs(output_path, exist_ok=True)
+     aligned_model.save_pretrained(output_path)
+     tokenizer.save_pretrained(output_path)
+
+     print("\nDonor preparation complete! This is the definitive donor model.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
+     parser.add_argument("--foundation_model", type=str, default="Qwen/Qwen3-32B", help="Model to use for the Qwen3 architecture blueprint.")
+     parser.add_argument("--donor_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The donor model providing the weights.")
+     parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
+     args = parser.parse_args()
+
+     main(args.foundation_model, args.donor_model, args.output_path)
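For context on the v5 → v6 change: tensors taken from a model built under init_empty_weights live on the meta device, and cloning a meta tensor yields another data-free meta tensor, which is presumably why v6 materializes fresh torch.ones instead of cloning. A minimal illustration:

```python
# Why v6 replaces `target_tensor.clone()` with `torch.ones(...)` for keys missing
# from the donor: clones of meta tensors carry no real data.
import torch
from accelerate import init_empty_weights

with init_empty_weights():
    lin = torch.nn.Linear(4, 4)

print(lin.weight.device)          # meta
print(lin.weight.clone().device)  # still meta -- no storage behind it
real = torch.ones(lin.weight.shape, dtype=torch.bfloat16)  # allocate real storage instead
print(real.device)                # cpu
```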