Upload folder using huggingface_hub
Browse files- README.md +63 -3
- config.json +7 -5
- mergekit_config.yml +36 -0
- model-00001-of-00031.safetensors +2 -2
- model-00002-of-00031.safetensors +2 -2
- model-00003-of-00031.safetensors +2 -2
- model-00004-of-00031.safetensors +2 -2
- model-00005-of-00031.safetensors +2 -2
- model-00006-of-00031.safetensors +2 -2
- model-00007-of-00031.safetensors +2 -2
- model-00008-of-00031.safetensors +1 -1
- model-00009-of-00031.safetensors +2 -2
- model-00010-of-00031.safetensors +2 -2
- model-00011-of-00031.safetensors +2 -2
- model-00012-of-00031.safetensors +1 -1
- model-00013-of-00031.safetensors +2 -2
- model-00014-of-00031.safetensors +2 -2
- model-00015-of-00031.safetensors +2 -2
- model-00016-of-00031.safetensors +1 -1
- model-00017-of-00031.safetensors +2 -2
- model-00018-of-00031.safetensors +2 -2
- model-00019-of-00031.safetensors +2 -2
- model-00020-of-00031.safetensors +1 -1
- model-00021-of-00031.safetensors +2 -2
- model-00022-of-00031.safetensors +2 -2
- model-00023-of-00031.safetensors +2 -2
- model-00024-of-00031.safetensors +1 -1
- model-00025-of-00031.safetensors +2 -2
- model-00026-of-00031.safetensors +2 -2
- model-00027-of-00031.safetensors +2 -2
- model-00028-of-00031.safetensors +1 -1
- model-00029-of-00031.safetensors +2 -2
- model-00030-of-00031.safetensors +2 -2
- model-00031-of-00031.safetensors +2 -2
- model.safetensors.index.json +0 -0
- nobias.py +55 -0
- phase1.py +138 -0
- phase2.yaml +36 -0
- prepare_donor.py +67 -0
- prepare_donor_v2.py +76 -0
- prepare_donor_v3.py +92 -0
- prepare_donor_v5.py +74 -0
- prepare_donor_v6.py +80 -0
README.md
CHANGED
@@ -1,5 +1,65 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
|
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: []
|
3 |
+
library_name: transformers
|
4 |
+
tags:
|
5 |
+
- mergekit
|
6 |
+
- merge
|
7 |
|
8 |
+
---
|
9 |
+
# Qwen3-72B-Instruct
|
10 |
|
11 |
+
This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
|
12 |
+
|
13 |
+
## Merge Details
|
14 |
+
### Merge Method
|
15 |
+
|
16 |
+
This model was merged using the [Linear](https://arxiv.org/abs/2203.05482) merge method using ./Qwen3-32B-Upscaled as a base.
|
17 |
+
|
18 |
+
### Models Merged
|
19 |
+
|
20 |
+
The following models were included in the merge:
|
21 |
+
* ./Qwen2.5-72B-Instruct-Aligned
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
The following YAML configuration was used to produce this model:
|
26 |
+
|
27 |
+
```yaml
|
28 |
+
merge_method: linear
|
29 |
+
|
30 |
+
base_model: ./Qwen3-32B-Upscaled
|
31 |
+
dtype: bfloat16
|
32 |
+
slices:
|
33 |
+
- merge_method: linear
|
34 |
+
sources:
|
35 |
+
- model: ./Qwen3-32B-Upscaled
|
36 |
+
layer_range: [0, 32]
|
37 |
+
parameters:
|
38 |
+
weight: 0.5
|
39 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
40 |
+
layer_range: [0, 32]
|
41 |
+
parameters:
|
42 |
+
weight: 0.5
|
43 |
+
- merge_method: linear
|
44 |
+
sources:
|
45 |
+
- model: ./Qwen3-32B-Upscaled
|
46 |
+
layer_range: [32, 48]
|
47 |
+
parameters:
|
48 |
+
weight: 0.0
|
49 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
50 |
+
layer_range: [32, 48]
|
51 |
+
parameters:
|
52 |
+
weight: 1.0
|
53 |
+
- merge_method: linear
|
54 |
+
sources:
|
55 |
+
- model: ./Qwen3-32B-Upscaled
|
56 |
+
layer_range: [32, 64]
|
57 |
+
parameters:
|
58 |
+
weight: 0.5
|
59 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
60 |
+
layer_range: [48, 80]
|
61 |
+
parameters:
|
62 |
+
weight: 0.5
|
63 |
+
tokenizer_source: ./Qwen3-32B-Upscaled
|
64 |
+
|
65 |
+
```
|
config.json
CHANGED
@@ -4,13 +4,15 @@
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
|
|
|
|
7 |
"head_dim": 128,
|
8 |
"hidden_act": "silu",
|
9 |
"hidden_size": 8192,
|
10 |
"initializer_range": 0.02,
|
11 |
"intermediate_size": 29568,
|
12 |
-
"max_position_embeddings":
|
13 |
-
"max_window_layers":
|
14 |
"model_type": "qwen3",
|
15 |
"num_attention_heads": 64,
|
16 |
"num_hidden_layers": 80,
|
@@ -18,11 +20,11 @@
|
|
18 |
"rms_norm_eps": 1e-06,
|
19 |
"rope_scaling": null,
|
20 |
"rope_theta": 1000000,
|
21 |
-
"sliding_window":
|
22 |
-
"tie_word_embeddings":
|
23 |
"torch_dtype": "bfloat16",
|
24 |
"transformers_version": "4.51.3",
|
25 |
"use_cache": true,
|
26 |
"use_sliding_window": false,
|
27 |
-
"vocab_size":
|
28 |
}
|
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 151643,
|
8 |
+
"eos_token_id": 151645,
|
9 |
"head_dim": 128,
|
10 |
"hidden_act": "silu",
|
11 |
"hidden_size": 8192,
|
12 |
"initializer_range": 0.02,
|
13 |
"intermediate_size": 29568,
|
14 |
+
"max_position_embeddings": 40960,
|
15 |
+
"max_window_layers": 64,
|
16 |
"model_type": "qwen3",
|
17 |
"num_attention_heads": 64,
|
18 |
"num_hidden_layers": 80,
|
|
|
20 |
"rms_norm_eps": 1e-06,
|
21 |
"rope_scaling": null,
|
22 |
"rope_theta": 1000000,
|
23 |
+
"sliding_window": null,
|
24 |
+
"tie_word_embeddings": false,
|
25 |
"torch_dtype": "bfloat16",
|
26 |
"transformers_version": "4.51.3",
|
27 |
"use_cache": true,
|
28 |
"use_sliding_window": false,
|
29 |
+
"vocab_size": 151669
|
30 |
}
|
mergekit_config.yml
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
merge_method: linear
|
2 |
+
|
3 |
+
base_model: ./Qwen3-32B-Upscaled
|
4 |
+
dtype: bfloat16
|
5 |
+
slices:
|
6 |
+
- merge_method: linear
|
7 |
+
sources:
|
8 |
+
- model: ./Qwen3-32B-Upscaled
|
9 |
+
layer_range: [0, 32]
|
10 |
+
parameters:
|
11 |
+
weight: 0.5
|
12 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
13 |
+
layer_range: [0, 32]
|
14 |
+
parameters:
|
15 |
+
weight: 0.5
|
16 |
+
- merge_method: linear
|
17 |
+
sources:
|
18 |
+
- model: ./Qwen3-32B-Upscaled
|
19 |
+
layer_range: [32, 48]
|
20 |
+
parameters:
|
21 |
+
weight: 0.0
|
22 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
23 |
+
layer_range: [32, 48]
|
24 |
+
parameters:
|
25 |
+
weight: 1.0
|
26 |
+
- merge_method: linear
|
27 |
+
sources:
|
28 |
+
- model: ./Qwen3-32B-Upscaled
|
29 |
+
layer_range: [32, 64]
|
30 |
+
parameters:
|
31 |
+
weight: 0.5
|
32 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
33 |
+
layer_range: [48, 80]
|
34 |
+
parameters:
|
35 |
+
weight: 0.5
|
36 |
+
tokenizer_source: ./Qwen3-32B-Upscaled
|
model-00001-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35fbeb0f8b3cc326f8c6e7f4f1747c84381c709ea16de9e7e67fb1136fd57baa
|
3 |
+
size 4969906520
|
model-00002-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b85466d7bc425ed4e9295b56d5c11a9f42750c9ba40b68a11e923397303518b2
|
3 |
+
size 4980822456
|
model-00003-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a05b876c8eba435b5fe793df62beba4048c806a5dbfb74225adb98542e8da9c
|
3 |
+
size 4764815928
|
model-00004-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d8b129f358836e1449ef24ef2d1a0d05cc368ff20650246d3328f5d3517b024
|
3 |
+
size 4781610128
|
model-00005-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57a2d8351dba42247ffe685803eddba72fa392732a21afb5a81d88a21e901207
|
3 |
+
size 4781610136
|
model-00006-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4fc7d5d413012b176cf9884cf32889ecb7e358995d6b8353176ff7adf625c9fe
|
3 |
+
size 4980822464
|
model-00007-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99d7fc43aa7b55c000c5df01672bce4e05eba43e6f9a80cad47fef843663ef46
|
3 |
+
size 4764815928
|
model-00008-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6ef65e8e5f342e8747cad3e210c936469b381f1fb3215dea036ac4a083be857d
|
3 |
size 4781610128
|
model-00009-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4989ca0f424300e14cf500c4e1f22d83e99778b250ef6e70165a9f1d86d6cb8c
|
3 |
+
size 4781610136
|
model-00010-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3faf79baba0d399730c353d423776e1033687a8eccde788c5065f407ed3515a0
|
3 |
+
size 4980822464
|
model-00011-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53105aa13fa33194ca4c95b1ba4d12d05a47018d5e44aafa292571bba166ade9
|
3 |
+
size 4764815928
|
model-00012-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ffe444539746c03a2a4ad5bf8d2d3eb67a3f46309e6789e2fa654ec0f0f8436
|
3 |
size 4781610128
|
model-00013-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:480be5925f312b1704aeec6d09328360f50a5cb495ee8a8231677f0a76d20a22
|
3 |
+
size 4781610136
|
model-00014-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7cc5b212b3a9fae6e8548e22807366fe41d8c5a3f04278e1f12bc2043c9eb63
|
3 |
+
size 4980822464
|
model-00015-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6cc80add3c83efda7d085964764fd22d5099bc51368c8833440e04ea429ca7d
|
3 |
+
size 4764815928
|
model-00016-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee8fbe0b58d99f6654e80f11d69f8239c60e8e4ee760b4601fa5d2692ec5f46f
|
3 |
size 4781610128
|
model-00017-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c163c04228a439b75e0b469b803079d43a108ad8bb81e35506765c5b252175f
|
3 |
+
size 4781610136
|
model-00018-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b61add16c1758adefb8191307e55469f0cf1158b3c3f8841f96698a6b94b777b
|
3 |
+
size 4980822464
|
model-00019-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c386662afbb04595217e683f31a62be86a8c52680c592e81d0f8cb5be7b503f6
|
3 |
+
size 4764815928
|
model-00020-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f79268be94b6db9da1ec22a1eb230c0894f2ae7dba79860af5a4795f2fa5e4a7
|
3 |
size 4781610128
|
model-00021-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a5312b78c7d05fbebd78dc5f7a10d7639fe3729f38f8a97de5b07706db19bd2
|
3 |
+
size 4781610136
|
model-00022-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3193ab90c1d8350b500e83bd2e7948c29a68dcbeb54e12bb50ca5c610ebbaa62
|
3 |
+
size 4980822464
|
model-00023-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac46a9298e7f5a0578217c6eccff85fa354965506a22908c6c8aa50cd38cd8e1
|
3 |
+
size 4764815928
|
model-00024-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e37ff960808f0c0753925e5d6cece3cbeeca5aebec1cd110e1c134eae63dc665
|
3 |
size 4781610128
|
model-00025-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4b20bee0cbc6f893d75c6beb4b82411361150025050047e0f9f09ddb446aada
|
3 |
+
size 4781610136
|
model-00026-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:467f58dafdd9fa9dc3c51be48e6f097714c0d87866cc6b85a2d7ce5c820178a8
|
3 |
+
size 4980822464
|
model-00027-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:791e4dec823afbb3fbc4228704df1c15ed89c91b5e82e2526578ac9144293e74
|
3 |
+
size 4764815928
|
model-00028-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4781610128
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6216bab7883e6bd27a487443253f8bcec6de9c46c73f65101a1c1b8ac541ee85
|
3 |
size 4781610128
|
model-00029-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0481518f9a06b33a06cac2cea65b01afda9c7393278713754d1bcd6b7f017c2e
|
3 |
+
size 4781610136
|
model-00030-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd8d12ed2fe337c6963b5c5f3229fb3b3b9a8c90701185c645649fd96dd6442f
|
3 |
+
size 4980822456
|
model-00031-of-00031.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:289f1c97a620d890d159d0cd21c44db07345bb47a0df6b35ee1e9f4159deb536
|
3 |
+
size 285229888
|
model.safetensors.index.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
nobias.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# remove_biases.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
7 |
+
|
8 |
+
def main(source_model_id, output_path):
|
9 |
+
"""
|
10 |
+
Loads a model, removes all tensors ending in '.bias', and saves the result.
|
11 |
+
"""
|
12 |
+
print(f"Loading source donor model: {source_model_id}")
|
13 |
+
# Load on CPU to avoid using VRAM
|
14 |
+
model = AutoModelForCausalLM.from_pretrained(
|
15 |
+
source_model_id,
|
16 |
+
torch_dtype=torch.bfloat16,
|
17 |
+
device_map="cpu",
|
18 |
+
trust_remote_code=True
|
19 |
+
)
|
20 |
+
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
|
21 |
+
|
22 |
+
source_state_dict = model.state_dict()
|
23 |
+
new_state_dict = {}
|
24 |
+
|
25 |
+
print("Removing all '.bias' tensors...")
|
26 |
+
removed_count = 0
|
27 |
+
for name, tensor in tqdm(source_state_dict.items(), desc="Processing Tensors"):
|
28 |
+
if name.endswith(".bias"):
|
29 |
+
removed_count += 1
|
30 |
+
continue # Skip this tensor
|
31 |
+
new_state_dict[name] = tensor
|
32 |
+
|
33 |
+
print(f"Removed {removed_count} bias tensors.")
|
34 |
+
|
35 |
+
# We don't need to create a new model from config, as the architecture is
|
36 |
+
# a subset of the original. We can load the new state dict with strict=False.
|
37 |
+
print("Loading the no-bias state dict back into the model...")
|
38 |
+
model.load_state_dict(new_state_dict, strict=False)
|
39 |
+
|
40 |
+
print(f"Saving the no-bias model and tokenizer to: {output_path}")
|
41 |
+
os.makedirs(output_path, exist_ok=True)
|
42 |
+
model.save_pretrained(output_path)
|
43 |
+
tokenizer.save_pretrained(output_path)
|
44 |
+
|
45 |
+
print("\nPhase 1b (No-Bias Donor Creation) Complete!")
|
46 |
+
print(f"The no-bias donor is ready at '{output_path}'.")
|
47 |
+
|
48 |
+
if __name__ == "__main__":
|
49 |
+
parser = argparse.ArgumentParser(description="Remove bias tensors from a model.")
|
50 |
+
parser.add_argument("--source_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The Hugging Face model ID of the source model.")
|
51 |
+
parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the no-bias model.")
|
52 |
+
args = parser.parse_args()
|
53 |
+
|
54 |
+
# Example: python remove_biases.py --output_path ./Qwen2.5-72B-Instruct-NoBias
|
55 |
+
main(args.source_model, args.output_path)
|
phase1.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import argparse
|
4 |
+
from tqdm import tqdm
|
5 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
6 |
+
from accelerate import init_empty_weights # <-- IMPORT THE FIX
|
7 |
+
|
8 |
+
# --- Configuration ---
|
9 |
+
SRC_HIDDEN_SIZE = 5120
|
10 |
+
SRC_INTERMEDIATE_SIZE = 25600
|
11 |
+
TGT_HIDDEN_SIZE = 8192
|
12 |
+
TGT_INTERMEDIATE_SIZE = 29568
|
13 |
+
|
14 |
+
DELTA_HIDDEN = TGT_HIDDEN_SIZE - SRC_HIDDEN_SIZE
|
15 |
+
DELTA_INTERMEDIATE = TGT_INTERMEDIATE_SIZE - SRC_INTERMEDIATE_SIZE
|
16 |
+
|
17 |
+
# --- Interpolation Functions ---
|
18 |
+
def linear_interpolation(block1, block2, weight=0.5):
|
19 |
+
return (1 - weight) * block1 + weight * block2
|
20 |
+
|
21 |
+
def upscale_tensor(tensor: torch.Tensor, name: str) -> torch.Tensor:
|
22 |
+
# This function is correct from the previous version, simplified for brevity
|
23 |
+
if tensor.ndim == 1:
|
24 |
+
if tensor.shape[0] == SRC_HIDDEN_SIZE:
|
25 |
+
block1, block2 = tensor[:DELTA_HIDDEN], tensor[-DELTA_HIDDEN:]
|
26 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=0)
|
27 |
+
elif tensor.ndim == 2:
|
28 |
+
if "embed_tokens" in name or "lm_head" in name:
|
29 |
+
if tensor.shape[1] == SRC_HIDDEN_SIZE:
|
30 |
+
block1, block2 = tensor[:, :DELTA_HIDDEN], tensor[:, -DELTA_HIDDEN:]
|
31 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=1)
|
32 |
+
elif "self_attn" in name:
|
33 |
+
if "q_proj.weight" in name or "k_proj.weight" in name or "v_proj.weight" in name:
|
34 |
+
block1, block2 = tensor[:, :DELTA_HIDDEN], tensor[:, -DELTA_HIDDEN:]
|
35 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=1)
|
36 |
+
elif "o_proj.weight" in name:
|
37 |
+
block1, block2 = tensor[:DELTA_HIDDEN, :], tensor[-DELTA_HIDDEN:, :]
|
38 |
+
return torch.cat([tensor, linear_interpolation(block1, block2)], dim=0)
|
39 |
+
elif "mlp" in name:
|
40 |
+
if "gate_proj.weight" in name or "up_proj.weight" in name:
|
41 |
+
row_block1, row_block2 = tensor[:DELTA_INTERMEDIATE, :], tensor[-DELTA_INTERMEDIATE:, :]
|
42 |
+
upscaled_rows = torch.cat([tensor, linear_interpolation(row_block1, row_block2)], dim=0)
|
43 |
+
col_block1, col_block2 = upscaled_rows[:, :DELTA_HIDDEN], upscaled_rows[:, -DELTA_HIDDEN:]
|
44 |
+
return torch.cat([upscaled_rows, linear_interpolation(col_block1, col_block2)], dim=1)
|
45 |
+
elif "down_proj.weight" in name:
|
46 |
+
row_block1, row_block2 = tensor[:DELTA_HIDDEN, :], tensor[-DELTA_HIDDEN:, :]
|
47 |
+
upscaled_rows = torch.cat([tensor, linear_interpolation(row_block1, row_block2)], dim=0)
|
48 |
+
col_block1, col_block2 = upscaled_rows[:, :DELTA_INTERMEDIATE], upscaled_rows[:, -DELTA_INTERMEDIATE:]
|
49 |
+
return torch.cat([upscaled_rows, linear_interpolation(col_block1, col_block2)], dim=1)
|
50 |
+
return tensor
|
51 |
+
|
52 |
+
def run_test_inference(model_path, prompt):
|
53 |
+
print("\n" + "="*50)
|
54 |
+
print("Running test inference...")
|
55 |
+
print("="*50)
|
56 |
+
|
57 |
+
# Load the newly saved model to ensure it works from disk
|
58 |
+
print(f"Loading model from disk: {model_path}")
|
59 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
|
60 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
61 |
+
|
62 |
+
print(f"Prompt: \"{prompt}\"")
|
63 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
64 |
+
print(f"Using device: {device}")
|
65 |
+
model.to(device)
|
66 |
+
|
67 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(device)
|
68 |
+
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
|
69 |
+
|
70 |
+
result_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
|
71 |
+
print("\n--- Generated Text ---")
|
72 |
+
print(result_text)
|
73 |
+
print("----------------------")
|
74 |
+
print("\nTest inference complete.")
|
75 |
+
|
76 |
+
|
77 |
+
def main(source_model_id, output_path):
|
78 |
+
print(f"Loading source model: {source_model_id}")
|
79 |
+
# Load on CPU to save VRAM
|
80 |
+
source_model = AutoModelForCausalLM.from_pretrained(
|
81 |
+
source_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
|
82 |
+
)
|
83 |
+
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
|
84 |
+
|
85 |
+
source_state_dict = source_model.state_dict()
|
86 |
+
new_state_dict = {}
|
87 |
+
|
88 |
+
print("Up-scaling tensors using self-interpolation...")
|
89 |
+
for name, tensor in tqdm(source_state_dict.items(), desc="Processing Tensors"):
|
90 |
+
new_state_dict[name] = upscale_tensor(tensor.clone(), name)
|
91 |
+
|
92 |
+
# Free up memory from the source model before creating the new one
|
93 |
+
del source_model
|
94 |
+
del source_state_dict
|
95 |
+
|
96 |
+
print("\nCreating new model with target architecture...")
|
97 |
+
config = AutoConfig.from_pretrained(source_model_id, trust_remote_code=True)
|
98 |
+
config.hidden_size = TGT_HIDDEN_SIZE
|
99 |
+
config.intermediate_size = TGT_INTERMEDIATE_SIZE
|
100 |
+
config.torch_dtype = torch.bfloat16 # Ensure config specifies the correct dtype
|
101 |
+
|
102 |
+
# --- THIS IS THE FIX ---
|
103 |
+
# Initialize a 'skeleton' model without allocating memory. This is instant.
|
104 |
+
print("Step 1: Initializing empty 'skeleton' model on meta device (should be instantaneous)...")
|
105 |
+
with init_empty_weights():
|
106 |
+
new_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
|
107 |
+
print("Empty model created successfully.")
|
108 |
+
# ---------------------
|
109 |
+
|
110 |
+
# The model is now on the 'meta' device. We need to tie the weights to the CPU.
|
111 |
+
new_model.tie_weights()
|
112 |
+
|
113 |
+
print("\nStep 2: Loading up-scaled weights into the new model (this may take time and RAM)...")
|
114 |
+
# This step will materialize the weights on the CPU, consuming memory.
|
115 |
+
new_model.load_state_dict(new_state_dict, assign=True)
|
116 |
+
print("State dict loaded successfully.")
|
117 |
+
|
118 |
+
print(f"\nStep 3: Saving up-scaled model and tokenizer to: {output_path}")
|
119 |
+
os.makedirs(output_path, exist_ok=True)
|
120 |
+
new_model.save_pretrained(output_path)
|
121 |
+
tokenizer.save_pretrained(output_path)
|
122 |
+
|
123 |
+
print("\nPhase 1 (Self-Interpolation) Complete!")
|
124 |
+
print(f"The up-scaled model is ready at '{output_path}' for use with MergeKit.")
|
125 |
+
|
126 |
+
# --- Run Test Inference ---
|
127 |
+
# We pass the path instead of the model object to test loading from disk
|
128 |
+
run_test_inference(output_path, "The rain in Maine ")
|
129 |
+
|
130 |
+
if __name__ == "__main__":
|
131 |
+
parser = argparse.ArgumentParser(description="Upscale Qwen3-32B to hypothetical Qwen3-72B dimensions via self-interpolation.")
|
132 |
+
parser.add_argument("--source_model", type=str, default="Qwen/Qwen3-32B", help="The Hugging Face model ID of the source model.")
|
133 |
+
parser.add_argument(
|
134 |
+
"--output_path", type=str, default="./Qwen3-32B-Upscaled",
|
135 |
+
help="The local directory path to save the up-scaled model. (default: ./Qwen3-32B-Upscaled)"
|
136 |
+
)
|
137 |
+
args = parser.parse_args()
|
138 |
+
main(args.source_model, args.output_path)
|
phase2.yaml
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
merge_method: linear
|
2 |
+
|
3 |
+
base_model: ./Qwen3-32B-Upscaled
|
4 |
+
dtype: bfloat16
|
5 |
+
slices:
|
6 |
+
- merge_method: linear
|
7 |
+
sources:
|
8 |
+
- model: ./Qwen3-32B-Upscaled
|
9 |
+
layer_range: [0, 32]
|
10 |
+
parameters:
|
11 |
+
weight: 0.5
|
12 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
13 |
+
layer_range: [0, 32]
|
14 |
+
parameters:
|
15 |
+
weight: 0.5
|
16 |
+
- merge_method: linear
|
17 |
+
sources:
|
18 |
+
- model: ./Qwen3-32B-Upscaled
|
19 |
+
layer_range: [32, 48]
|
20 |
+
parameters:
|
21 |
+
weight: 0.0
|
22 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
23 |
+
layer_range: [32, 48]
|
24 |
+
parameters:
|
25 |
+
weight: 1.0
|
26 |
+
- merge_method: linear
|
27 |
+
sources:
|
28 |
+
- model: ./Qwen3-32B-Upscaled
|
29 |
+
layer_range: [32, 64]
|
30 |
+
parameters:
|
31 |
+
weight: 0.5
|
32 |
+
- model: ./Qwen2.5-72B-Instruct-Aligned
|
33 |
+
layer_range: [48, 80]
|
34 |
+
parameters:
|
35 |
+
weight: 0.5
|
36 |
+
tokenizer_source: ./Qwen3-32B-Upscaled
|
prepare_donor.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
|
8 |
+
def main(source_model_id, output_path):
|
9 |
+
"""
|
10 |
+
Loads a Qwen2.5 model, removes all '.bias' tensors, adds placeholder
|
11 |
+
'q_norm.weight' and 'k_norm.weight' tensors, and saves the result.
|
12 |
+
This creates an architecturally compatible donor for a Qwen3 merge.
|
13 |
+
"""
|
14 |
+
print(f"Loading source donor model: {source_model_id}")
|
15 |
+
# Load on CPU to save VRAM
|
16 |
+
model = AutoModelForCausalLM.from_pretrained(
|
17 |
+
source_model_id,
|
18 |
+
torch_dtype=torch.bfloat16,
|
19 |
+
device_map="cpu",
|
20 |
+
trust_remote_code=True
|
21 |
+
)
|
22 |
+
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
|
23 |
+
config = model.config
|
24 |
+
|
25 |
+
source_state_dict = model.state_dict()
|
26 |
+
new_state_dict = {}
|
27 |
+
|
28 |
+
# --- Part 1: Remove '.bias' tensors ---
|
29 |
+
print("Removing all '.bias' tensors...")
|
30 |
+
for name, tensor in tqdm(source_state_dict.items(), desc="Filtering Tensors"):
|
31 |
+
if not name.endswith(".bias"):
|
32 |
+
new_state_dict[name] = tensor
|
33 |
+
|
34 |
+
# --- Part 2: Add placeholder 'q_norm' and 'k_norm' tensors ---
|
35 |
+
print("Adding placeholder 'q_norm' and 'k_norm' tensors...")
|
36 |
+
# These norms are 1D vectors of size `head_dim` (128)
|
37 |
+
# A value of 1.0 is a standard, neutral initialization for a norm weight.
|
38 |
+
norm_dim = config.hidden_size // config.num_attention_heads # Should be 128 for this model
|
39 |
+
placeholder_norm = torch.ones(norm_dim, dtype=torch.bfloat16)
|
40 |
+
|
41 |
+
for i in tqdm(range(config.num_hidden_layers), desc="Adding Norm Tensors"):
|
42 |
+
q_norm_name = f"model.layers.{i}.self_attn.q_norm.weight"
|
43 |
+
k_norm_name = f"model.layers.{i}.self_attn.k_norm.weight"
|
44 |
+
new_state_dict[q_norm_name] = placeholder_norm.clone()
|
45 |
+
new_state_dict[k_norm_name] = placeholder_norm.clone()
|
46 |
+
|
47 |
+
# The original model is a fine container, we just need to load the modified state dict.
|
48 |
+
# strict=False is crucial because we have removed and added keys.
|
49 |
+
print("Loading the new state dict back into the model shell...")
|
50 |
+
model.load_state_dict(new_state_dict, strict=False, assign=True)
|
51 |
+
|
52 |
+
print(f"Saving the architecturally aligned model to: {output_path}")
|
53 |
+
os.makedirs(output_path, exist_ok=True)
|
54 |
+
model.save_pretrained(output_path)
|
55 |
+
tokenizer.save_pretrained(output_path)
|
56 |
+
|
57 |
+
print("\nDonor preparation complete!")
|
58 |
+
print(f"The aligned donor is ready at '{output_path}'.")
|
59 |
+
|
60 |
+
if __name__ == "__main__":
|
61 |
+
parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.")
|
62 |
+
parser.add_argument("--source_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The Hugging Face model ID of the source model.")
|
63 |
+
parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.")
|
64 |
+
args = parser.parse_args()
|
65 |
+
|
66 |
+
# Example: python prepare_donor.py --output_path ./Qwen2.5-72B-Instruct-Aligned
|
67 |
+
main(args.source_model, args.output_path)
|
prepare_donor_v2.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v2.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Create an 'Aligned' donor: a Qwen3-shaped shell filled with Qwen2.5 weights.

    1. Defines a target Qwen3 80-layer architecture from the foundation config.
    2. Creates an empty (meta-device) model 'shell' with this pure Qwen3 architecture.
    3. Fills the shell with the donor's weights, materializing real placeholder
       tensors for every parameter the donor does not provide.

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights (the Qwen2.5 donor).
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    # Load the CONFIG from the Qwen3 foundation to get the correct blueprint.
    foundation_config = AutoConfig.from_pretrained(
        foundation_model_id, trust_remote_code=True
    )

    # Reshape the config to the target 72B geometry with 80 layers.
    foundation_config.num_hidden_layers = 80
    foundation_config.hidden_size = 8192
    foundation_config.intermediate_size = 29568
    foundation_config.torch_dtype = torch.bfloat16

    # Create an empty 'shell' of the final model. This is instant and
    # memory-efficient; its config.json will be a pure Qwen3 config.
    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(
            foundation_config, trust_remote_code=True
        )
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    # Load the donor model on CPU to get its state dict.
    donor_model = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    donor_state_dict = donor_model.state_dict()
    # NOTE: the state-dict tensors keep the underlying storages alive, so this
    # only releases the module wrapper, not the weight memory itself.
    del donor_model

    # BUG FIX: previously this relied on load_state_dict(strict=False) to skip
    # shell-only parameters (q_norm/k_norm). Because the shell was created
    # under init_empty_weights(), those parameters stayed on the meta device
    # (no data) and save_pretrained() would fail. Materialize an explicit
    # placeholder — ones, the neutral RMSNorm scale — for every parameter the
    # donor lacks.
    print("Materializing placeholders for parameters missing from the donor...")
    for name, target_tensor in aligned_model.state_dict().items():
        if name not in donor_state_dict:
            donor_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    # strict=False still tolerates donor-only keys (e.g. '.bias' tensors the
    # Qwen3 shell doesn't have); assign=True swaps the meta tensors for real ones.
    # NOTE(review): if the donor and foundation vocab sizes differ,
    # load_state_dict will still raise a size-mismatch error — see
    # prepare_donor_v3.py for the truncation handling.
    print("Loading donor state_dict into the Qwen3 shell (strict=False)...")
    aligned_model.load_state_dict(donor_state_dict, strict=False, assign=True)

    # Tie weights only after real tensors are in place (tying meta tensors
    # before loading created shared data-less parameters).
    aligned_model.tie_weights()

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
69 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|
prepare_donor_v3.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v3.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Create the definitive 'Aligned' donor model, handling all architectural mismatches.

    1. Defines a target Qwen3 80-layer architecture.
    2. Creates an empty Qwen3 model 'shell' (on the meta device).
    3. Manually copies weights from the Qwen2.5 donor, truncating
       vocabulary-related tensors to fit the Qwen3 architecture and
       materializing real placeholders for shell-only tensors.

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights.
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

    # Target architecture: 80 layers, 72B dimensions, and Qwen3's vocab size.
    target_config = foundation_config
    target_config.num_hidden_layers = 80
    target_config.hidden_size = 8192
    target_config.intermediate_size = 29568
    target_config.vocab_size = 151936  # Explicitly set Qwen3 vocab size
    target_config.torch_dtype = torch.bfloat16

    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_model = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    donor_state_dict = donor_model.state_dict()
    del donor_model

    # Get the state dict of our target shell to know the correct names/shapes.
    target_state_dict = aligned_model.state_dict()
    new_state_dict = {}

    print("Copying and aligning tensors one-by-one...")
    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
        if name in donor_state_dict:
            donor_tensor = donor_state_dict[name]
            # If shapes match, copy directly.
            if donor_tensor.shape == target_tensor.shape:
                new_state_dict[name] = donor_tensor.clone()
            # If shapes mismatch, handle the known vocabulary size difference.
            else:
                print(f" - Resolving shape mismatch for {name}:")
                print(f"   Donor shape: {donor_tensor.shape}, Target shape: {target_tensor.shape}")
                # The known mismatch is on the vocab dimension (dim 0): truncate
                # the donor tensor to the target length. Slicing only dim 0 also
                # works for 1-D tensors, unlike the 2-D-only '[:v, :]' form.
                vocab_dim = target_tensor.shape[0]
                new_state_dict[name] = donor_tensor[:vocab_dim].clone()
        else:
            # BUG FIX: the shell was built under init_empty_weights(), so
            # target_tensor lives on the meta device and .clone() would yield
            # another data-less meta tensor; save_pretrained() would then fail.
            # Materialize a real placeholder instead — ones is the neutral
            # scale for the missing q_norm/k_norm RMSNorm weights.
            print(f" - Materializing placeholder tensor for {name} (not in donor)")
            new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    print("Loading the fully aligned state_dict into the Qwen3 shell...")
    # This load succeeds because every tensor now has the correct shape and data.
    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

    # Tie weights only after all unique tensors are loaded (tying the meta
    # shell up front created shared, data-less parameters).
    aligned_model.tie_weights()

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
85 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|
prepare_donor_v5.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v5.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Build an 80-layer Qwen3-architecture shell and fill it with Qwen2.5 donor
    weights, truncating vocab-dimension mismatches and materializing real
    placeholders for shell-only tensors (q_norm/k_norm).

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights.
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

    target_config = foundation_config
    target_config.num_hidden_layers = 80
    target_config.hidden_size = 8192
    target_config.intermediate_size = 29568
    target_config.vocab_size = 151936
    target_config.torch_dtype = torch.bfloat16

    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
    # Don't tie weights yet — tying the meta shell would create shared,
    # data-less parameters.
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_state_dict = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    ).state_dict()

    target_state_dict = aligned_model.state_dict()
    new_state_dict = {}

    print("Copying and aligning tensors one-by-one...")
    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
        if name in donor_state_dict:
            donor_tensor = donor_state_dict[name]
            if donor_tensor.shape == target_tensor.shape:
                new_state_dict[name] = donor_tensor.clone()
            else:
                # Known vocab-size mismatch: truncate along dim 0. Slicing only
                # dim 0 also handles 1-D tensors, unlike the 2-D-only '[:v, :]'.
                vocab_dim = target_tensor.shape[0]
                new_state_dict[name] = donor_tensor[:vocab_dim].clone()
        else:
            # BUG FIX: target_tensor is a meta tensor (shell built under
            # init_empty_weights), so .clone() only produces another data-less
            # meta tensor and save_pretrained() would fail. Create a real,
            # unique placeholder instead (ones == neutral RMSNorm scale).
            new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    print("Loading the fully aligned state_dict into the Qwen3 shell...")
    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

    # Tie the weights *after* loading the unique tensors.
    aligned_model.tie_weights()

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
67 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|
prepare_donor_v6.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prepare_donor_v6.py
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
from tqdm import tqdm
|
6 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from accelerate import init_empty_weights
|
8 |
+
|
9 |
+
def main(foundation_model_id, donor_model_id, output_path):
    """Create the definitive 'Aligned' donor model by manually handling all
    architectural mismatches.

    This version MANUALLY INSTANTIATES new tensors for shell-only parameters,
    so no data-less meta tensors (from init_empty_weights) survive into the
    saved checkpoint.

    Args:
        foundation_model_id: HF model ID supplying the Qwen3 architecture blueprint.
        donor_model_id: HF model ID supplying the weights.
        output_path: Local directory to write the prepared donor model to.
    """
    print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---")

    foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True)

    target_config = foundation_config
    target_config.num_hidden_layers = 80
    target_config.hidden_size = 8192
    target_config.intermediate_size = 29568
    target_config.vocab_size = 151936
    target_config.torch_dtype = torch.bfloat16

    print("Creating empty Qwen3 80-layer model shell...")
    with init_empty_weights():
        aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True)
    print("Empty shell created successfully.")

    print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---")
    print(f"Loading weights from donor: {donor_model_id}")

    donor_state_dict = AutoModelForCausalLM.from_pretrained(
        donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    ).state_dict()

    target_state_dict = aligned_model.state_dict()
    new_state_dict = {}

    print("Copying and aligning tensors one-by-one...")
    for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"):
        if name in donor_state_dict:
            # This logic is for tensors that exist in the donor.
            donor_tensor = donor_state_dict[name]
            if donor_tensor.shape == target_tensor.shape:
                new_state_dict[name] = donor_tensor.clone()
            else:  # Vocab mismatch case
                # Truncate along dim 0 (the vocab dimension). Slicing only
                # dim 0 also handles 1-D tensors, unlike the 2-D-only '[:v, :]'.
                vocab_dim = target_tensor.shape[0]
                new_state_dict[name] = donor_tensor[:vocab_dim].clone()
        else:
            # Tensors NOT in the donor (q_norm, k_norm): do not reuse the
            # shared meta `target_tensor` — create a new, unique tensor of the
            # correct shape and value (ones == neutral RMSNorm scale) to defeat
            # memory-sharing optimizations.
            new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16)

    print("Loading the fully aligned state_dict into the Qwen3 shell...")
    aligned_model.load_state_dict(new_state_dict, strict=True, assign=True)

    # Tie weights *after* all unique tensors are loaded.
    aligned_model.tie_weights()

    # Every kept tensor was cloned above, so the donor copy can be released
    # before serialization to reduce peak memory.
    del donor_state_dict, target_state_dict

    print("\n--- Phase 3: Saving the Aligned Donor ---")
    tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True)

    print(f"Saving the architecturally aligned model to: {output_path}")
    os.makedirs(output_path, exist_ok=True)
    aligned_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("\nDonor preparation complete! This is the definitive donor model.")
|
73 |
+
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Prepare a Qwen2.5 donor model for merging with Qwen3."
    )
    cli.add_argument(
        "--foundation_model",
        type=str,
        default="Qwen/Qwen3-32B",
        help="Model to use for the Qwen3 architecture blueprint.",
    )
    cli.add_argument(
        "--donor_model",
        type=str,
        default="Qwen/Qwen2.5-72B-Instruct",
        help="The donor model providing the weights.",
    )
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="The local directory path to save the prepared donor model.",
    )
    opts = cli.parse_args()
    main(opts.foundation_model, opts.donor_model, opts.output_path)
|