rahul7star commited on
Commit
fc6bdf0
·
verified ·
1 Parent(s): 094cad2

Migrated from GitHub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +9 -0
  2. LICENSE.txt +201 -0
  3. ORIGINAL_README.md +382 -0
  4. app.py +819 -0
  5. assets/InfiniteTalk_paper.pdf +3 -0
  6. assets/logo.jpg +0 -0
  7. assets/logo2.jpg +3 -0
  8. assets/pipeline.png +3 -0
  9. examples/multi/1-man.WAV +3 -0
  10. examples/multi/1-woman.WAV +3 -0
  11. examples/multi/ref_img.png +3 -0
  12. examples/multi_example_image.json +9 -0
  13. examples/single/1.wav +3 -0
  14. examples/single/ref_image.png +3 -0
  15. examples/single/ref_video.mp4 +3 -0
  16. examples/single_example_image.json +7 -0
  17. examples/single_example_video.json +7 -0
  18. generate_infinitetalk.py +657 -0
  19. kokoro/__init__.py +23 -0
  20. kokoro/__main__.py +148 -0
  21. kokoro/custom_stft.py +197 -0
  22. kokoro/istftnet.py +421 -0
  23. kokoro/model.py +155 -0
  24. kokoro/modules.py +183 -0
  25. kokoro/pipeline.py +445 -0
  26. requirements.txt +20 -0
  27. src/audio_analysis/torch_utils.py +20 -0
  28. src/audio_analysis/wav2vec2.py +125 -0
  29. src/utils.py +60 -0
  30. src/vram_management/__init__.py +1 -0
  31. src/vram_management/layers.py +243 -0
  32. wan/__init__.py +6 -0
  33. wan/configs/__init__.py +58 -0
  34. wan/configs/shared_config.py +19 -0
  35. wan/configs/wan_i2v_14B.py +24 -0
  36. wan/configs/wan_multitalk_14B.py +36 -0
  37. wan/configs/wan_t2v_14B.py +29 -0
  38. wan/configs/wan_t2v_1_3B.py +29 -0
  39. wan/distributed/__init__.py +0 -0
  40. wan/distributed/fsdp.py +43 -0
  41. wan/distributed/xdit_context_parallel.py +550 -0
  42. wan/first_last_frame2video.py +377 -0
  43. wan/image2video.py +350 -0
  44. wan/modules/__init__.py +18 -0
  45. wan/modules/attention.py +393 -0
  46. wan/modules/clip.py +542 -0
  47. wan/modules/model.py +631 -0
  48. wan/modules/multitalk_model.py +824 -0
  49. wan/modules/t5.py +535 -0
  50. wan/modules/tokenizers.py +82 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/InfiniteTalk_paper.pdf filter=lfs diff=lfs merge=lfs -text
37
+ assets/logo2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/pipeline.png filter=lfs diff=lfs merge=lfs -text
39
+ examples/multi/1-man.WAV filter=lfs diff=lfs merge=lfs -text
40
+ examples/multi/1-woman.WAV filter=lfs diff=lfs merge=lfs -text
41
+ examples/multi/ref_img.png filter=lfs diff=lfs merge=lfs -text
42
+ examples/single/1.wav filter=lfs diff=lfs merge=lfs -text
43
+ examples/single/ref_image.png filter=lfs diff=lfs merge=lfs -text
44
+ examples/single/ref_video.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ <p align="center">
4
+ <img src="assets/logo2.jpg" alt="InfiniteTalk" width="440"/>
5
+ </p>
6
+
7
+ <h1>InfiniteTalk: Audio-driven Video Generation for Sparse-Frame Video Dubbing</h1>
8
+
9
+
10
+ [Shaoshu Yang*](https://scholar.google.com/citations?user=JrdZbTsAAAAJ&hl=en) · [Zhe Kong*](https://scholar.google.com/citations?user=4X3yLwsAAAAJ&hl=zh-CN) · [Feng Gao*](https://scholar.google.com/citations?user=lFkCeoYAAAAJ) · [Meng Cheng*]() · [Xiangyu Liu*]() · [Yong Zhang](https://yzhang2016.github.io/)<sup>&#9993;</sup> · [Zhuoliang Kang](https://scholar.google.com/citations?user=W1ZXjMkAAAAJ&hl=en)
11
+
12
+ [Wenhan Luo](https://whluo.github.io/) · [Xunliang Cai](https://openreview.net/profile?id=~Xunliang_Cai1) · [Ran He](https://scholar.google.com/citations?user=ayrg9AUAAAAJ&hl=en)· [Xiaoming Wei](https://scholar.google.com/citations?user=JXV5yrZxj5MC&hl=zh-CN)
13
+
14
+ <sup>*</sup>Equal Contribution
15
+ <sup>&#9993;</sup>Corresponding Authors
16
+
17
+ <a href='https://meigen-ai.github.io/InfiniteTalk/'><img src='https://img.shields.io/badge/Project-Page-green'></a>
18
+ <a href='https://arxiv.org/abs/2508.14033'><img src='https://img.shields.io/badge/Technique-Report-red'></a>
19
+ <a href='https://huggingface.co/MeiGen-AI/InfiniteTalk'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
20
+ </div>
21
+
22
+ > **TL;DR:** InfiniteTalk is an unlimited-length talking video generation model that supports both audio-driven video-to-video and image-to-video generation.
23
+
24
+ <p align="center">
25
+ <img src="assets/pipeline.png">
26
+ </p>
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+ ## 🔥 Latest News
35
+
36
+ * August 19, 2025: We release the [Technique-Report](https://arxiv.org/abs/2508.14033), weights, and code of **InfiniteTalk**. The Gradio demo and the [ComfyUI](https://github.com/MeiGen-AI/InfiniteTalk/tree/comfyui) branch have been released.
37
+ * August 19, 2025: We release the [project page](https://meigen-ai.github.io/InfiniteTalk/) of **InfiniteTalk**
38
+
39
+
40
+ ## ✨ Key Features
41
+ We propose **InfiniteTalk**, a novel sparse-frame video dubbing framework. Given an input video and audio track, InfiniteTalk synthesizes a new video with accurate lip synchronization while simultaneously aligning head movements, body posture, and facial expressions with the audio. Unlike traditional dubbing methods that focus solely on lips, InfiniteTalk enables infinite-length video generation with accurate lip synchronization and consistent identity preservation. Besides, InfiniteTalk can also be used as an image-audio-to-video model with an image and an audio clip as input.
42
+ - 💬 Sparse-frame Video Dubbing – Synchronizes not only lips, but also head, body, and expressions
43
+ - ⏱️ ​​Infinite-Length Generation​​ – Supports unlimited video duration
44
+ - ✨ ​​Stability​​ – Reduces hand/body distortions compared to MultiTalk
45
+ - 🚀 ​​Lip Accuracy​​ – Achieves superior lip synchronization to MultiTalk
46
+
47
+
48
+
49
+ ## 🌐 Community Works
50
+ -
51
+
52
+
53
+ ## 📑 Todo List
54
+
55
+ - [x] Release the technical report
56
+ - [x] Inference
57
+ - [x] Checkpoints
58
+ - [x] Multi-GPU Inference
59
+ - [ ] Inference acceleration
60
+ - [x] TeaCache
61
+ - [x] int8 quantization
62
+ - [ ] LCM distillation
63
+ - [ ] Sparse Attention
64
+ - [x] Run with very low VRAM
65
+ - [x] Gradio demo
66
+ - [x] ComfyUI
67
+
68
+ ## Video Demos
69
+
70
+
71
+ ### Video-to-video (HQ videos can be found on [Google Drive](https://drive.google.com/drive/folders/1BNrH6GJZ2Wt5gBuNLmfXZ6kpqb9xFPjU?usp=sharing) )
72
+
73
+
74
+ <table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
75
+ <tr>
76
+ <td>
77
+ <video src="https://github.com/user-attachments/assets/04f15986-8de7-4bb4-8cde-7f7f38244f9f" width="320" controls loop></video>
78
+ </td>
79
+ <td>
80
+ <video src="https://github.com/user-attachments/assets/1500f72e-a096-42e5-8b44-f887fa8ae7cb" width="320" controls loop></video>
81
+ </td>
82
+ <td>
83
+ <video src="https://github.com/user-attachments/assets/28f484c2-87dc-4828-a9e7-cb963da92d14" width="320" controls loop></video>
84
+ </td>
85
+ <td>
86
+ <video src="https://github.com/user-attachments/assets/665fabe4-3e24-4008-a0a2-a66e2e57c38b" width="320" controls loop></video>
87
+ </td>
88
+ </tr>
89
+ </table>
90
+
91
+ ### Image-to-video
92
+
93
+ <table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
94
+ <tr>
95
+ <td>
96
+ <video src="https://github.com/user-attachments/assets/7e4a4dad-9666-4896-8684-2acb36aead59" width="320" controls loop></video>
97
+ </td>
98
+ <td>
99
+ <video src="https://github.com/user-attachments/assets/bd6da665-f34d-4634-ae94-b4978f92ad3a" width="320" controls loop></video>
100
+ </td>
101
+ <td>
102
+ <video src="https://github.com/user-attachments/assets/510e2648-82db-4648-aaf3-6542303dbe22" width="320" controls loop></video>
103
+ </td>
104
+ <td>
105
+ <video src="https://github.com/user-attachments/assets/27bb087b-866a-4300-8a03-3bbb4ce3ddf9" width="320" controls loop></video>
106
+ </td>
107
+
108
+ </tr>
109
+ <tr>
110
+ <td>
111
+ <video src="https://github.com/user-attachments/assets/3263c5e1-9f98-4b9b-8688-b3e497460a76" width="320" controls loop></video>
112
+ </td>
113
+ <td>
114
+ <video src="https://github.com/user-attachments/assets/5ff3607f-90ec-4eee-b964-9d5ee3028005" width="320" controls loop></video>
115
+ </td>
116
+ <td>
117
+ <video src="https://github.com/user-attachments/assets/e504417b-c8c7-4cf0-9afa-da0f3cbf3726" width="320" controls loop></video>
118
+ </td>
119
+ <td>
120
+ <video src="https://github.com/user-attachments/assets/56aac91e-c51f-4d44-b80d-7d115e94ead7" width="320" controls loop></video>
121
+ </td>
122
+
123
+ </tr>
124
+ </table>
125
+
126
+ ## Quick Start
127
+
128
+ ### 🛠️Installation
129
+
130
+ #### 1. Create a conda environment and install pytorch, xformers
131
+ ```
132
+ conda create -n multitalk python=3.10
133
+ conda activate multitalk
134
+ pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121
135
+ pip install -U xformers==0.0.28 --index-url https://download.pytorch.org/whl/cu121
136
+ ```
137
+ #### 2. Flash-attn installation:
138
+ ```
139
+ pip install misaki[en]
140
+ pip install ninja
141
+ pip install psutil
142
+ pip install packaging
143
+ pip install flash_attn==2.7.4.post1
144
+ ```
145
+
146
+ #### 3. Other dependencies
147
+ ```
148
+ pip install -r requirements.txt
149
+ conda install -c conda-forge librosa
150
+ ```
151
+
152
+ #### 4. FFmpeg installation
153
+ ```
154
+ conda install -c conda-forge ffmpeg
155
+ ```
156
+ or
157
+ ```
158
+ sudo yum install ffmpeg ffmpeg-devel
159
+ ```
160
+
161
+ ### 🧱Model Preparation
162
+
163
+ #### 1. Model Download
164
+
165
+ | Models | Download Link | Notes |
166
+ | --------------|-------------------------------------------------------------------------------|-------------------------------|
167
+ | Wan2.1-I2V-14B-480P | 🤗 [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P) | Base model
168
+ | chinese-wav2vec2-base | 🤗 [Huggingface](https://huggingface.co/TencentGameMate/chinese-wav2vec2-base) | Audio encoder
169
+ | MeiGen-InfiniteTalk | 🤗 [Huggingface](https://huggingface.co/MeiGen-AI/InfiniteTalk) | Our audio condition weights
170
+
171
+ Download models using huggingface-cli:
172
+ ``` sh
173
+ huggingface-cli download Wan-AI/Wan2.1-I2V-14B-480P --local-dir ./weights/Wan2.1-I2V-14B-480P
174
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base --local-dir ./weights/chinese-wav2vec2-base
175
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base model.safetensors --revision refs/pr/1 --local-dir ./weights/chinese-wav2vec2-base
176
+ huggingface-cli download MeiGen-AI/InfiniteTalk --local-dir ./weights/InfiniteTalk
177
+
178
+ ```
179
+
180
+ ### 🔑 Quick Inference
181
+
182
+ Our model is compatible with both 480P and 720P resolutions.
183
+ > Some tips
184
+ > - Lip synchronization accuracy:​​ Audio CFG works optimally between 3–5. Increase the audio CFG value for better synchronization.
185
+ > - FusionX: While it enables faster inference and higher quality, FusionX LoRA exacerbates color shift over 1 minute and reduces ID preservation in videos.
186
+ > - V2V generation: Enables unlimited length generation. The model mimics the original video's camera movement, though not identically. Using SDEdit improves camera movement accuracy significantly but introduces color shift and is best suited for short clips. Improvements for long video camera control are planned.
187
+ > - I2V generation: Generates good results from a single image for up to 1 minute. Beyond 1 minute, color shifts become more pronounced. One trick for high-quality generation beyond 1 minute is to convert the image into a video by translating or zooming in on it.
188
+
189
+
190
+ #### Usage of InfiniteTalk
191
+ ```
192
+ --mode streaming: long video generation.
193
+ --mode clip: generate short video with one chunk.
194
+ --use_teacache: run with TeaCache.
195
+ --size infinitetalk-480: generate 480P video.
196
+ --size infinitetalk-720: generate 720P video.
197
+ --use_apg: run with APG.
198
+ --teacache_thresh: A coefficient used for TeaCache acceleration
199
+ --sample_text_guide_scale: When not using LoRA, the optimal value is 5. After applying LoRA, the recommended value is 1.
200
+ --sample_audio_guide_scale: When not using LoRA, the optimal value is 4. After applying LoRA, the recommended value is 2.
201
+ ```
202
+
203
+ #### 1. Inference
204
+
205
+ ##### 1) Run with single GPU
206
+
207
+
208
+ ```
209
+ python generate_infinitetalk.py \
210
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
211
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
212
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
213
+ --input_json examples/single_example_image.json \
214
+ --size infinitetalk-480 \
215
+ --sample_steps 40 \
216
+ --mode streaming \
217
+ --motion_frame 9 \
218
+ --save_file infinitetalk_res
219
+
220
+ ```
221
+
222
+ ##### 2) Run with 720P
223
+
224
+ If you want to run with 720P, set `--size infinitetalk-720`:
225
+
226
+ ```
227
+ python generate_infinitetalk.py \
228
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
229
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
230
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
231
+ --input_json examples/single_example_image.json \
232
+ --size infinitetalk-720 \
233
+ --sample_steps 40 \
234
+ --mode streaming \
235
+ --motion_frame 9 \
236
+ --save_file infinitetalk_res_720p
237
+
238
+ ```
239
+
240
+ ##### 3) Run with very low VRAM
241
+
242
+ If you want to run with very low VRAM, set `--num_persistent_param_in_dit 0`:
243
+
244
+
245
+ ```
246
+ python generate_infinitetalk.py \
247
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
248
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
249
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
250
+ --input_json examples/single_example_image.json \
251
+ --size infinitetalk-480 \
252
+ --sample_steps 40 \
253
+ --num_persistent_param_in_dit 0 \
254
+ --mode streaming \
255
+ --motion_frame 9 \
256
+ --save_file infinitetalk_res_lowvram
257
+ ```
258
+
259
+ ##### 4) Multi-GPU inference
260
+
261
+ ```
262
+ GPU_NUM=8
263
+ torchrun --nproc_per_node=$GPU_NUM --standalone generate_infinitetalk.py \
264
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
265
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
266
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
267
+ --dit_fsdp --t5_fsdp \
268
+ --ulysses_size=$GPU_NUM \
269
+ --input_json examples/single_example_image.json \
270
+ --size infinitetalk-480 \
271
+ --sample_steps 40 \
272
+ --mode streaming \
273
+ --motion_frame 9 \
274
+ --save_file infinitetalk_res_multigpu
275
+ ```
276
+
277
+ ##### 5) Multi-Person animation
278
+
279
+ ```
280
+ python generate_infinitetalk.py \
281
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
282
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
283
+ --infinitetalk_dir weights/InfiniteTalk/multi/infinitetalk.safetensors \
284
+ --input_json examples/multi_example_image.json \
285
+ --size infinitetalk-480 \
286
+ --sample_steps 40 \
287
+ --num_persistent_param_in_dit 0 \
288
+ --mode streaming \
289
+ --motion_frame 9 \
290
+ --save_file infinitetalk_res_multiperson
291
+ ```
292
+
293
+
294
+ #### 2. Run with FusioniX or Lightx2v (requires only 4–8 steps)
295
+
296
+ [FusioniX](https://huggingface.co/vrgamedevgirl84/Wan14BT2VFusioniX/blob/main/FusionX_LoRa/Wan2.1_I2V_14B_FusionX_LoRA.safetensors) requires 8 steps and [lightx2v](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors) requires only 4 steps.
297
+
298
+ ```
299
+ python generate_infinitetalk.py \
300
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
301
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
302
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
303
+ --lora_dir weights/Wan2.1_I2V_14B_FusionX_LoRA.safetensors \
304
+ --input_json examples/single_example_image.json \
305
+ --lora_scale 1.0 \
306
+ --size infinitetalk-480 \
307
+ --sample_text_guide_scale 1.0 \
308
+ --sample_audio_guide_scale 2.0 \
309
+ --sample_steps 8 \
310
+ --mode streaming \
311
+ --motion_frame 9 \
312
+ --sample_shift 2 \
313
+ --num_persistent_param_in_dit 0 \
314
+ --save_file infinitetalk_res_lora
315
+ ```
316
+
317
+
318
+
319
+ #### 3. Run with the quantized model (only supports running on a single GPU)
320
+
321
+ ```
322
+ python generate_infinitetalk.py \
323
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
324
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
325
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
326
+ --input_json examples/single_example_image.json \
327
+ --size infinitetalk-480 \
328
+ --sample_steps 40 \
329
+ --mode streaming \
330
+ --quant fp8 \
331
+ --quant_dir weights/InfiniteTalk/quant_models/infinitetalk_single_fp8.safetensors \
332
+ --motion_frame 9 \
333
+ --num_persistent_param_in_dit 0 \
334
+ --save_file infinitetalk_res_quant
335
+ ```
336
+
337
+
338
+ #### 4. Run with Gradio
339
+
340
+
341
+
342
+ ```
343
+ python app.py \
344
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
345
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
346
+ --infinitetalk_dir weights/InfiniteTalk/single/infinitetalk.safetensors \
347
+ --num_persistent_param_in_dit 0 \
348
+ --motion_frame 9
349
+ ```
350
+ or
351
+ ```
352
+ python app.py \
353
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
354
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
355
+ --infinitetalk_dir weights/InfiniteTalk/multi/infinitetalk.safetensors \
356
+ --num_persistent_param_in_dit 0 \
357
+ --motion_frame 9
358
+ ```
359
+
360
+
361
+ ## 📚 Citation
362
+
363
+ If you find our work useful in your research, please consider citing:
364
+
365
+ ```
366
+ @misc{yang2025infinitetalkaudiodrivenvideogeneration,
367
+ title={InfiniteTalk: Audio-driven Video Generation for Sparse-Frame Video Dubbing},
368
+ author={Shaoshu Yang and Zhe Kong and Feng Gao and Meng Cheng and Xiangyu Liu and Yong Zhang and Zhuoliang Kang and Wenhan Luo and Xunliang Cai and Ran He and Xiaoming Wei},
369
+ year={2025},
370
+ eprint={2508.14033},
371
+ archivePrefix={arXiv},
372
+ primaryClass={cs.CV},
373
+ url={https://arxiv.org/abs/2508.14033},
374
+ }
375
+ ```
376
+
377
+ ## 📜 License
378
+ The models in this repository are licensed under the Apache 2.0 License. We claim no rights over your generated content,
379
+ granting you the freedom to use them while ensuring that your usage complies with the provisions of this license.
380
+ You are fully accountable for your use of the models, which must not involve sharing any content that violates applicable laws,
381
+ causes harm to individuals or groups, disseminates personal information intended for harm, spreads misinformation, or targets vulnerable populations.
382
+
app.py ADDED
@@ -0,0 +1,819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import argparse
3
+ import logging
4
+ import os
5
+ os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
6
+ import sys
7
+ import json
8
+ import warnings
9
+ from datetime import datetime
10
+
11
+ import gradio as gr
12
+ warnings.filterwarnings('ignore')
13
+
14
+ import random
15
+
16
+ import torch
17
+ import torch.distributed as dist
18
+ from PIL import Image
19
+ import subprocess
20
+
21
+ import wan
22
+ from wan.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS
23
+ from wan.utils.utils import cache_image, cache_video, str2bool
24
+ from wan.utils.multitalk_utils import save_video_ffmpeg
25
+ from kokoro import KPipeline
26
+ from transformers import Wav2Vec2FeatureExtractor
27
+ from src.audio_analysis.wav2vec2 import Wav2Vec2Model
28
+
29
+ import librosa
30
+ import pyloudnorm as pyln
31
+ import numpy as np
32
+ from einops import rearrange
33
+ import soundfile as sf
34
+ import re
35
+
36
def _validate_args(args):
    """Validate parsed CLI arguments and fill in option-dependent defaults.

    Mutates ``args`` in place:
      * ``sample_steps`` defaults to 40 when not given.
      * ``sample_shift`` defaults to 7 for ``infinitetalk-480`` and 11 for
        ``infinitetalk-720`` when not given.
      * a negative ``base_seed`` is replaced by a random non-negative seed.

    Raises:
        AssertionError: if the checkpoint directory is missing, the task is
            unknown, or the size is not supported for the task.
        NotImplementedError: if ``sample_shift`` is unset and no default is
            defined for ``args.size``.
    """
    # Basic check
    assert args.ckpt_dir is not None, "Please specify the checkpoint directory."
    assert args.task in WAN_CONFIGS, f"Unsupport task: {args.task}"

    # This entry point always falls back to 40 sampling steps.
    if args.sample_steps is None:
        args.sample_steps = 40

    if args.sample_shift is None:
        default_shifts = {'infinitetalk-480': 7, 'infinitetalk-720': 11}
        if args.size not in default_shifts:
            # Name the offending value so the failure is actionable.
            raise NotImplementedError(
                f"Not supported size: {args.size!r}; no default sample_shift "
                f"is defined for it (known sizes: {sorted(default_shifts)}).")
        args.sample_shift = default_shifts[args.size]

    # A negative seed means "pick one at random".
    if args.base_seed < 0:
        args.base_seed = random.randint(0, 99999999)

    # Size check
    assert args.size in SUPPORTED_SIZES[args.task], (
        f"Unsupport size {args.size} for task {args.task}, supported sizes are: "
        f"{', '.join(SUPPORTED_SIZES[args.task])}")
59
+
60
+
61
def _parse_args():
    """Build the command-line parser, parse ``sys.argv`` and validate the result.

    Returns:
        The parsed ``argparse.Namespace`` after ``_validate_args`` has filled
        in the defaults that depend on other options.
    """
    parser = argparse.ArgumentParser(
        description="Generate a image or video from a text prompt or image using Wan")
    add = parser.add_argument

    # Task / geometry.
    add("--task", type=str, default="infinitetalk-14B",
        choices=list(WAN_CONFIGS.keys()), help="The task to run.")
    add("--size", type=str, default="infinitetalk-480",
        choices=list(SIZE_CONFIGS.keys()),
        help="The buckget size of the generated video. The aspect ratio of the output video will follow that of the input image.")
    add("--frame_num", type=int, default=81,
        help="How many frames to be generated in one clip. The number should be 4n+1")

    # Checkpoint locations.
    add("--ckpt_dir", type=str, default='./weights/Wan2.1-I2V-14B-480P',
        help="The path to the Wan checkpoint directory.")
    add("--quant_dir", type=str, default=None,
        help="The path to the Wan quant checkpoint directory.")
    add("--infinitetalk_dir", type=str,
        default='weights/InfiniteTalk/single/infinitetalk.safetensors',
        help="The path to the InfiniteTalk checkpoint directory.")
    add("--wav2vec_dir", type=str, default='./weights/chinese-wav2vec2-base',
        help="The path to the wav2vec checkpoint directory.")
    add("--dit_path", type=str, default=None,
        help="The path to the Wan checkpoint directory.")
    add("--lora_dir", type=str, nargs='+', default=None,
        help="The path to the LoRA checkpoint directory.")
    add("--lora_scale", type=float, nargs='+', default=[1.2],
        help="Controls how much to influence the outputs with the LoRA parameters. Accepts multiple float values.")

    # Memory / parallelism.
    add("--offload_model", type=str2bool, default=None,
        help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage.")
    add("--ulysses_size", type=int, default=1,
        help="The size of the ulysses parallelism in DiT.")
    add("--ring_size", type=int, default=1,
        help="The size of the ring attention parallelism in DiT.")
    add("--t5_fsdp", action="store_true", default=False,
        help="Whether to use FSDP for T5.")
    add("--t5_cpu", action="store_true", default=False,
        help="Whether to place T5 model on CPU.")
    add("--dit_fsdp", action="store_true", default=False,
        help="Whether to use FSDP for DiT.")

    # Inputs / outputs.
    add("--save_file", type=str, default=None,
        help="The file to save the generated image or video to.")
    add("--audio_save_dir", type=str, default='save_audio/gradio',
        help="The path to save the audio embedding.")
    add("--base_seed", type=int, default=42,
        help="The seed to use for generating the image or video.")
    add("--input_json", type=str, default='examples.json',
        help="[meta file] The condition path to generate the video.")

    # Generation behaviour.
    add("--motion_frame", type=int, default=9,
        help="Driven frame length used in the mode of long video genration.")
    add("--mode", type=str, default="streaming", choices=['clip', 'streaming'],
        help="clip: generate one video chunk, streaming: long video generation")
    add("--sample_steps", type=int, default=None, help="The sampling steps.")
    add("--sample_shift", type=float, default=None,
        help="Sampling shift factor for flow matching schedulers.")
    add("--sample_text_guide_scale", type=float, default=5.0,
        help="Classifier free guidance scale for text control.")
    add("--sample_audio_guide_scale", type=float, default=4.0,
        help="Classifier free guidance scale for audio control.")
    add("--num_persistent_param_in_dit", type=int, default=None, required=False,
        help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required")
    add("--use_teacache", action="store_true", default=False,
        help="Enable teacache for video generation.")
    add("--teacache_thresh", type=float, default=0.2,
        help="Threshold for teacache.")
    add("--use_apg", action="store_true", default=False,
        help="Enable adaptive projected guidance for video generation (APG).")
    add("--apg_momentum", type=float, default=-0.75,
        help="Momentum used in adaptive projected guidance (APG).")
    add("--apg_norm_threshold", type=float, default=55,
        help="Norm threshold used in adaptive projected guidance (APG).")
    add("--color_correction_strength", type=float, default=1.0,
        help="strength for color correction [0.0 -- 1.0].")
    add("--quant", type=str, default=None,
        help="Quantization type, must be 'int8' or 'fp8'.")

    parsed = parser.parse_args()
    _validate_args(parsed)
    return parsed
254
+
255
+
256
def custom_init(device, wav2vec):
    """Load the wav2vec2 audio encoder and its feature extractor from local files.

    Args:
        device: Device the audio encoder is moved to.
        wav2vec: Path of the local wav2vec2 checkpoint directory.

    Returns:
        Tuple ``(wav2vec_feature_extractor, audio_encoder)``.
    """
    encoder = Wav2Vec2Model.from_pretrained(wav2vec, local_files_only=True)
    encoder = encoder.to(device)
    # Freeze the encoder's feature-extractor parameters.
    encoder.feature_extractor._freeze_parameters()
    extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        wav2vec, local_files_only=True)
    return extractor, encoder
261
+
262
def loudness_norm(audio_array, sr=16000, lufs=-23):
    """Normalize ``audio_array`` to the target integrated loudness.

    Args:
        audio_array: Audio samples to normalize.
        sr: Sample rate handed to the loudness meter.
        lufs: Target loudness passed to ``pyln.normalize.loudness``.

    Returns:
        The normalized audio, or the input unchanged when the measured
        loudness has a magnitude above 100 (e.g. an infinite reading).
    """
    measured = pyln.Meter(sr).integrated_loudness(audio_array)
    if abs(measured) > 100:
        # Out-of-range reading: leave the signal untouched.
        return audio_array
    return pyln.normalize.loudness(audio_array, measured, lufs)
269
+
270
def audio_prepare_multi(left_path, right_path, audio_type, sample_rate=16000):
    """Load two speakers' audio and lay them out on a common timeline.

    Args:
        left_path: Audio/video path for speaker 1, or ``None`` / ``'None'``
            when that speaker has no audio (a silent track is used).
        right_path: Same, for speaker 2.
        audio_type: ``'para'`` to play both tracks simultaneously, or
            ``'add'`` to play speaker 1 first and speaker 2 afterwards.
        sample_rate: Sample rate the audio is loaded at.

    Returns:
        Tuple ``(speech1, speech2, mixed)`` of equal-length arrays, where
        ``mixed`` is the element-wise sum of the two tracks.

    Raises:
        ValueError: if ``audio_type`` is unknown or both paths are missing.
    """
    if audio_type not in ('para', 'add'):
        raise ValueError(
            f"Unknown audio_type {audio_type!r}; expected 'para' or 'add'.")

    # Accept both the literal string 'None' and a real None for a missing input.
    left_missing = left_path is None or left_path == 'None'
    right_missing = right_path is None or right_path == 'None'
    if left_missing and right_missing:
        raise ValueError("At least one of left_path / right_path must be provided.")

    if left_missing:
        human_speech_array2 = audio_prepare_single(right_path, sample_rate)
        human_speech_array1 = np.zeros(human_speech_array2.shape[0])
    elif right_missing:
        human_speech_array1 = audio_prepare_single(left_path, sample_rate)
        human_speech_array2 = np.zeros(human_speech_array1.shape[0])
    else:
        human_speech_array1 = audio_prepare_single(left_path, sample_rate)
        human_speech_array2 = audio_prepare_single(right_path, sample_rate)

    len1 = human_speech_array1.shape[0]
    len2 = human_speech_array2.shape[0]
    if audio_type == 'para':
        # Zero-pad the shorter track so the two can be summed sample by sample.
        total = max(len1, len2)
        new_human_speech1 = np.pad(human_speech_array1, (0, total - len1))
        new_human_speech2 = np.pad(human_speech_array2, (0, total - len2))
    else:  # 'add': speaker 1 talks first, then speaker 2.
        new_human_speech1 = np.concatenate([human_speech_array1, np.zeros(len2)])
        new_human_speech2 = np.concatenate([np.zeros(len1), human_speech_array2])

    sum_human_speechs = new_human_speech1 + new_human_speech2
    return new_human_speech1, new_human_speech2, sum_human_speechs
289
+
290
+ def _init_logging(rank):
291
+ # logging
292
+ if rank == 0:
293
+ # set format
294
+ logging.basicConfig(
295
+ level=logging.INFO,
296
+ format="[%(asctime)s] %(levelname)s: %(message)s",
297
+ handlers=[logging.StreamHandler(stream=sys.stdout)])
298
+ else:
299
+ logging.basicConfig(level=logging.ERROR)
300
+
301
def get_embedding(speech_array, wav2vec_feature_extractor, audio_encoder, sr=16000, device='cpu'):
    """Encode a waveform into stacked wav2vec2 hidden-state embeddings.

    Args:
        speech_array: Audio samples at sample rate ``sr``.
        wav2vec_feature_extractor: Callable producing ``input_values`` for the encoder.
        audio_encoder: Model called with ``seq_len`` and ``output_hidden_states=True``.
        sr: Sample rate of ``speech_array``.
        device: Device the input features are moved to.

    Returns:
        A detached CPU tensor built from all hidden states except the first,
        or ``None`` when the encoder output is empty.
    """
    # Target sequence length: one step per video frame, assuming 25 fps video.
    num_frames = int(len(speech_array) / sr * 25)

    # wav2vec_feature_extractor
    input_values = wav2vec_feature_extractor(speech_array, sampling_rate=sr).input_values
    features = torch.from_numpy(np.squeeze(input_values)).float()
    features = features.to(device=device).unsqueeze(0)

    # audio encoder
    with torch.no_grad():
        embeddings = audio_encoder(features, seq_len=num_frames, output_hidden_states=True)

    if len(embeddings) == 0:
        print("Fail to extract audio embedding")
        return None

    # Stack the per-layer states, drop the leading size-1 dim, then swap the
    # first two axes. (presumably layers <-> time steps; confirm against the model)
    stacked = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
    return rearrange(stacked, "b s d -> s b d").cpu().detach()
325
+
326
def extract_audio_from_video(filename, sample_rate):
    """Extract a video's audio track with ffmpeg and return it loudness-normalized.

    The intermediate WAV is written into a private temporary directory, so it
    cannot overwrite or delete a file in the working directory, cannot collide
    between concurrent requests, and is removed even when decoding fails.

    Args:
        filename: Path of the input video.
        sample_rate: Sample rate the extracted audio is loaded at.

    Returns:
        The loudness-normalized audio samples.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import tempfile  # local import: only this function needs it

    with tempfile.TemporaryDirectory() as tmp_dir:
        raw_audio_path = os.path.join(tmp_dir, "audio.wav")
        ffmpeg_command = [
            "ffmpeg",
            "-y",
            "-i",
            str(filename),
            "-vn",
            "-acodec",
            "pcm_s16le",
            "-ar",
            "16000",
            "-ac",
            "2",
            str(raw_audio_path),
        ]
        subprocess.run(ffmpeg_command, check=True)
        # Load fully into memory before the temporary directory is removed.
        human_speech_array, sr = librosa.load(raw_audio_path, sr=sample_rate)

    return loudness_norm(human_speech_array, sr)
348
+
349
def audio_prepare_single(audio_path, sample_rate=16000):
    """Load one speaker's audio from an audio or video file, loudness-normalized.

    Files ending in .mp4/.mov/.avi/.mkv go through ``extract_audio_from_video``;
    anything else is loaded directly with librosa.
    """
    suffix = os.path.splitext(audio_path)[1].lower()
    if suffix in ('.mp4', '.mov', '.avi', '.mkv'):
        return extract_audio_from_video(audio_path, sample_rate)

    waveform, loaded_sr = librosa.load(audio_path, sr=sample_rate)
    return loudness_norm(waveform, loaded_sr)
358
+
359
def process_tts_single(text, save_dir, voice1):
    """Synthesize ``text`` with one voice, save it, and reload it at 16 kHz.

    Args:
        text: Text to speak; the pipeline splits it on blank-line runs (``\\n+``).
        save_dir: Directory that receives ``s1.wav`` (written at 24000 Hz).
        voice1: Path of the voice tensor file loaded with ``torch.load``.

    Returns:
        Tuple ``(samples_16k, wav_path)``.
    """
    tts = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
    voice_tensor = torch.load(voice1, weights_only=True)
    chunks = [
        audio
        for _, _, audio in tts(
            text, voice=voice_tensor,  # <= change voice here
            speed=1, split_pattern=r'\n+')
    ]
    speech = torch.concat(chunks, dim=0)

    save_path1 = f'{save_dir}/s1.wav'
    sf.write(save_path1, speech, 24000)  # save each audio file
    s1, _ = librosa.load(save_path1, sr=16000)
    return s1, save_path1
379
+
380
+
381
+
382
def process_tts_multi(text, save_dir, voice1, voice2):
    """Synthesize a two-speaker dialogue into aligned per-speaker tracks.

    ``text`` is split into segments tagged ``(s1)`` / ``(s2)``. While one
    speaker talks, the other track receives silence of the same length, so
    both tracks share one timeline. Segments tagged with any other speaker
    number are ignored.

    Args:
        text: Dialogue such as ``"(s1) Hello. (s2) Hi there."``.
        save_dir: Directory that receives ``s1.wav``, ``s2.wav`` and
            ``sum.wav`` (written at 24000 Hz).
        voice1: Path of the voice tensor file for speaker 1.
        voice2: Path of the voice tensor file for speaker 2.

    Returns:
        Tuple ``(s1_16k, s2_16k, sum_wav_path)``.

    Raises:
        ValueError: if ``text`` yields no audio for speaker 1 or 2.
    """
    pattern = r'\(s(\d+)\)\s*(.*?)(?=\s*\(s\d+\)|$)'
    voice_paths = {'1': voice1, '2': voice2}
    segments = [
        (speaker, content)
        for speaker, content in re.findall(pattern, text, re.DOTALL)
        if speaker in voice_paths
    ]
    if not segments:
        raise ValueError(
            "No '(s1) ...' or '(s2) ...' segments found in the TTS text.")

    pipeline = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
    # Load each voice at most once, and only if that speaker actually talks.
    # NOTE(review): assumes the pipeline does not modify the voice tensor in place.
    voices = {}
    tracks = {'1': [], '2': []}
    for speaker, content in segments:
        if speaker not in voices:
            voices[speaker] = torch.load(voice_paths[speaker], weights_only=True)
        generator = pipeline(
            content, voice=voices[speaker],  # <= change voice here
            speed=1, split_pattern=r'\n+'
        )
        chunks = [audio for _, _, audio in generator]
        if not chunks:
            # Nothing was synthesized for this segment (e.g. empty content).
            continue
        spoken = torch.concat(chunks, dim=0)
        listener = '2' if speaker == '1' else '1'
        tracks[speaker].append(spoken)
        tracks[listener].append(torch.zeros_like(spoken))

    if not tracks['1']:
        raise ValueError("The TTS text produced no audio.")

    s1_sentences = torch.concat(tracks['1'], dim=0)
    s2_sentences = torch.concat(tracks['2'], dim=0)
    sum_sentences = s1_sentences + s2_sentences
    save_path1 = f'{save_dir}/s1.wav'
    save_path2 = f'{save_dir}/s2.wav'
    save_path_sum = f'{save_dir}/sum.wav'
    sf.write(save_path1, s1_sentences, 24000)  # save each audio file
    sf.write(save_path2, s2_sentences, 24000)
    sf.write(save_path_sum, sum_sentences, 24000)

    s1, _ = librosa.load(save_path1, sr=16000)
    s2, _ = librosa.load(save_path2, sr=16000)
    return s1, s2, save_path_sum
430
+
431
+ def run_graio_demo(args):
432
+ rank = int(os.getenv("RANK", 0))
433
+ world_size = int(os.getenv("WORLD_SIZE", 1))
434
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
435
+ device = local_rank
436
+ _init_logging(rank)
437
+
438
+ if args.offload_model is None:
439
+ args.offload_model = False if world_size > 1 else True
440
+ logging.info(
441
+ f"offload_model is not specified, set to {args.offload_model}.")
442
+ if world_size > 1:
443
+ torch.cuda.set_device(local_rank)
444
+ dist.init_process_group(
445
+ backend="nccl",
446
+ init_method="env://",
447
+ rank=rank,
448
+ world_size=world_size)
449
+ else:
450
+ assert not (
451
+ args.t5_fsdp or args.dit_fsdp
452
+ ), f"t5_fsdp and dit_fsdp are not supported in non-distributed environments."
453
+ assert not (
454
+ args.ulysses_size > 1 or args.ring_size > 1
455
+ ), f"context parallel are not supported in non-distributed environments."
456
+
457
+ if args.ulysses_size > 1 or args.ring_size > 1:
458
+ assert args.ulysses_size * args.ring_size == world_size, f"The number of ulysses_size and ring_size should be equal to the world size."
459
+ from xfuser.core.distributed import (
460
+ init_distributed_environment,
461
+ initialize_model_parallel,
462
+ )
463
+ init_distributed_environment(
464
+ rank=dist.get_rank(), world_size=dist.get_world_size())
465
+
466
+ initialize_model_parallel(
467
+ sequence_parallel_degree=dist.get_world_size(),
468
+ ring_degree=args.ring_size,
469
+ ulysses_degree=args.ulysses_size,
470
+ )
471
+
472
+
473
+ cfg = WAN_CONFIGS[args.task]
474
+ if args.ulysses_size > 1:
475
+ assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."
476
+
477
+ logging.info(f"Generation job args: {args}")
478
+ logging.info(f"Generation model config: {cfg}")
479
+
480
+ if dist.is_initialized():
481
+ base_seed = [args.base_seed] if rank == 0 else [None]
482
+ dist.broadcast_object_list(base_seed, src=0)
483
+ args.base_seed = base_seed[0]
484
+
485
+ assert args.task == "infinitetalk-14B", 'You should choose multitalk in args.task.'
486
+
487
+
488
+
489
+ wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
490
+ os.makedirs(args.audio_save_dir,exist_ok=True)
491
+
492
+
493
+ logging.info("Creating MultiTalk pipeline.")
494
+ wan_i2v = wan.InfiniteTalkPipeline(
495
+ config=cfg,
496
+ checkpoint_dir=args.ckpt_dir,
497
+ quant_dir=args.quant_dir,
498
+ device_id=device,
499
+ rank=rank,
500
+ t5_fsdp=args.t5_fsdp,
501
+ dit_fsdp=args.dit_fsdp,
502
+ use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
503
+ t5_cpu=args.t5_cpu,
504
+ lora_dir=args.lora_dir,
505
+ lora_scales=args.lora_scale,
506
+ quant=args.quant,
507
+ dit_path=args.dit_path,
508
+ infinitetalk_dir=args.infinitetalk_dir
509
+ )
510
+
511
+ if args.num_persistent_param_in_dit is not None:
512
+ wan_i2v.vram_management = True
513
+ wan_i2v.enable_vram_management(
514
+ num_persistent_param_in_dit=args.num_persistent_param_in_dit
515
+ )
516
+
517
+
518
+
519
def generate_video(img2vid_image, vid2vid_vid, task_mode, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2,
                   sd_steps, seed, text_guide_scale, audio_guide_scale, mode_selector, tts_text, resolution_select, human1_voice, human2_voice):
    """Gradio click handler: prepare audio conditions, run the pipeline, save the video.

    Defined inside ``run_graio_demo``; it reads ``args``, ``wan_i2v``,
    ``wav2vec_feature_extractor`` and ``audio_encoder`` from that scope.

    The output name is computed per call and ``args`` is never modified, so
    consecutive generations no longer overwrite the first output file when
    ``--save_file`` was not given.

    Returns:
        Path of the saved ``.mp4`` file.
    """
    input_data = {}
    input_data["prompt"] = img2vid_prompt
    if task_mode == 'VideoDubbing':
        input_data["cond_video"] = vid2vid_vid
    else:
        input_data["cond_video"] = img2vid_image

    # Collect the raw audio sources (local files) or the TTS request.
    person = {}
    if mode_selector == "Single Person(Local File)":
        person['person1'] = img2vid_audio_1
    elif mode_selector == "Single Person(TTS)":
        input_data["tts_audio"] = {
            'text': tts_text,
            'human1_voice': human1_voice,
        }
    elif mode_selector == "Multi Person(Local File, audio add)":
        person['person1'] = img2vid_audio_1
        person['person2'] = img2vid_audio_2
        input_data["audio_type"] = 'add'
    elif mode_selector == "Multi Person(Local File, audio parallel)":
        person['person1'] = img2vid_audio_1
        person['person2'] = img2vid_audio_2
        input_data["audio_type"] = 'para'
    else:
        input_data["tts_audio"] = {
            'text': tts_text,
            'human1_voice': human1_voice,
            'human2_voice': human2_voice,
        }

    input_data["cond_audio"] = person

    # Turn the audio sources into saved embeddings; the pipeline is handed paths.
    if 'Local File' in mode_selector:
        if len(input_data['cond_audio']) == 2:
            new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
            audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
            audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
            emb1_path = os.path.join(args.audio_save_dir, '1.pt')
            emb2_path = os.path.join(args.audio_save_dir, '2.pt')
            sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
            sf.write(sum_audio, sum_human_speechs, 16000)
            torch.save(audio_embedding_1, emb1_path)
            torch.save(audio_embedding_2, emb2_path)
            input_data['cond_audio']['person1'] = emb1_path
            input_data['cond_audio']['person2'] = emb2_path
            input_data['video_audio'] = sum_audio
        elif len(input_data['cond_audio']) == 1:
            human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
            audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
            emb_path = os.path.join(args.audio_save_dir, '1.pt')
            sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
            sf.write(sum_audio, human_speech, 16000)
            torch.save(audio_embedding, emb_path)
            input_data['cond_audio']['person1'] = emb_path
            input_data['video_audio'] = sum_audio
    elif 'TTS' in mode_selector:
        if 'human2_voice' not in input_data['tts_audio'].keys():
            new_human_speech1, sum_audio = process_tts_single(input_data['tts_audio']['text'], args.audio_save_dir, input_data['tts_audio']['human1_voice'])
            audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
            emb1_path = os.path.join(args.audio_save_dir, '1.pt')
            torch.save(audio_embedding_1, emb1_path)
            input_data['cond_audio']['person1'] = emb1_path
            input_data['video_audio'] = sum_audio
        else:
            new_human_speech1, new_human_speech2, sum_audio = process_tts_multi(input_data['tts_audio']['text'], args.audio_save_dir, input_data['tts_audio']['human1_voice'], input_data['tts_audio']['human2_voice'])
            audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
            audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
            emb1_path = os.path.join(args.audio_save_dir, '1.pt')
            emb2_path = os.path.join(args.audio_save_dir, '2.pt')
            torch.save(audio_embedding_1, emb1_path)
            torch.save(audio_embedding_2, emb2_path)
            input_data['cond_audio']['person1'] = emb1_path
            input_data['cond_audio']['person2'] = emb2_path
            input_data['video_audio'] = sum_audio

    logging.info("Generating video ...")
    # NOTE(review): args.sample_shift was defaulted from args.size at startup,
    # not from resolution_select; confirm that is intended when the UI selects
    # a different resolution than --size.
    video = wan_i2v.generate_infinitetalk(
        input_data,
        size_buckget=resolution_select,
        motion_frame=args.motion_frame,
        frame_num=args.frame_num,
        shift=args.sample_shift,
        sampling_steps=sd_steps,
        text_guide_scale=text_guide_scale,
        audio_guide_scale=audio_guide_scale,
        seed=seed,
        n_prompt=n_prompt,
        offload_model=args.offload_model,
        max_frames_num=args.frame_num if args.mode == 'clip' else 1000,
        color_correction_strength=args.color_correction_strength,
        extra_args=args,
    )

    # Use a local name instead of writing back to args.save_file: storing the
    # generated name on args made every later call reuse the first file.
    save_file = args.save_file
    if save_file is None:
        formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        formatted_prompt = input_data['prompt'].replace(" ", "_").replace("/",
                                                                          "_")[:50]
        save_file = f"{args.task}_{args.size.replace('*','x') if sys.platform=='win32' else args.size}_{args.ulysses_size}_{args.ring_size}_{formatted_prompt}_{formatted_time}"

    logging.info(f"Saving generated video to {save_file}.mp4")
    save_video_ffmpeg(video, save_file, [input_data['video_audio']], high_quality_save=False)
    logging.info("Finished.")

    return save_file + '.mp4'
649
+
650
def toggle_audio_mode(mode):
    """Return updated [audio 1, audio 2, TTS textbox] widgets for the chosen mode.

    TTS modes show only the textbox, single-person file mode shows only the
    first audio input, and the remaining modes show both audio inputs.
    """
    if 'TTS' in mode:
        show_audio_1, show_audio_2, show_text = False, False, True
    elif 'Single' in mode:
        show_audio_1, show_audio_2, show_text = True, False, False
    else:
        show_audio_1, show_audio_2, show_text = True, True, False

    # Each widget is interactive exactly when it is visible.
    return [
        gr.Audio(visible=show_audio_1, interactive=show_audio_1),
        gr.Audio(visible=show_audio_2, interactive=show_audio_2),
        gr.Textbox(visible=show_text, interactive=show_text),
    ]
669
+
670
def show_upload(mode):
    """Return visibility updates for (image upload, video upload) given the task mode.

    The image input is shown only for "SingleImageDriven"; otherwise the video
    input is shown instead.
    """
    image_visible = mode == "SingleImageDriven"
    return gr.update(visible=image_visible), gr.update(visible=not image_visible)
675
+
676
+
677
+ with gr.Blocks() as demo:
678
+
679
+ gr.Markdown("""
680
+ <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
681
+ MeiGen-InfiniteTalk
682
+ </div>
683
+ <div style="text-align: center; font-size: 16px; font-weight: normal; margin-bottom: 20px;">
684
+ InfiniteTalk: Audio-driven Video Generation for Spare-Frame Video Dubbing.
685
+ </div>
686
+ <div style="display: flex; justify-content: center; gap: 10px; flex-wrap: wrap;">
687
+ <a href=''><img src='https://img.shields.io/badge/Project-Page-blue'></a>
688
+ <a href=''><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
689
+ <a href=''><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
690
+ </div>
691
+
692
+
693
+ """)
694
+
695
+ with gr.Row():
696
+ with gr.Column(scale=1):
697
+ task_mode = gr.Radio(
698
+ choices=["SingleImageDriven", "VideoDubbing"],
699
+ label="Choose SingleImageDriven task or VideoDubbing task",
700
+ value="VideoDubbing"
701
+ )
702
+ vid2vid_vid = gr.Video(
703
+ label="Upload Input Video",
704
+ visible=True)
705
+ img2vid_image = gr.Image(
706
+ type="filepath",
707
+ label="Upload Input Image",
708
+ elem_id="image_upload",
709
+ visible=False
710
+ )
711
+ img2vid_prompt = gr.Textbox(
712
+ label="Prompt",
713
+ placeholder="Describe the video you want to generate",
714
+ )
715
+ task_mode.change(
716
+ fn=show_upload,
717
+ inputs=task_mode,
718
+ outputs=[img2vid_image, vid2vid_vid]
719
+ )
720
+
721
+
722
+ with gr.Accordion("Audio Options", open=True):
723
+ mode_selector = gr.Radio(
724
+ choices=["Single Person(Local File)", "Single Person(TTS)", "Multi Person(Local File, audio add)", "Multi Person(Local File, audio parallel)", "Multi Person(TTS)"],
725
+ label="Select person and audio mode.",
726
+ value="Single Person(Local File)"
727
+ )
728
+ resolution_select = gr.Radio(
729
+ choices=["infinitetalk-480", "infinitetalk-720"],
730
+ label="Select resolution.",
731
+ value="infinitetalk-480"
732
+ )
733
+ img2vid_audio_1 = gr.Audio(label="Conditioning Audio for speaker 1", type="filepath", visible=True)
734
+ img2vid_audio_2 = gr.Audio(label="Conditioning Audio for speaker 2", type="filepath", visible=False)
735
+ tts_text = gr.Textbox(
736
+ label="Text for TTS",
737
+ placeholder="Refer to the format in the examples",
738
+ visible=False,
739
+ interactive=False
740
+ )
741
+ mode_selector.change(
742
+ fn=toggle_audio_mode,
743
+ inputs=mode_selector,
744
+ outputs=[img2vid_audio_1, img2vid_audio_2, tts_text]
745
+ )
746
+
747
+ with gr.Accordion("Advanced Options", open=False):
748
+ with gr.Row():
749
+ sd_steps = gr.Slider(
750
+ label="Diffusion steps",
751
+ minimum=1,
752
+ maximum=1000,
753
+ value=8,
754
+ step=1)
755
+ seed = gr.Slider(
756
+ label="Seed",
757
+ minimum=-1,
758
+ maximum=2147483647,
759
+ step=1,
760
+ value=42)
761
+ with gr.Row():
762
+ text_guide_scale = gr.Slider(
763
+ label="Text Guide scale",
764
+ minimum=0,
765
+ maximum=20,
766
+ value=1.0,
767
+ step=1)
768
+ audio_guide_scale = gr.Slider(
769
+ label="Audio Guide scale",
770
+ minimum=0,
771
+ maximum=20,
772
+ value=2.0,
773
+ step=1)
774
+ with gr.Row():
775
+ human1_voice = gr.Textbox(
776
+ label="Voice for the left person",
777
+ value="weights/Kokoro-82M/voices/am_adam.pt",
778
+ )
779
+ human2_voice = gr.Textbox(
780
+ label="Voice for right person",
781
+ value="weights/Kokoro-82M/voices/af_heart.pt"
782
+ )
783
+ # with gr.Row():
784
+ n_prompt = gr.Textbox(
785
+ label="Negative Prompt",
786
+ placeholder="Describe the negative prompt you want to add",
787
+ value="bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
788
+ )
789
+
790
+ run_i2v_button = gr.Button("Generate Video")
791
+
792
+ with gr.Column(scale=2):
793
+ result_gallery = gr.Video(
794
+ label='Generated Video', interactive=False, height=600, )
795
+
796
+ gr.Examples(
797
+ examples = [
798
+ ['SingleImageDriven', 'examples/single/ref_image.png', None, "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.", "Single Person(Local File)", "examples/single/1.wav", None, None],
799
+ ['VideoDubbing', None, 'examples/single/ref_video.mp4', "A man is talking", "Single Person(Local File)", "examples/single/1.wav", None, None],
800
+
801
+ ],
802
+ inputs = [task_mode, img2vid_image, vid2vid_vid, img2vid_prompt, mode_selector, img2vid_audio_1, img2vid_audio_2, tts_text],
803
+ )
804
+
805
+
806
+ run_i2v_button.click(
807
+ fn=generate_video,
808
+ inputs=[img2vid_image, vid2vid_vid, task_mode, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2,sd_steps, seed, text_guide_scale, audio_guide_scale, mode_selector, tts_text, resolution_select, human1_voice, human2_voice],
809
+ outputs=[result_gallery],
810
+ )
811
+ demo.launch(server_name="0.0.0.0", debug=True, server_port=8418)
812
+
813
+
814
+
815
+
816
+ if __name__ == "__main__":
817
+ args = _parse_args()
818
+ run_graio_demo(args)
819
+
assets/InfiniteTalk_paper.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcefdbb788a7f10aa941adf642a8f511fbb99b874e8dd271b9067caefa6b41b2
3
+ size 13015738
assets/logo.jpg ADDED
assets/logo2.jpg ADDED

Git LFS Details

  • SHA256: 37cc96290dede43f9bac0d0ab1f6cae20cc431eaa5bf1908a9185720dfca2a3c
  • Pointer size: 131 Bytes
  • Size of remote file: 162 kB
assets/pipeline.png ADDED

Git LFS Details

  • SHA256: 089cac2e05d0858949ec1cfdf0241274790b0eafd9318f8b5810f5ef6008ee72
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
examples/multi/1-man.WAV ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d304fd88850d6673649d1844db2894e03bf5a775123048eebcb01ab3b79bff5e
3
+ size 1503276
examples/multi/1-woman.WAV ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e1ebd7ae1587ebc7f0986f8b61e7fcc99c6fb57fbb15ab9373968e701afc8bf
3
+ size 1503276
examples/multi/ref_img.png ADDED

Git LFS Details

  • SHA256: 210b89972b810e760d15828323186771a56f1220e806b09fe06b0584a9f55537
  • Pointer size: 132 Bytes
  • Size of remote file: 3 MB
examples/multi_example_image.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "prompt": "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.",
3
+ "cond_video": "examples/multi/ref_img.png",
4
+ "audio_type": "para",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/1-man.WAV",
7
+ "person2": "examples/multi/1-woman.WAV"
8
+ }
9
+ }
examples/single/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba2733897f561f747e6508734bff4eeee29d0a73638e5c39c0c0b806701d4e8b
3
+ size 1888320
examples/single/ref_image.png ADDED

Git LFS Details

  • SHA256: 5a47d458721c4a7419d3c8ef9a5c3d89cf161ab31de9451b9bb4f321a37bc705
  • Pointer size: 132 Bytes
  • Size of remote file: 2.79 MB
examples/single/ref_video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cb07cbfa63576d8b06eb2954cc56d1b089764f0a9428da867348810d6cb9071
3
+ size 843790
examples/single_example_image.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "prompt": "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.",
3
+ "cond_video": "examples/single/ref_image.png",
4
+ "cond_audio": {
5
+ "person1": "examples/single/1.wav"
6
+ }
7
+ }
examples/single_example_video.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "prompt": "A man is talking",
3
+ "cond_video": "examples/single/ref_video.mp4",
4
+ "cond_audio": {
5
+ "person1": "examples/single/1.wav"
6
+ }
7
+ }
generate_infinitetalk.py ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import argparse
3
+ import logging
4
+ import os
5
+ import sys
6
+ import json
7
+ import warnings
8
+ from datetime import datetime
9
+
10
+ warnings.filterwarnings('ignore')
11
+
12
+ import random
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+ from PIL import Image
17
+ import subprocess
18
+
19
+ import wan
20
+ from wan.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS
21
+ from wan.utils.utils import str2bool, is_video, split_wav_librosa
22
+ from wan.utils.multitalk_utils import save_video_ffmpeg
23
+ from kokoro import KPipeline
24
+ from transformers import Wav2Vec2FeatureExtractor
25
+ from src.audio_analysis.wav2vec2 import Wav2Vec2Model
26
+ from wan.utils.segvideo import shot_detect
27
+
28
+
29
+ import librosa
30
+ import pyloudnorm as pyln
31
+ import numpy as np
32
+ from einops import rearrange
33
+ import soundfile as sf
34
+ import re
35
+
36
+
37
+ def _validate_args(args):
38
+ # Basic check
39
+ assert args.ckpt_dir is not None, "Please specify the checkpoint directory."
40
+ assert args.task in WAN_CONFIGS, f"Unsupport task: {args.task}"
41
+
42
+ # The default sampling steps are 40 for image-to-video tasks and 50 for text-to-video tasks.
43
+ if args.sample_steps is None:
44
+ args.sample_steps = 40
45
+
46
+ if args.sample_shift is None:
47
+ if args.size == 'infinitetalk-480':
48
+ args.sample_shift = 7
49
+ elif args.size == 'infinitetalk-720':
50
+ args.sample_shift = 11
51
+ else:
52
+ raise NotImplementedError(f'Not supported size')
53
+
54
+ args.base_seed = args.base_seed if args.base_seed >= 0 else random.randint(
55
+ 0, 99999999)
56
+ # Size check
57
+ assert args.size in SUPPORTED_SIZES[
58
+ args.
59
+ task], f"Unsupport size {args.size} for task {args.task}, supported sizes are: {', '.join(SUPPORTED_SIZES[args.task])}"
60
+
61
+
62
def _parse_args():
    """Build the command-line parser, parse the arguments and validate them.

    Declares every option of the InfiniteTalk generation script, parses the
    process arguments, then passes the namespace through ``_validate_args``
    (which also fills in defaults for ``sample_steps`` / ``sample_shift`` and
    resolves a negative ``base_seed``).

    Returns:
        argparse.Namespace: the parsed and validated arguments.
    """
    parser = argparse.ArgumentParser(
        description="Generate a image or video from a text prompt or image using Wan"
    )
    # --- task / output geometry ---
    parser.add_argument(
        "--task",
        type=str,
        default="infinitetalk-14B",
        choices=list(WAN_CONFIGS.keys()),
        help="The task to run.")
    parser.add_argument(
        "--size",
        type=str,
        default="infinitetalk-480",
        choices=list(SIZE_CONFIGS.keys()),
        help="The buckget size of the generated video. The aspect ratio of the output video will follow that of the input image."
    )
    parser.add_argument(
        "--frame_num",
        type=int,
        default=81,
        help="How many frames to be generated in one clip. The number should be 4n+1"
    )
    # --- checkpoint locations ---
    parser.add_argument(
        "--ckpt_dir",
        type=str,
        default=None,
        help="The path to the Wan checkpoint directory.")
    parser.add_argument(
        "--infinitetalk_dir",
        type=str,
        default=None,
        help="The path to the InfiniteTalk checkpoint directory.")
    parser.add_argument(
        "--quant_dir",
        type=str,
        default=None,
        help="The path to the Wan quant checkpoint directory.")
    parser.add_argument(
        "--wav2vec_dir",
        type=str,
        default=None,
        help="The path to the wav2vec checkpoint directory.")
    parser.add_argument(
        "--dit_path",
        type=str,
        default=None,
        help="The path to the Wan checkpoint directory.")
    # --- LoRA ---
    parser.add_argument(
        "--lora_dir",
        type=str,
        nargs='+',
        default=None,
        help="The paths to the LoRA checkpoint files."
    )
    parser.add_argument(
        "--lora_scale",
        type=float,
        nargs='+',
        default=[1.2],
        help="Controls how much to influence the outputs with the LoRA parameters. Accepts multiple float values."
    )
    # --- memory / parallelism ---
    parser.add_argument(
        "--offload_model",
        type=str2bool,
        default=None,
        help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage."
    )
    parser.add_argument(
        "--ulysses_size",
        type=int,
        default=1,
        help="The size of the ulysses parallelism in DiT.")
    parser.add_argument(
        "--ring_size",
        type=int,
        default=1,
        help="The size of the ring attention parallelism in DiT.")
    parser.add_argument(
        "--t5_fsdp",
        action="store_true",
        default=False,
        help="Whether to use FSDP for T5.")
    parser.add_argument(
        "--t5_cpu",
        action="store_true",
        default=False,
        help="Whether to place T5 model on CPU.")
    parser.add_argument(
        "--dit_fsdp",
        action="store_true",
        default=False,
        help="Whether to use FSDP for DiT.")
    # --- inputs / outputs ---
    parser.add_argument(
        "--save_file",
        type=str,
        default=None,
        help="The file to save the generated image or video to.")
    parser.add_argument(
        "--audio_save_dir",
        type=str,
        default='save_audio',
        help="The path to save the audio embedding.")
    parser.add_argument(
        "--base_seed",
        type=int,
        default=42,
        help="The seed to use for generating the image or video.")
    parser.add_argument(
        "--input_json",
        type=str,
        default='examples.json',
        help="[meta file] The condition path to generate the video.")
    parser.add_argument(
        "--motion_frame",
        type=int,
        default=9,
        help="Driven frame length used in the mode of long video genration.")
    parser.add_argument(
        "--mode",
        type=str,
        default="clip",
        choices=['clip', 'streaming'],
        help="clip: generate one video chunk, streaming: long video generation")
    # --- sampling ---
    # sample_steps / sample_shift default to None here; _validate_args fills them in.
    parser.add_argument(
        "--sample_steps", type=int, default=None, help="The sampling steps.")
    parser.add_argument(
        "--sample_shift",
        type=float,
        default=None,
        help="Sampling shift factor for flow matching schedulers.")
    parser.add_argument(
        "--sample_text_guide_scale",
        type=float,
        default=5.0,
        help="Classifier free guidance scale for text control.")
    parser.add_argument(
        "--sample_audio_guide_scale",
        type=float,
        default=4.0,
        help="Classifier free guidance scale for audio control.")
    parser.add_argument(
        "--num_persistent_param_in_dit",
        type=int,
        default=None,
        required=False,
        help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required",
    )
    parser.add_argument(
        "--audio_mode",
        type=str,
        default="localfile",
        choices=['localfile', 'tts'],
        help="localfile: audio from local wav file, tts: audio from TTS")
    # --- optional acceleration / guidance tweaks ---
    parser.add_argument(
        "--use_teacache",
        action="store_true",
        default=False,
        help="Enable teacache for video generation."
    )
    parser.add_argument(
        "--teacache_thresh",
        type=float,
        default=0.2,
        help="Threshold for teacache."
    )
    parser.add_argument(
        "--use_apg",
        action="store_true",
        default=False,
        help="Enable adaptive projected guidance for video generation (APG)."
    )
    parser.add_argument(
        "--apg_momentum",
        type=float,
        default=-0.75,
        help="Momentum used in adaptive projected guidance (APG)."
    )
    parser.add_argument(
        "--apg_norm_threshold",
        type=float,
        default=55,
        help="Norm threshold used in adaptive projected guidance (APG)."
    )
    parser.add_argument(
        "--color_correction_strength",
        type=float,
        default=1.0,
        help="strength for color correction [0.0 -- 1.0]."
    )
    parser.add_argument(
        "--scene_seg",
        action="store_true",
        default=False,
        help="Enable scene segmentation for input video."
    )
    # NOTE(review): the help text restricts the value to 'int8'/'fp8', but no
    # `choices` is declared, so argparse accepts any string here.
    parser.add_argument(
        "--quant",
        type=str,
        default=None,
        help="Quantization type, must be 'int8' or 'fp8'."
    )

    args = parser.parse_args()

    _validate_args(args)

    return args
270
+
271
def custom_init(device, wav2vec):
    """Load the wav2vec2 audio encoder and its feature extractor from local files.

    Args:
        device: device the audio encoder is moved to.
        wav2vec: checkpoint location passed to both ``from_pretrained`` calls
            (loaded with ``local_files_only=True``).

    Returns:
        tuple: ``(wav2vec_feature_extractor, audio_encoder)``.
    """
    load_kwargs = {"local_files_only": True}

    encoder = Wav2Vec2Model.from_pretrained(wav2vec, **load_kwargs)
    encoder = encoder.to(device)
    # The encoder's convolutional feature extractor is frozen after loading.
    encoder.feature_extractor._freeze_parameters()

    extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec, **load_kwargs)
    return extractor, encoder
276
+
277
def loudness_norm(audio_array, sr=16000, lufs=-23):
    """Normalize ``audio_array`` to the target integrated loudness ``lufs``.

    Args:
        audio_array: audio samples to normalize.
        sr: sample rate used to build the loudness meter.
        lufs: target loudness handed to ``pyln.normalize.loudness``.

    Returns:
        The normalized audio, or ``audio_array`` unchanged when the measured
        loudness has a magnitude above 100.
    """
    measured = pyln.Meter(sr).integrated_loudness(audio_array)
    # Presumably guards against degenerate readings (e.g. on silent input);
    # such audio is returned untouched.
    if abs(measured) > 100:
        return audio_array
    return pyln.normalize.loudness(audio_array, measured, lufs)
284
+
285
def audio_prepare_multi(left_path, right_path, audio_type, sample_rate=16000):
    """Load two speakers' audio and lay them out on a shared timeline.

    Args:
        left_path: audio (or video) path for speaker 1, or the string
            ``'None'`` when that speaker has no audio.
        right_path: same for speaker 2.
        audio_type: ``'para'`` to play both tracks in parallel, ``'add'`` to
            play speaker 1 first and speaker 2 afterwards.
        sample_rate: sample rate forwarded to ``audio_prepare_single``.

    Returns:
        tuple: ``(speech1, speech2, speech1 + speech2)``. The two tracks always
        have the same length: a missing speaker is replaced by silence, and in
        ``'para'`` mode the shorter track is padded with trailing silence.

    Raises:
        ValueError: if both paths are ``'None'`` or ``audio_type`` is unknown.
    """
    if audio_type not in ('para', 'add'):
        # Previously this fell through and failed with an UnboundLocalError.
        raise ValueError(
            f"Unsupported audio_type {audio_type!r}; expected 'para' or 'add'.")
    has_left = left_path != 'None'
    has_right = right_path != 'None'
    if not (has_left or has_right):
        # Previously this tried to load a file literally named "None".
        raise ValueError("At least one of left_path / right_path must be an audio file.")

    if has_left and has_right:
        human_speech_array1 = audio_prepare_single(left_path, sample_rate)
        human_speech_array2 = audio_prepare_single(right_path, sample_rate)
    elif has_right:
        human_speech_array2 = audio_prepare_single(right_path, sample_rate)
        human_speech_array1 = np.zeros(human_speech_array2.shape[0])
    else:
        human_speech_array1 = audio_prepare_single(left_path, sample_rate)
        human_speech_array2 = np.zeros(human_speech_array1.shape[0])

    len1 = human_speech_array1.shape[0]
    len2 = human_speech_array2.shape[0]

    if audio_type == 'para':
        new_human_speech1 = human_speech_array1
        new_human_speech2 = human_speech_array2
        # Pad the shorter track with silence so the tracks can be summed;
        # equal-length inputs are returned untouched.
        if len1 < len2:
            new_human_speech1 = np.pad(human_speech_array1, (0, len2 - len1))
        elif len2 < len1:
            new_human_speech2 = np.pad(human_speech_array2, (0, len1 - len2))
    else:  # 'add': speaker 1 talks first, speaker 2 follows
        new_human_speech1 = np.concatenate([human_speech_array1, np.zeros(len2)])
        new_human_speech2 = np.concatenate([np.zeros(len1), human_speech_array2])

    sum_human_speechs = new_human_speech1 + new_human_speech2
    return new_human_speech1, new_human_speech2, sum_human_speechs
305
+
306
+ def _init_logging(rank):
307
+ # logging
308
+ if rank == 0:
309
+ # set format
310
+ logging.basicConfig(
311
+ level=logging.INFO,
312
+ format="[%(asctime)s] %(levelname)s: %(message)s",
313
+ handlers=[logging.StreamHandler(stream=sys.stdout)])
314
+ else:
315
+ logging.basicConfig(level=logging.ERROR)
316
+
317
def get_embedding(speech_array, wav2vec_feature_extractor, audio_encoder, sr=16000, device='cpu'):
    """Encode a speech waveform into stacked wav2vec hidden-state embeddings.

    Args:
        speech_array: audio samples; its length divided by ``sr`` is taken as
            the duration in seconds.
        wav2vec_feature_extractor: callable producing ``input_values`` for the
            encoder.
        audio_encoder: encoder called with ``seq_len`` and
            ``output_hidden_states=True``.
        sr: sample rate of ``speech_array``.
        device: device the input features are moved to.

    Returns:
        A CPU tensor built from every hidden state except the first, or
        ``None`` when the encoder output is empty.
    """
    # Target sequence length: audio duration times 25 (video assumed to be 25 fps).
    seq_len = int(len(speech_array) / sr * 25)

    # wav2vec_feature_extractor
    input_values = wav2vec_feature_extractor(speech_array, sampling_rate=sr).input_values
    features = torch.from_numpy(np.squeeze(input_values)).float()
    features = features.to(device=device).unsqueeze(0)

    # audio encoder
    with torch.no_grad():
        embeddings = audio_encoder(features, seq_len=seq_len, output_hidden_states=True)

    if len(embeddings) == 0:
        print("Fail to extract audio embedding")
        return None

    # Stack hidden states 1..N along a new dim 1, drop the leading dim, then
    # swap the first two axes.
    stacked = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
    swapped = rearrange(stacked, "b s d -> s b d")
    return swapped.cpu().detach()
341
+
342
def extract_audio_from_video(filename, sample_rate):
    """Extract a video's audio track with ffmpeg and return it loudness-normalized.

    The audio is decoded to a temporary 16-bit PCM wav, loaded with librosa at
    ``sample_rate``, normalized with ``loudness_norm``, and the temporary file
    is removed afterwards.

    Args:
        filename: path of the input video.
        sample_rate: target sample rate; when falsy, ffmpeg decodes at 16 kHz
            and librosa keeps that rate.

    Returns:
        The normalized waveform loaded by librosa.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import tempfile

    # Use a unique temporary file. The previous "<basename>.wav" in the current
    # directory could overwrite (and then delete) an unrelated user file and
    # collided between concurrent runs.
    fd, raw_audio_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    try:
        ffmpeg_command = [
            "ffmpeg",
            "-y",  # overwrite the empty placeholder created by mkstemp
            "-i",
            str(filename),
            "-vn",
            "-acodec",
            "pcm_s16le",
            "-ar",
            # Decode directly at the requested rate instead of a fixed 16 kHz.
            str(sample_rate) if sample_rate else "16000",
            "-ac",
            "2",
            str(raw_audio_path),
        ]
        subprocess.run(ffmpeg_command, check=True)
        human_speech_array, sr = librosa.load(raw_audio_path, sr=sample_rate)
    finally:
        # Clean up even when ffmpeg or librosa fails.
        if os.path.exists(raw_audio_path):
            os.remove(raw_audio_path)

    human_speech_array = loudness_norm(human_speech_array, sr)
    return human_speech_array
364
+
365
def audio_prepare_single(audio_path, sample_rate=16000):
    """Load one speaker's audio from an audio or video file, loudness-normalized.

    Paths ending in ``.mp4``, ``.mov``, ``.avi`` or ``.mkv`` (case-insensitive)
    go through ``extract_audio_from_video``; anything else is loaded directly
    with librosa and passed through ``loudness_norm``.
    """
    video_extensions = ['.mp4', '.mov', '.avi', '.mkv']
    _, extension = os.path.splitext(audio_path)

    if extension.lower() in video_extensions:
        return extract_audio_from_video(audio_path, sample_rate)

    waveform, loaded_sr = librosa.load(audio_path, sr=sample_rate)
    return loudness_norm(waveform, loaded_sr)
374
+
375
def process_tts_single(text, save_dir, voice1):
    """Synthesize ``text`` with one voice and save it as ``<save_dir>/s1.wav``.

    Args:
        text: text to speak; the pipeline splits it on runs of newlines.
        save_dir: directory the wav file is written into.
        voice1: path of the voice tensor, loaded with ``torch.load``.

    Returns:
        tuple: ``(waveform, wav_path)`` where ``waveform`` is the saved file
        re-loaded by librosa at 16 kHz.
    """
    tts = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
    voice = torch.load(voice1, weights_only=True)

    # Collect the audio of every chunk the pipeline yields.
    chunks = [
        audio
        for _, _, audio in tts(text, voice=voice, speed=1, split_pattern=r'\n+')
    ]
    full_audio = torch.concat(chunks, dim=0)

    wav_path = f'{save_dir}/s1.wav'
    sf.write(wav_path, full_audio, 24000)  # written at 24 kHz
    resampled, _ = librosa.load(wav_path, sr=16000)
    return resampled, wav_path
395
+
396
+
397
+
398
def process_tts_multi(text, save_dir, voice1, voice2):
    """Synthesize a two-speaker dialogue and save per-speaker and mixed tracks.

    ``text`` is a script whose turns are tagged ``(s1) ...`` / ``(s2) ...``.
    Each turn is spoken with the matching voice while the other speaker's
    track receives silence of equal length, so both tracks stay aligned.
    Turns tagged with any other speaker number are ignored.

    Args:
        text: tagged dialogue script.
        save_dir: directory receiving ``s1.wav``, ``s2.wav`` and ``sum.wav``
            (all written at 24 kHz).
        voice1: path of speaker 1's voice tensor.
        voice2: path of speaker 2's voice tensor.

    Returns:
        tuple: ``(s1, s2, save_path_sum)`` where ``s1``/``s2`` are the
        per-speaker tracks re-loaded by librosa at 16 kHz and
        ``save_path_sum`` is the path of the mixed wav.

    Raises:
        ValueError: if ``text`` contains no ``(s1)`` / ``(s2)`` turn, or none
            of the turns produced audio.
    """
    pattern = r'\(s(\d+)\)\s*(.*?)(?=\s*\(s\d+\)|$)'
    matches = re.findall(pattern, text, re.DOTALL)
    turns = [(speaker, content) for speaker, content in matches if speaker in ('1', '2')]
    if not turns:
        # Fail early with a clear message instead of an opaque torch.concat
        # error on an empty list further down.
        raise ValueError("No '(s1)' or '(s2)' speaker turns found in the TTS text.")

    pipeline = KPipeline(lang_code='a', repo_id='weights/Kokoro-82M')
    voice_paths = {'1': voice1, '2': voice2}
    voices = {}  # voice tensors are loaded lazily, once per speaker
    tracks = {'1': [], '2': []}

    for speaker, content in turns:
        if speaker not in voices:
            voices[speaker] = torch.load(voice_paths[speaker], weights_only=True)
        generator = pipeline(
            content, voice=voices[speaker],
            speed=1, split_pattern=r'\n+'
        )
        chunks = [audio for _, _, audio in generator]
        if not chunks:
            # An empty turn produces no audio; skip it rather than crash.
            continue
        audios = torch.concat(chunks, dim=0)
        other = '2' if speaker == '1' else '1'
        tracks[speaker].append(audios)
        tracks[other].append(torch.zeros_like(audios))

    if not tracks['1']:
        raise ValueError("The TTS text did not produce any audio.")

    s1_sentences = torch.concat(tracks['1'], dim=0)
    s2_sentences = torch.concat(tracks['2'], dim=0)
    sum_sentences = s1_sentences + s2_sentences
    save_path1 = f'{save_dir}/s1.wav'
    save_path2 = f'{save_dir}/s2.wav'
    save_path_sum = f'{save_dir}/sum.wav'
    sf.write(save_path1, s1_sentences, 24000)  # save each audio file
    sf.write(save_path2, s2_sentences, 24000)
    sf.write(save_path_sum, sum_sentences, 24000)

    s1, _ = librosa.load(save_path1, sr=16000)
    s2, _ = librosa.load(save_path2, sr=16000)
    return s1, s2, save_path_sum
446
+
447
def generate(args):
    """Run the InfiniteTalk generation job described by ``args``.

    Steps, in order:
      1. Read RANK / WORLD_SIZE / LOCAL_RANK from the environment, set up
         logging and, when ``WORLD_SIZE > 1``, the NCCL process group (plus
         xfuser sequence parallelism when ulysses/ring sizes are > 1).
      2. Build ``wan.InfiniteTalkPipeline`` from the CLI options.
      3. Load the condition JSON (``args.input_json``) and the wav2vec models.
      4. Optionally split the conditioning video into shots, with the audio
         split at the same times.
      5. Write the full-length audio to ``sum_all.wav``; it is muxed into the
         final video.
      6. For each clip: compute and save audio embeddings, then call
         ``generate_infinitetalk``.
      7. On rank 0, concatenate the clips and save the video with ffmpeg.

    Args:
        args: namespace produced by ``_parse_args``. ``offload_model``,
            ``base_seed``, ``audio_save_dir`` and ``save_file`` may be
            rewritten in place.
    """
    rank = int(os.getenv("RANK", 0))
    world_size = int(os.getenv("WORLD_SIZE", 1))
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    device = local_rank
    _init_logging(rank)

    # Offloading defaults to on for single-process runs and off otherwise.
    if args.offload_model is None:
        args.offload_model = False if world_size > 1 else True
        logging.info(
            f"offload_model is not specified, set to {args.offload_model}.")
    if world_size > 1:
        torch.cuda.set_device(local_rank)
        dist.init_process_group(
            backend="nccl",
            init_method="env://",
            rank=rank,
            world_size=world_size)
    else:
        assert not (
            args.t5_fsdp or args.dit_fsdp
        ), f"t5_fsdp and dit_fsdp are not supported in non-distributed environments."
        assert not (
            args.ulysses_size > 1 or args.ring_size > 1
        ), f"context parallel are not supported in non-distributed environments."

    if args.ulysses_size > 1 or args.ring_size > 1:
        assert args.ulysses_size * args.ring_size == world_size, f"The number of ulysses_size and ring_size should be equal to the world size."
        # Imported lazily: xfuser is only needed for sequence-parallel runs.
        from xfuser.core.distributed import (
            init_distributed_environment,
            initialize_model_parallel,
        )
        init_distributed_environment(
            rank=dist.get_rank(), world_size=dist.get_world_size())

        initialize_model_parallel(
            sequence_parallel_degree=dist.get_world_size(),
            ring_degree=args.ring_size,
            ulysses_degree=args.ulysses_size,
        )

    # TODO: use prompt refine
    # if args.use_prompt_extend:
    #     if args.prompt_extend_method == "dashscope":
    #         prompt_expander = DashScopePromptExpander(
    #             model_name=args.prompt_extend_model,
    #             is_vl="i2v" in args.task or "flf2v" in args.task)
    #     elif args.prompt_extend_method == "local_qwen":
    #         prompt_expander = QwenPromptExpander(
    #             model_name=args.prompt_extend_model,
    #             is_vl="i2v" in args.task,
    #             device=rank)
    #     else:
    #         raise NotImplementedError(
    #             f"Unsupport prompt_extend_method: {args.prompt_extend_method}")

    cfg = WAN_CONFIGS[args.task]
    if args.ulysses_size > 1:
        assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."

    logging.info(f"Generation job args: {args}")
    logging.info(f"Generation model config: {cfg}")

    # Every rank must sample with the same seed: broadcast rank 0's value.
    if dist.is_initialized():
        base_seed = [args.base_seed] if rank == 0 else [None]
        dist.broadcast_object_list(base_seed, src=0)
        args.base_seed = base_seed[0]

    assert args.task == "infinitetalk-14B", 'You should choose infinitetalk in args.task.'


    logging.info("Creating infinitetalk pipeline.")
    wan_i2v = wan.InfiniteTalkPipeline(
        config=cfg,
        checkpoint_dir=args.ckpt_dir,
        quant_dir=args.quant_dir,
        device_id=device,
        rank=rank,
        t5_fsdp=args.t5_fsdp,
        dit_fsdp=args.dit_fsdp,
        use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
        t5_cpu=args.t5_cpu,
        lora_dir=args.lora_dir,
        lora_scales=args.lora_scale,
        quant=args.quant,
        dit_path=args.dit_path,
        infinitetalk_dir=args.infinitetalk_dir
    )
    if args.num_persistent_param_in_dit is not None:
        wan_i2v.vram_management = True
        wan_i2v.enable_vram_management(
            num_persistent_param_in_dit=args.num_persistent_param_in_dit
        )

    generated_list = []
    with open(args.input_json, 'r', encoding='utf-8') as f:
        input_data = json.load(f)

    # The audio models are created on the CPU.
    wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
    # Intermediate audio files go into a sub-directory named after the
    # conditioning file (basename up to its first dot).
    args.audio_save_dir = os.path.join(args.audio_save_dir, input_data['cond_video'].split('/')[-1].split('.')[0])
    os.makedirs(args.audio_save_dir,exist_ok=True)

    # conds_list holds parallel lists: [videos, person1 audios(, person2 audios)].
    # zip(*conds_list) below yields one tuple per clip.
    conds_list = []

    if args.scene_seg and is_video(input_data['cond_video']):
        time_list, cond_list = shot_detect(input_data['cond_video'], args.audio_save_dir)
        if len(time_list)==0:
            # No shot boundary reported: treat the whole input as one clip.
            conds_list.append([input_data['cond_video']])
            conds_list.append([input_data['cond_audio']['person1']])
            if len(input_data['cond_audio'])==2:
                conds_list.append([input_data['cond_audio']['person2']])
        else:
            # Split each speaker's audio using the same time list as the shots.
            audio1_list = split_wav_librosa(input_data['cond_audio']['person1'], time_list, args.audio_save_dir)
            conds_list.append(cond_list)
            conds_list.append(audio1_list)
            if len(input_data['cond_audio'])==2:
                audio2_list = split_wav_librosa(input_data['cond_audio']['person2'], time_list, args.audio_save_dir)
                conds_list.append(audio2_list)
    else:
        conds_list.append([input_data['cond_video']])
        conds_list.append([input_data['cond_audio']['person1']])
        if len(input_data['cond_audio'])==2:
            conds_list.append([input_data['cond_audio']['person2']])

    # Full-length audio for the final mux (two speakers are mixed first).
    if len(input_data['cond_audio'])==2:
        new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
        sum_audio = os.path.join(args.audio_save_dir, 'sum_all.wav')
        sf.write(sum_audio, sum_human_speechs, 16000)
        input_data['video_audio'] = sum_audio
    else:
        human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
        sum_audio = os.path.join(args.audio_save_dir, 'sum_all.wav')
        sf.write(sum_audio, human_speech, 16000)
        input_data['video_audio'] = sum_audio
    logging.info("Generating video ...")

    for idx, items in enumerate(zip(*conds_list)):
        print(items)
        input_clip = {}
        input_clip['prompt'] = input_data['prompt']
        input_clip['cond_video'] = items[0]

        if 'audio_type' in input_data:
            input_clip['audio_type'] = input_data['audio_type']
        if 'bbox' in input_data:
            input_clip['bbox'] = input_data['bbox']
        cond_audio = {}
        # NOTE(review): only the 'localfile' mode fills cond_audio here; with
        # audio_mode == 'tts' it stays empty in this function - confirm intended.
        if args.audio_mode=='localfile':
            if len(input_data['cond_audio'])==2:
                new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(items[1], items[2], input_data['audio_type'])
                audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
                audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
                # Embeddings reach the pipeline as file paths; the same file
                # names are reused (overwritten) for every clip.
                emb1_path = os.path.join(args.audio_save_dir, '1.pt')
                emb2_path = os.path.join(args.audio_save_dir, '2.pt')
                sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
                sf.write(sum_audio, sum_human_speechs, 16000)
                torch.save(audio_embedding_1, emb1_path)
                torch.save(audio_embedding_2, emb2_path)
                cond_audio['person1'] = emb1_path
                cond_audio['person2'] = emb2_path
                input_clip['video_audio'] = sum_audio
                # v_length is assigned but not read anywhere in this function.
                v_length = audio_embedding_1.shape[0]
            elif len(input_data['cond_audio'])==1:
                human_speech = audio_prepare_single(items[1])
                audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
                emb_path = os.path.join(args.audio_save_dir, '1.pt')
                sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
                sf.write(sum_audio, human_speech, 16000)
                torch.save(audio_embedding, emb_path)
                cond_audio['person1'] = emb_path
                input_clip['video_audio'] = sum_audio
                v_length = audio_embedding.shape[0]

        input_clip['cond_audio'] = cond_audio

        video = wan_i2v.generate_infinitetalk(
            input_clip,
            size_buckget=args.size,
            motion_frame=args.motion_frame,
            frame_num=args.frame_num,
            shift=args.sample_shift,
            sampling_steps=args.sample_steps,
            text_guide_scale=args.sample_text_guide_scale,
            audio_guide_scale=args.sample_audio_guide_scale,
            seed=args.base_seed,
            offload_model=args.offload_model,
            # 'clip' mode caps the output at one chunk; 'streaming' allows up to 1000 frames.
            max_frames_num=args.frame_num if args.mode == 'clip' else 1000,
            color_correction_strength = args.color_correction_strength,
            extra_args=args,
        )

        generated_list.append(video)

    # Only rank 0 writes the result.
    if rank == 0:

        if args.save_file is None:
            # Derive a file name from the task, size, parallel sizes, prompt and time.
            formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
            formatted_prompt = input_clip['prompt'].replace(" ", "_").replace("/",
                                                                        "_")[:50]
            args.save_file = f"{args.task}_{args.size.replace('*','x') if sys.platform=='win32' else args.size}_{args.ulysses_size}_{args.ring_size}_{formatted_prompt}_{formatted_time}"

        # Join all generated clips along dim 1 and mux with the full-length audio.
        sum_video = torch.cat(generated_list, dim=1)
        save_video_ffmpeg(sum_video, args.save_file, [input_data['video_audio']], high_quality_save=False)

        logging.info(f"Saving generated video to {args.save_file}.mp4")
    logging.info("Finished.")
653
+
654
+
655
# Script entry point: parse and validate the CLI arguments, then run generation.
if __name__ == "__main__":
    args = _parse_args()
    generate(args)
kokoro/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = '0.9.4'
2
+
3
+ from loguru import logger
4
+ import sys
5
+
6
# Remove default handler
logger.remove()

# Add custom handler with clean format including module and line number
logger.add(
    sys.stderr,
    format="<green>{time:HH:mm:ss}</green> | <cyan>{module:>16}:{line}</cyan> | <level>{level: >8}</level> | <level>{message}</level>",
    colorize=True,
    level="INFO" # "DEBUG" to enable logger.debug("message") and up prints
                 # "ERROR" to enable only logger.error("message") prints
                 # etc
)

# Disable before release or as needed
# With this call, messages logged from the "kokoro" package are dropped
# until logger.enable("kokoro") is called, regardless of the handler above.
logger.disable("kokoro")
21
+
22
+ from .model import KModel
23
+ from .pipeline import KPipeline
kokoro/__main__.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Kokoro TTS CLI
2
+ Example usage:
3
+ python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
4
+
5
+ echo "Bom dia mundo, como vão vocês" > text.txt
6
+ python3 -m kokoro -i text.txt -l p --voice pm_alex > audio.wav
7
+
8
+ Common issues:
9
+ pip not installed: `uv pip install pip`
10
+ (Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
11
+
12
+ espeak not installed: `apt-get install espeak-ng`
13
+ """
14
+
15
+ import argparse
16
+ import wave
17
+ from pathlib import Path
18
+ from typing import Generator, TYPE_CHECKING
19
+
20
+ import numpy as np
21
+ from loguru import logger
22
+
23
# Single-letter language codes accepted by the --language option.
# When --language is omitted, main() uses the first character of the voice
# name as the language code.
languages = [
    "a",  # American English
    "b",  # British English
    "h",  # Hindi
    "e",  # Spanish
    "f",  # French
    "i",  # Italian
    "p",  # Brazilian Portuguese
    "j",  # Japanese
    "z",  # Mandarin Chinese
]
34
+
35
+ if TYPE_CHECKING:
36
+ from kokoro import KPipeline
37
+
38
+
39
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily synthesize ``text`` and yield the pipeline's results.

    A warning is logged when ``voice`` does not start with the language code.
    The text is split on runs of newlines before synthesis.
    """
    # Imported inside the function rather than at module import time.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    synthesizer = KPipeline(lang_code=kokoro_language)
    results = synthesizer(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results
48
+
49
+
50
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and stream it into a mono 16-bit 24 kHz WAV file.

    Args:
        output_file: destination path of the WAV file.
        text: text to synthesize.
        kokoro_language: language code passed to the pipeline.
        voice: voice name passed to the pipeline.
        speed: speech speed multiplier.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            # Clip to [-1, 1] before scaling: a float outside the int16 range
            # overflows on the cast instead of saturating, which is audible
            # as loud clicks.
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            audio_bytes = (samples * 32767).astype(np.int16).tobytes()
            wav_file.writeframes(audio_bytes)
66
+
67
+
68
def main() -> None:
    """Command-line entry point: read text, synthesize it, write a WAV file.

    The text comes from ``--text``, from ``--input-file``, or from stdin when
    neither is given. The language defaults to the first character of the
    voice name.

    Raises:
        Exception: if both ``--text`` and ``--input-file`` are supplied.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()
    if args.debug:
        # logger.level("DEBUG") only looks the level up; it does not change
        # what the sinks emit. Install a DEBUG-level sink instead, and
        # re-enable the "kokoro" namespace, which the package disables on import.
        logger.remove()
        logger.add(sys.stderr, level="DEBUG")
        logger.enable("kokoro")
    logger.debug(args)

    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        raise Exception("You cannot specify both 'text' and 'input_file'")
    elif args.text:
        text = args.text
    elif args.input_file:
        file: Path = args.input_file
        text = file.read_text()
    else:
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        text = '\n'.join(sys.stdin)

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if not out_file.suffix == ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )
145
+
146
+
147
# Allows running the CLI with `python -m kokoro`.
if __name__ == "__main__":
    main()
kokoro/custom_stft.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from attr import attr
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
class CustomSTFT(nn.Module):
    """
    STFT/iSTFT without unfold/complex ops, using conv1d and conv_transpose1d.

    - forward STFT => Real-part conv1d + Imag-part conv1d
    - inverse STFT => Real-part conv_transpose1d + Imag-part conv_transpose1d + sum
    - avoids F.unfold, so easier to export to ONNX
    - uses replicate or constant padding for 'center=True' to approximate 'reflect'
      (reflect is not supported for dynamic shapes in ONNX)
    - when win_length < n_fft the window is zero-padded on BOTH sides (centred),
      which is what torch.stft / torch.istft do
    """

    def __init__(
        self,
        filter_length=800,
        hop_length=200,
        win_length=800,
        window="hann",
        center=True,
        pad_mode="replicate",  # or 'constant'
    ):
        """
        Precompute the windowed DFT / inverse-DFT kernels and register them as buffers.

        Args:
            filter_length: FFT size (stored as both ``filter_length`` and ``n_fft``).
            hop_length: stride between successive frames.
            win_length: length of the Hann window before padding/truncation to n_fft.
            window: only ``'hann'`` is accepted (asserted).
            center: if True, ``transform`` pads n_fft // 2 samples on each side and
                ``inverse`` crops the same amount.
            pad_mode: mode handed to ``F.pad`` for the centre padding.
        """
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = filter_length
        self.center = center
        self.pad_mode = pad_mode

        # Number of frequency bins for real-valued STFT with onesided=True
        self.freq_bins = self.n_fft // 2 + 1

        # Build window
        assert window == 'hann', window
        window_tensor = torch.hann_window(win_length, periodic=True, dtype=torch.float32)
        if self.win_length < self.n_fft:
            # Zero-pad up to n_fft, centring the window inside the frame so the
            # result lines up with torch.stft (which pads on both sides).
            extra = self.n_fft - self.win_length
            left = extra // 2
            window_tensor = F.pad(window_tensor, (left, extra - left))
        elif self.win_length > self.n_fft:
            window_tensor = window_tensor[: self.n_fft]
        self.register_buffer("window", window_tensor)

        # Precompute forward DFT (real, imag)
        # PyTorch stft uses e^{-j 2 pi k n / N} => real=cos(...), imag=-sin(...)
        n = np.arange(self.n_fft)
        k = np.arange(self.freq_bins)
        angle = 2 * np.pi * np.outer(k, n) / self.n_fft  # shape (freq_bins, n_fft)
        dft_real = np.cos(angle)
        dft_imag = -np.sin(angle)  # note negative sign

        # Combine window and dft => shape (freq_bins, filter_length)
        # We'll make 2 conv weight tensors of shape (freq_bins, 1, filter_length).
        forward_window = window_tensor.numpy()  # shape (n_fft,)
        forward_real = dft_real * forward_window  # (freq_bins, n_fft)
        forward_imag = dft_imag * forward_window

        # Convert to PyTorch
        forward_real_torch = torch.from_numpy(forward_real).float()
        forward_imag_torch = torch.from_numpy(forward_imag).float()

        # Register as Conv1d weight => (out_channels, in_channels, kernel_size)
        # out_channels = freq_bins, in_channels=1, kernel_size=n_fft
        self.register_buffer(
            "weight_forward_real", forward_real_torch.unsqueeze(1)
        )
        self.register_buffer(
            "weight_forward_imag", forward_imag_torch.unsqueeze(1)
        )

        # Precompute inverse DFT
        # Real iFFT formula => scale = 1/n_fft, doubling for bins 1..freq_bins-2 if n_fft even, etc.
        # For simplicity, we won't do the "DC/nyquist not doubled" approach here.
        # If you want perfect real iSTFT, you can add that logic.
        # This version just yields good approximate reconstruction with Hann + typical overlap.
        inv_scale = 1.0 / self.n_fft
        angle_t = 2 * np.pi * np.outer(n, k) / self.n_fft  # shape (n_fft, freq_bins)
        idft_cos = np.cos(angle_t).T  # => (freq_bins, n_fft)
        idft_sin = np.sin(angle_t).T  # => (freq_bins, n_fft)

        # Multiply by window again for typical overlap-add
        # We also incorporate the scale factor 1/n_fft
        inv_window = window_tensor.numpy() * inv_scale
        backward_real = idft_cos * inv_window  # (freq_bins, n_fft)
        backward_imag = idft_sin * inv_window

        # We'll implement iSTFT as real+imag conv_transpose with stride=hop.
        self.register_buffer(
            "weight_backward_real", torch.from_numpy(backward_real).float().unsqueeze(1)
        )
        self.register_buffer(
            "weight_backward_imag", torch.from_numpy(backward_imag).float().unsqueeze(1)
        )

    def transform(self, waveform: torch.Tensor):
        """
        Forward STFT => returns (magnitude, phase).

        ``waveform`` is expected as (B, T); a channel axis is inserted before the
        convolutions. Output shape of both tensors => (batch, freq_bins, frames).
        """
        # Optional center pad
        if self.center:
            pad_len = self.n_fft // 2
            waveform = F.pad(waveform, (pad_len, pad_len), mode=self.pad_mode)

        x = waveform.unsqueeze(1)  # => (B, 1, T)
        # Convolution to get real part => shape (B, freq_bins, frames)
        real_out = F.conv1d(
            x,
            self.weight_forward_real,
            bias=None,
            stride=self.hop_length,
            padding=0,
        )
        # Imag part
        imag_out = F.conv1d(
            x,
            self.weight_forward_imag,
            bias=None,
            stride=self.hop_length,
            padding=0,
        )

        # magnitude, phase (the epsilon keeps sqrt away from an exact zero)
        magnitude = torch.sqrt(real_out**2 + imag_out**2 + 1e-14)
        phase = torch.atan2(imag_out, real_out)
        # Handle the case where imag_out is 0 and real_out is negative to correct ONNX atan2 to match PyTorch
        # In this case, PyTorch returns pi, ONNX returns -pi
        correction_mask = (imag_out == 0) & (real_out < 0)
        phase[correction_mask] = torch.pi
        return magnitude, phase

    def inverse(self, magnitude: torch.Tensor, phase: torch.Tensor, length=None):
        """
        Inverse STFT.

        ``magnitude`` and ``phase`` are (B, freq_bins, frames). The result keeps
        the single output channel of the transposed convolution, i.e. it has
        shape (B, 1, T); it is not squeezed to (B, T).

        Args:
            magnitude: spectral magnitudes.
            phase: spectral phases in radians.
            length: if given, the output is truncated to at most this many samples.
        """
        # Re-create real/imag => shape (B, freq_bins, frames)
        real_part = magnitude * torch.cos(phase)
        imag_part = magnitude * torch.sin(phase)

        # real iSTFT => transposed-convolve with "backward_real", "backward_imag", and combine.
        # conv_transpose1d treats freq_bins as input channels and frames as the time axis;
        # each call gives (B, 1, time).
        real_rec = F.conv_transpose1d(
            real_part,
            self.weight_backward_real,  # shape (freq_bins, 1, filter_length)
            bias=None,
            stride=self.hop_length,
            padding=0,
        )
        imag_rec = F.conv_transpose1d(
            imag_part,
            self.weight_backward_imag,
            bias=None,
            stride=self.hop_length,
            padding=0,
        )
        # combine => (B, 1, time)
        waveform = real_rec - imag_rec  # typical real iFFT has minus for imaginary part

        # If we used "center=True" in forward, we should remove pad
        if self.center:
            pad_len = self.n_fft // 2
            # A zero pad_len would make the slice [0:-0] empty, so only crop
            # when there is actually something to remove.
            if pad_len > 0:
                waveform = waveform[..., pad_len:-pad_len]

        # If a specific length is desired, clamp
        if length is not None:
            waveform = waveform[..., :length]

        return waveform

    def forward(self, x: torch.Tensor):
        """
        Full STFT -> iSTFT pass: returns the time-domain reconstruction of ``x``,
        truncated to the input length.
        """
        mag, phase = self.transform(x)
        return self.inverse(mag, phase, length=x.shape[-1])
kokoro/istftnet.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ADAPTED from https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
2
+ from kokoro.custom_stft import CustomSTFT
3
+ from torch.nn.utils import weight_norm
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+
10
+ # https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py
11
def init_weights(m, mean=0.0, std=0.01):
    """Draw the weights of ``m`` from N(mean, std) when its class name contains "Conv".

    Modules whose class name does not contain "Conv" are left untouched.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
15
+
16
def get_padding(kernel_size, dilation=1):
    """Return ``dilation * (kernel_size - 1) / 2`` truncated to an int.

    This is the padding the residual blocks pass to their dilated convolutions.
    """
    span = dilation * (kernel_size - 1)
    return int(span / 2)
18
+
19
+
20
class AdaIN1d(nn.Module):
    """Instance norm whose scale and shift are predicted from a style vector."""

    def __init__(self, style_dim, num_features):
        super().__init__()
        # affine should be False, however there's a bug in the old torch.onnx.export
        # (not newer dynamo) that causes the channel dimension to be lost if
        # affine=False. With affine=True there are additional learnable parameters.
        # Setting it to True shouldn't really matter, since we're in inference mode.
        self.norm = nn.InstanceNorm1d(num_features, affine=True)
        # One linear layer emits both gamma and beta (hence 2 * num_features).
        self.fc = nn.Linear(style_dim, num_features*2)

    def forward(self, x, s):
        """Normalise ``x`` and modulate it as ``(1 + gamma) * norm(x) + beta``."""
        style = self.fc(s)
        # Append a trailing axis so the style terms broadcast over the last axis of x.
        style = style.view(style.size(0), style.size(1), 1)
        gamma, beta = torch.chunk(style, chunks=2, dim=1)
        normed = self.norm(x)
        return (1 + gamma) * normed + beta
32
+
33
+
34
class AdaINResBlock1(nn.Module):
    """Residual block of three (AdaIN -> Snake -> conv -> AdaIN -> Snake -> conv) stages.

    The first convolution of each stage uses the corresponding entry of
    ``dilation``; the second always uses dilation 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
        super().__init__()

        def conv(d):
            # Weight-normalised, length-preserving 1-D convolution with dilation d.
            return weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=d,
                                         padding=get_padding(kernel_size, d)))

        self.convs1 = nn.ModuleList([conv(dilation[0]), conv(dilation[1]), conv(dilation[2])])
        self.convs1.apply(init_weights)
        self.convs2 = nn.ModuleList([conv(1), conv(1), conv(1)])
        self.convs2.apply(init_weights)
        self.adain1 = nn.ModuleList([AdaIN1d(style_dim, channels) for _ in range(3)])
        self.adain2 = nn.ModuleList([AdaIN1d(style_dim, channels) for _ in range(3)])
        # Per-channel Snake frequencies, one parameter per convolution.
        self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for _ in self.convs1])
        self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for _ in self.convs2])

    def forward(self, x, s):
        """Run the three stages on ``x`` with style ``s``, adding a skip connection per stage."""
        stages = zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2)
        for conv_a, conv_b, norm_a, norm_b, alpha_a, alpha_b in stages:
            h = norm_a(x, s)
            h = h + (1 / alpha_a) * (torch.sin(alpha_a * h) ** 2)  # Snake1D
            h = conv_a(h)
            h = norm_b(h, s)
            h = h + (1 / alpha_b) * (torch.sin(alpha_b * h) ** 2)  # Snake1D
            h = conv_b(h)
            x = h + x
        return x
78
+
79
+
80
class TorchSTFT(nn.Module):
    """STFT / iSTFT pair backed by ``torch.stft`` and ``torch.istft`` with a Hann window."""

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        assert window == 'hann', window
        # Plain tensor attribute (not a buffer); moved to the input's device on each call.
        self.window = torch.hann_window(win_length, periodic=True, dtype=torch.float32)

    def transform(self, input_data):
        """Return ``(magnitude, phase)`` of the complex STFT of ``input_data``."""
        window = self.window.to(input_data.device)
        spectrum = torch.stft(
            input_data,
            self.filter_length, self.hop_length, self.win_length, window=window,
            return_complex=True)
        return torch.abs(spectrum), torch.angle(spectrum)

    def inverse(self, magnitude, phase):
        """Rebuild a waveform from magnitude and phase; a channel axis is added at -2."""
        window = self.window.to(magnitude.device)
        spectrum = magnitude * torch.exp(phase * 1j)
        waveform = torch.istft(
            spectrum,
            self.filter_length, self.hop_length, self.win_length, window=window)
        # unsqueeze to stay consistent with conv_transpose1d implementation
        return waveform.unsqueeze(-2)

    def forward(self, input_data):
        """STFT followed by iSTFT; the spectrum is kept on ``self.magnitude`` / ``self.phase``."""
        self.magnitude, self.phase = self.transform(input_data)
        return self.inverse(self.magnitude, self.phase)
106
+
107
+
108
class SineGen(nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, upsample_scale, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    upsample_scale: factor used to down-sample the per-sample phase increments
        before the cumulative sum and to up-sample the phase afterwards
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(torch.pi) or cos(0)
    """
    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0,
                 flag_for_pulse=False):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # fundamental + overtones
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        self.upsample_scale = upsample_scale

    def _f02uv(self, f0):
        """Return the voiced/unvoiced mask: 1.0 where f0 > voiced_threshold, else 0.0."""
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    def _f02sine(self, f0_values):
        """ f0_values: (batchsize, length, dim)
            where dim indicates fundamental tone and overtones
        """
        # convert to F0 in rad. The integer part n can be ignored
        # because 2 * torch.pi * n doesn't affect phase
        rad_values = (f0_values / self.sampling_rate) % 1
        # initial phase noise (no noise for fundamental component)
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
        if not self.flag_for_pulse:
            # Accumulate the phase at the reduced rate, then interpolate back up.
            rad_values = F.interpolate(rad_values.transpose(1, 2), scale_factor=1/self.upsample_scale, mode="linear").transpose(1, 2)
            phase = torch.cumsum(rad_values, dim=1) * 2 * torch.pi
            phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
            sines = torch.sin(phase)
        else:
            # If necessary, make sure that the first time step of every
            # voiced segments is sin(pi) or cos(0)
            # This is used for pulse-train generation
            # identify the last time step in unvoiced segments
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)
            # get the instantaneous phase
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # different batch needs to be processed differently
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # stores the accumulation of i.phase within
                # each voiced segments
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
            # rad_values - tmp_cumsum: remove the accumulation of i.phase
            # within the previous voiced segment.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
            # get the sines
            sines = torch.cos(i_phase * 2 * torch.pi)
        return sines

    def forward(self, f0):
        """ sine_tensor, uv, noise = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        output noise: tensor(batchsize=1, length, dim), the additive noise
                  that is already included in sine_tensor
        """
        # Harmonic multipliers 1..harmonic_num+1, created directly on f0's device.
        # float32 and shape (1, 1, dim) match the tensor this replaces, so
        # broadcasting and dtype promotion against f0 are unchanged.
        harmonics = torch.arange(
            1, self.harmonic_num + 2, dtype=torch.float32, device=f0.device
        ).view(1, 1, -1)
        # fundamental component and its overtones
        fn = torch.multiply(f0, harmonics)
        # generate sine waveforms
        sine_waves = self._f02sine(fn) * self.sine_amp
        # generate uv signal
        uv = self._f02uv(f0)
        # noise: for unvoiced should be similar to sine_amp
        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
        #        for voiced regions is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)
        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
210
+
211
+
212
class SourceModuleHnNSF(nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    upsample_scale: forwarded unchanged to SineGen
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshod: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """
    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, upsample_scale, harmonic_num,
            sine_amp, add_noise_std, voiced_threshod,
        )
        # to merge source harmonics into a single excitation
        self.l_linear = nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch; the sine generator itself runs without autograd
        with torch.no_grad():
            harmonics, uv, _ = self.l_sin_gen(x)
        merged = self.l_linear(harmonics)
        sine_merge = self.l_tanh(merged)
        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
255
+
256
+
257
class Generator(nn.Module):
    """Upsampling generator that predicts a spectrum and inverts it with an STFT module.

    A harmonic source derived from F0 is transformed by the STFT module, then
    injected at every upsampling stage through ``noise_convs`` / ``noise_res``.
    ``disable_complex`` selects CustomSTFT instead of TorchSTFT.
    """

    def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, disable_complex=False):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        # Ratio between the F0 frame rate and the sample rate of the source signal.
        total_scale = math.prod(upsample_rates) * gen_istft_hop_size
        self.m_source = SourceModuleHnNSF(
            sampling_rate=24000,
            upsample_scale=total_scale,
            harmonic_num=8, voiced_threshod=10)
        self.f0_upsamp = nn.Upsample(scale_factor=total_scale)
        self.noise_convs = nn.ModuleList()
        self.noise_res = nn.ModuleList()
        self.ups = nn.ModuleList()
        for i, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel//(2**i)
            out_ch = upsample_initial_channel//(2**(i+1))
            self.ups.append(weight_norm(
                nn.ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel-rate)//2)))
        self.resblocks = nn.ModuleList()
        source_ch = gen_istft_n_fft + 2  # magnitude + phase bins of the harmonic source
        for i in range(len(self.ups)):
            ch = upsample_initial_channel//(2**(i+1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            if i + 1 < len(upsample_rates):
                # Strided conv sized from the product of the remaining upsample rates.
                stride_f0 = math.prod(upsample_rates[i + 1:])
                self.noise_convs.append(nn.Conv1d(
                    source_ch, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
                self.noise_res.append(AdaINResBlock1(c_cur, 7, [1,3,5], style_dim))
            else:
                self.noise_convs.append(nn.Conv1d(source_ch, c_cur, kernel_size=1))
                self.noise_res.append(AdaINResBlock1(c_cur, 11, [1,3,5], style_dim))
        self.post_n_fft = gen_istft_n_fft
        self.conv_post = weight_norm(nn.Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))
        stft_kwargs = dict(filter_length=gen_istft_n_fft, hop_length=gen_istft_hop_size, win_length=gen_istft_n_fft)
        if disable_complex:
            self.stft = CustomSTFT(**stft_kwargs)
        else:
            self.stft = TorchSTFT(**stft_kwargs)

    def forward(self, x, s, f0):
        """Generate a waveform from features ``x``, style ``s`` and the F0 curve ``f0``."""
        with torch.no_grad():
            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
            har_source, noi_source, uv = self.m_source(f0)
            har_source = har_source.transpose(1, 2).squeeze(1)
            har_spec, har_phase = self.stft.transform(har_source)
            har = torch.cat([har_spec, har_phase], dim=1)
        final_stage = self.num_upsamples - 1
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, negative_slope=0.1)
            x_source = self.noise_res[i](self.noise_convs[i](har), s)
            x = self.ups[i](x)
            if i == final_stage:
                # One extra sample on the left, applied only after the last upsampling.
                x = self.reflection_pad(x)
            x = x + x_source
            # Average the outputs of this stage's residual blocks.
            base = i * self.num_kernels
            xs = None
            for j in range(self.num_kernels):
                branch = self.resblocks[base + j](x, s)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        split = self.post_n_fft // 2 + 1
        spec = torch.exp(x[:, :split, :])
        phase = torch.sin(x[:, split:, :])
        return self.stft.inverse(spec, phase)
326
+
327
+
328
class UpSample1d(nn.Module):
    """Identity when ``layer_type == 'none'``; otherwise 2x nearest-neighbour upsampling."""

    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        """Return ``x`` unchanged or upsampled by a factor of 2 along the last axis."""
        if self.layer_type != 'none':
            return F.interpolate(x, scale_factor=2, mode='nearest')
        return x
338
+
339
+
340
class AdainResBlk1d(nn.Module):
    """Style-conditioned residual block with optional 2x upsampling.

    Any ``upsample`` value other than the string ``'none'`` enables upsampling
    on both the residual path (transposed conv) and the shortcut (interpolation).
    """

    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), upsample='none', dropout_p=0.0):
        super().__init__()
        self.actv = actv
        self.upsample_type = upsample
        self.upsample = UpSample1d(upsample)
        # A 1x1 conv on the shortcut is only needed when the channel count changes.
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out, style_dim)
        self.dropout = nn.Dropout(dropout_p)
        if upsample != 'none':
            # Depthwise transposed conv that doubles the length on the residual path.
            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
        else:
            self.pool = nn.Identity()

    def _build_weights(self, dim_in, dim_out, style_dim):
        """Create the two convolutions, the two AdaIN layers and the optional 1x1 shortcut conv."""
        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
        self.norm1 = AdaIN1d(style_dim, dim_in)
        self.norm2 = AdaIN1d(style_dim, dim_out)
        if self.learned_sc:
            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        """Skip path: upsample, then project channels when ``learned_sc`` is set."""
        skip = self.upsample(x)
        return self.conv1x1(skip) if self.learned_sc else skip

    def _residual(self, x, s):
        """Main path: AdaIN -> activation -> pool -> conv, twice (pool only the first time)."""
        h = self.actv(self.norm1(x, s))
        h = self.pool(h)
        h = self.conv1(self.dropout(h))
        h = self.actv(self.norm2(h, s))
        h = self.conv2(self.dropout(h))
        return h

    def forward(self, x, s):
        """Sum the residual and shortcut paths and scale the result by 1/sqrt(2)."""
        residual = self._residual(x, s)
        return (residual + self._shortcut(x)) * torch.rsqrt(torch.tensor(2))
382
+
383
+
384
class Decoder(nn.Module):
    """Combine text features, F0 and N curves into generator input and synthesise audio.

    ``dim_out`` is accepted but not used by the constructor body.
    """

    def __init__(self, dim_in, style_dim, dim_out,
                 resblock_kernel_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 resblock_dilation_sizes,
                 upsample_kernel_sizes,
                 gen_istft_n_fft, gen_istft_hop_size,
                 disable_complex=False):
        super().__init__()
        # +2 channels: the F0 and N curves are concatenated to the features.
        self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
        # Decode blocks additionally receive the 64-channel asr residual.
        skip_in = 1024 + 2 + 64
        self.decode = nn.ModuleList()
        for _ in range(3):
            self.decode.append(AdainResBlk1d(skip_in, 1024, style_dim))
        self.decode.append(AdainResBlk1d(skip_in, 512, style_dim, upsample=True))
        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
        self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
        self.asr_res = nn.Sequential(weight_norm(nn.Conv1d(512, 64, kernel_size=1)))
        self.generator = Generator(
            style_dim, resblock_kernel_sizes, upsample_rates,
            upsample_initial_channel, resblock_dilation_sizes,
            upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size,
            disable_complex=disable_complex,
        )

    def forward(self, asr, F0_curve, N, s):
        """Return the generator output for features ``asr``, curves ``F0_curve``/``N`` and style ``s``."""
        # Stride-2 convs halve the length of the curves before concatenation.
        F0 = self.F0_conv(F0_curve.unsqueeze(1))
        N = self.N_conv(N.unsqueeze(1))
        x = self.encode(torch.cat([asr, F0, N], axis=1), s)
        asr_res = self.asr_res(asr)
        # Keep concatenating the skip inputs until an upsampling block has run.
        concat_skips = True
        for block in self.decode:
            if concat_skips:
                x = torch.cat([x, asr_res, F0, N], axis=1)
            x = block(x, s)
            if block.upsample_type != "none":
                concat_skips = False
        return self.generator(x, s, F0_curve)
kokoro/model.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .istftnet import Decoder
2
+ from .modules import CustomAlbert, ProsodyPredictor, TextEncoder
3
+ from dataclasses import dataclass
4
+ from huggingface_hub import hf_hub_download
5
+ from loguru import logger
6
+ from transformers import AlbertConfig
7
+ from typing import Dict, Optional, Union
8
+ import json
9
+ import torch
10
+ import os
11
+
12
class KModel(torch.nn.Module):
    '''
    KModel is a torch.nn.Module with 2 main responsibilities:
    1. Init weights, downloading config.json + model.pth from HF if needed
    2. forward(phonemes: str, ref_s: FloatTensor) -> (audio: FloatTensor)

    You likely only need one KModel instance, and it can be reused across
    multiple KPipelines to avoid redundant memory allocation.

    Unlike KPipeline, KModel is language-blind.

    KModel stores self.vocab and thus knows how to map phonemes -> input_ids,
    so there is no need to repeatedly download config.json outside of KModel.
    '''

    # Weight file name to request from the hub for each known repo id.
    MODEL_NAMES = {
        'hexgrad/Kokoro-82M': 'kokoro-v1_0.pth',
        'hexgrad/Kokoro-82M-v1.1-zh': 'kokoro-v1_1-zh.pth',
    }

    def __init__(
        self,
        repo_id: Optional[str] = None,
        config: Union[Dict, str, None] = None,
        model: Optional[str] = None,
        disable_complex: bool = False
    ):
        '''
        Build the sub-modules from the config and load their weights.

        repo_id: HF repo id; defaults to 'hexgrad/Kokoro-82M' (with a printed warning).
            If the weight download fails it is also used as a local directory
            that should contain 'kokoro-v1_0.pth'.
        config: a config dict, a path to a JSON config, or None to download config.json.
        model: path to the weights file, or None to download / fall back as above.
        disable_complex: forwarded to the Decoder.
        '''
        super().__init__()
        if repo_id is None:
            repo_id = 'hexgrad/Kokoro-82M'
            print(f"WARNING: Defaulting repo_id to {repo_id}. Pass repo_id='{repo_id}' to suppress this warning.")
        self.repo_id = repo_id
        if not isinstance(config, dict):
            if not config:
                logger.debug("No config provided, downloading from HF")
                config = hf_hub_download(repo_id=repo_id, filename='config.json')
            with open(config, 'r', encoding='utf-8') as r:
                config = json.load(r)
                logger.debug(f"Loaded config: {config}")
        self.vocab = config['vocab']
        self.bert = CustomAlbert(AlbertConfig(vocab_size=config['n_token'], **config['plbert']))
        self.bert_encoder = torch.nn.Linear(self.bert.config.hidden_size, config['hidden_dim'])
        self.context_length = self.bert.config.max_position_embeddings
        self.predictor = ProsodyPredictor(
            style_dim=config['style_dim'], d_hid=config['hidden_dim'],
            nlayers=config['n_layer'], max_dur=config['max_dur'], dropout=config['dropout']
        )
        self.text_encoder = TextEncoder(
            channels=config['hidden_dim'], kernel_size=config['text_encoder_kernel_size'],
            depth=config['n_layer'], n_symbols=config['n_token']
        )
        self.decoder = Decoder(
            dim_in=config['hidden_dim'], style_dim=config['style_dim'],
            dim_out=config['n_mels'], disable_complex=disable_complex, **config['istftnet']
        )
        if not model:
            try:
                model = hf_hub_download(repo_id=repo_id, filename=KModel.MODEL_NAMES[repo_id])
            except Exception:
                # Unknown repo id (KeyError) or a failed download: treat repo_id as
                # a local directory. `Exception` rather than a bare `except` so that
                # KeyboardInterrupt / SystemExit still propagate.
                model = os.path.join(repo_id, 'kokoro-v1_0.pth')
        # The checkpoint maps attribute names of this module to their state dicts.
        for key, state_dict in torch.load(model, map_location='cpu', weights_only=True).items():
            assert hasattr(self, key), key
            try:
                getattr(self, key).load_state_dict(state_dict)
            except Exception:
                logger.debug(f"Did not load {key} from state_dict")
                # Retry with the first 7 characters of every key removed
                # (presumably a 'module.' wrapper prefix; verify against the checkpoint).
                state_dict = {k[7:]: v for k, v in state_dict.items()}
                getattr(self, key).load_state_dict(state_dict, strict=False)

    @property
    def device(self):
        '''Device of the ALBERT sub-module, used as the device of the whole model.'''
        return self.bert.device

    @dataclass
    class Output:
        # Synthesised audio returned by forward().
        audio: torch.FloatTensor
        # Rounded per-token durations computed in forward_with_tokens().
        pred_dur: Optional[torch.LongTensor] = None

    @torch.no_grad()
    def forward_with_tokens(
        self,
        input_ids: torch.LongTensor,
        ref_s: torch.FloatTensor,
        speed: float = 1
    ) -> tuple[torch.FloatTensor, torch.LongTensor]:
        '''
        Synthesise audio from token ids and a reference style tensor.

        Columns 128 onward of ref_s condition the prosody predictor; the first
        128 columns condition the decoder. Returns (audio, pred_dur).
        '''
        # Every sequence is treated as full length, i.e. no padding positions.
        input_lengths = torch.full(
            (input_ids.shape[0],),
            input_ids.shape[-1],
            device=input_ids.device,
            dtype=torch.long
        )

        # True where the position index is at or beyond the sequence length.
        text_mask = torch.arange(input_lengths.max()).unsqueeze(0).expand(input_lengths.shape[0], -1).type_as(input_lengths)
        text_mask = torch.gt(text_mask+1, input_lengths.unsqueeze(1)).to(self.device)
        bert_dur = self.bert(input_ids, attention_mask=(~text_mask).int())
        d_en = self.bert_encoder(bert_dur).transpose(-1, -2)
        s = ref_s[:, 128:]
        d = self.predictor.text_encoder(d_en, s, input_lengths, text_mask)
        x, _ = self.predictor.lstm(d)
        duration = self.predictor.duration_proj(x)
        # Sum of sigmoids over the last axis, scaled by 1/speed.
        duration = torch.sigmoid(duration).sum(axis=-1) / speed
        # At least one frame per token.
        pred_dur = torch.round(duration).clamp(min=1).long().squeeze()
        # One-hot alignment matrix (tokens x frames): token i covers pred_dur[i] frames.
        indices = torch.repeat_interleave(torch.arange(input_ids.shape[1], device=self.device), pred_dur)
        pred_aln_trg = torch.zeros((input_ids.shape[1], indices.shape[0]), device=self.device)
        pred_aln_trg[indices, torch.arange(indices.shape[0])] = 1
        pred_aln_trg = pred_aln_trg.unsqueeze(0).to(self.device)
        en = d.transpose(-1, -2) @ pred_aln_trg
        F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
        t_en = self.text_encoder(input_ids, input_lengths, text_mask)
        asr = t_en @ pred_aln_trg
        audio = self.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze()
        return audio, pred_dur

    def forward(
        self,
        phonemes: str,
        ref_s: torch.FloatTensor,
        speed: float = 1,
        return_output: bool = False
    ) -> Union['KModel.Output', torch.FloatTensor]:
        '''
        Synthesise audio for a phoneme string.

        Phonemes missing from self.vocab are dropped. The id sequence is wrapped
        in a leading and trailing 0 and must fit within context_length (asserted).
        Returns the CPU audio tensor, or a KModel.Output when return_output is True.
        '''
        input_ids = list(filter(lambda i: i is not None, map(lambda p: self.vocab.get(p), phonemes)))
        logger.debug(f"phonemes: {phonemes} -> input_ids: {input_ids}")
        assert len(input_ids)+2 <= self.context_length, (len(input_ids)+2, self.context_length)
        input_ids = torch.LongTensor([[0, *input_ids, 0]]).to(self.device)
        ref_s = ref_s.to(self.device)
        audio, pred_dur = self.forward_with_tokens(input_ids, ref_s, speed)
        audio = audio.squeeze().cpu()
        pred_dur = pred_dur.cpu() if pred_dur is not None else None
        logger.debug(f"pred_dur: {pred_dur}")
        return self.Output(audio=audio, pred_dur=pred_dur) if return_output else audio
142
+
143
class KModelForONNX(torch.nn.Module):
    '''Thin wrapper whose forward() calls KModel.forward_with_tokens, for export.'''

    def __init__(self, kmodel: KModel):
        super().__init__()
        self.kmodel = kmodel

    def forward(
        self,
        input_ids: torch.LongTensor,
        ref_s: torch.FloatTensor,
        speed: float = 1
    ) -> tuple[torch.FloatTensor, torch.LongTensor]:
        '''Return (waveform, duration) exactly as produced by forward_with_tokens.'''
        result = self.kmodel.forward_with_tokens(input_ids, ref_s, speed)
        waveform, duration = result
        return waveform, duration
kokoro/modules.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
+ from .istftnet import AdainResBlk1d
3
+ from torch.nn.utils import weight_norm
4
+ from transformers import AlbertModel
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+
11
class LinearNorm(nn.Module):
    """``nn.Linear`` whose weight is Xavier-uniform initialised.

    Args:
        in_dim: Input feature size.
        out_dim: Output feature size.
        bias: Whether the linear layer has a bias term.
        w_init_gain: Nonlinearity name passed to ``nn.init.calculate_gain``.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
        gain = nn.init.calculate_gain(w_init_gain)
        nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Apply the linear projection to ``x``."""
        projected = self.linear_layer(x)
        return projected
19
+
20
+
21
class LayerNorm(nn.Module):
    """Layer normalisation over dimension 1 with learnable scale and shift.

    Args:
        channels: Size of dimension 1 of the input.
        eps: Numerical-stability constant passed to ``F.layer_norm``.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` along dimension 1 and return it in the input layout."""
        # F.layer_norm works on the trailing dimension, so swap dim 1 there and back.
        channels_last = x.transpose(1, -1)
        normalised = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normalised.transpose(1, -1)
33
+
34
+
35
class TextEncoder(nn.Module):
    """Embedding -> stacked masked Conv1d blocks -> bidirectional LSTM.

    Args:
        channels: Embedding size and width of every conv / LSTM output.
        kernel_size: Conv1d kernel size; padding is ``(kernel_size - 1) // 2``.
        depth: Number of conv blocks.
        n_symbols: Vocabulary size of the embedding table.
        actv: Activation module placed in every conv block (the same instance
            is reused across blocks).
    """

    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)
        same_padding = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList()
        for _ in range(depth):
            conv = weight_norm(
                nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=same_padding)
            )
            self.cnn.append(nn.Sequential(conv, LayerNorm(channels), actv, nn.Dropout(0.2)))
        self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)

    def forward(self, x, input_lengths, m):
        """Encode token ids.

        Args:
            x: Token ids fed to the embedding table.
            input_lengths: Per-sequence lengths used to pack the LSTM input.
            m: Boolean mask; True positions are zeroed after every stage.
                # assumes m is [B, T] — TODO confirm against caller

        Returns:
            Tensor laid out as [B, chn, T'] where T' is ``m.shape[-1]``.
        """
        pad_mask = m.unsqueeze(1)
        hidden = self.embedding(x).transpose(1, 2)  # [B, emb, T]
        hidden.masked_fill_(pad_mask, 0.0)
        for conv_block in self.cnn:
            hidden = conv_block(hidden)
            hidden.masked_fill_(pad_mask, 0.0)
        hidden = hidden.transpose(1, 2)  # [B, T, chn]

        # pack_padded_sequence requires the lengths on CPU.
        if input_lengths.device == torch.device('cpu'):
            lengths = input_lengths
        else:
            lengths = input_lengths.to('cpu')
        packed = nn.utils.rnn.pack_padded_sequence(
            hidden, lengths, batch_first=True, enforce_sorted=False
        )
        self.lstm.flatten_parameters()
        packed, _ = self.lstm(packed)
        hidden, _ = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)
        hidden = hidden.transpose(-1, -2)

        # Unpacking only pads to the longest sequence; restore the mask's full width.
        padded = torch.zeros(
            [hidden.shape[0], hidden.shape[1], pad_mask.shape[-1]], device=hidden.device
        )
        padded[:, :, :hidden.shape[-1]] = hidden
        padded.masked_fill_(pad_mask, 0.0)
        return padded
70
+
71
+
72
class AdaLayerNorm(nn.Module):
    """Layer norm whose scale and shift are predicted from a style vector.

    Args:
        style_dim: Size of the style vector fed to the linear layer.
        channels: Size of the normalised (last) dimension.
        eps: Numerical-stability constant passed to ``F.layer_norm``.
    """

    def __init__(self, style_dim, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.fc = nn.Linear(style_dim, channels*2)

    def forward(self, x, s):
        """Return ``(1 + gamma(s)) * layer_norm(x) + beta(s)``."""
        # For a 3-D input these two transposes cancel out; kept as in the
        # original so other ranks behave the same.
        x = x.transpose(-1, -2).transpose(1, -1)
        affine = self.fc(s)
        affine = affine.view(affine.size(0), affine.size(1), 1)
        # First half of the projection is the scale, second half the shift.
        gamma, beta = (
            part.transpose(1, -1) for part in torch.chunk(affine, chunks=2, dim=1)
        )
        normalised = F.layer_norm(x, (self.channels,), eps=self.eps)
        modulated = (1 + gamma) * normalised + beta
        return modulated.transpose(1, -1).transpose(-1, -2)
89
+
90
+
91
class ProsodyPredictor(nn.Module):
    """Predicts per-token durations plus F0 and N curves from styled text features.

    Args:
        style_dim: Size of the style vector.
        d_hid: Hidden width of the duration encoder and LSTMs.
        nlayers: Number of layers in the ``DurationEncoder``.
        max_dur: Output width of the duration projection.
        dropout: Dropout probability handed to the sub-blocks.
    """

    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
        super().__init__()
        half = d_hid // 2
        self.text_encoder = DurationEncoder(sty_dim=style_dim, d_model=d_hid,nlayers=nlayers, dropout=dropout)
        self.lstm = nn.LSTM(d_hid + style_dim, half, 1, batch_first=True, bidirectional=True)
        self.duration_proj = LinearNorm(d_hid, max_dur)
        self.shared = nn.LSTM(d_hid + style_dim, half, 1, batch_first=True, bidirectional=True)
        # The F0 and N branches share one layout: keep width, upsample+halve, keep width.
        self.F0 = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout),
            AdainResBlk1d(d_hid, half, style_dim, upsample=True, dropout_p=dropout),
            AdainResBlk1d(half, half, style_dim, dropout_p=dropout),
        ])
        self.N = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout),
            AdainResBlk1d(d_hid, half, style_dim, upsample=True, dropout_p=dropout),
            AdainResBlk1d(half, half, style_dim, dropout_p=dropout),
        ])
        self.F0_proj = nn.Conv1d(half, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(half, 1, 1, 1, 0)

    def forward(self, texts, style, text_lengths, alignment, m):
        """Return ``(duration, en)``.

        ``duration`` is the duration projection of the LSTM output (last
        dimension squeezed if it is 1); ``en`` is the duration-encoder output
        multiplied by ``alignment``.
        """
        d = self.text_encoder(texts, style, text_lengths, m)
        mask = m.unsqueeze(1)
        if text_lengths.device == torch.device('cpu'):
            lengths = text_lengths
        else:
            lengths = text_lengths.to('cpu')
        packed = nn.utils.rnn.pack_padded_sequence(d, lengths, batch_first=True, enforce_sorted=False)
        self.lstm.flatten_parameters()
        packed, _ = self.lstm(packed)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)
        # Re-pad the time axis to the mask's full width.
        padded = torch.zeros(
            [unpacked.shape[0], mask.shape[-1], unpacked.shape[-1]], device=unpacked.device
        )
        padded[:, :unpacked.shape[1], :] = unpacked
        # training=False makes this dropout a no-op; retained from the original.
        duration = self.duration_proj(nn.functional.dropout(padded, 0.5, training=False))
        en = d.transpose(-1, -2) @ alignment
        return duration.squeeze(-1), en

    def F0Ntrain(self, x, s):
        """Run the shared LSTM, then the F0 and N branches; return both curves."""
        shared_out, _ = self.shared(x.transpose(-1, -2))
        features = shared_out.transpose(-1, -2)

        F0 = features
        for block in self.F0:
            F0 = block(F0, s)
        F0 = self.F0_proj(F0)

        N = features
        for block in self.N:
            N = block(N, s)
        N = self.N_proj(N)

        return F0.squeeze(1), N.squeeze(1)
135
+
136
+
137
class DurationEncoder(nn.Module):
    """Alternating bidirectional-LSTM / ``AdaLayerNorm`` stack conditioned on style.

    Args:
        sty_dim: Size of the style vector concatenated to every time step.
        d_model: Feature width produced by each LSTM / norm pair.
        nlayers: Number of LSTM + ``AdaLayerNorm`` pairs.
        dropout: Stored as ``self.dropout``; see the note in ``forward``.
    """

    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
            # `dropout` is deliberately NOT forwarded to nn.LSTM: with
            # num_layers=1 it has no effect and only triggers a UserWarning.
            self.lstms.append(nn.LSTM(
                d_model + sty_dim, d_model // 2,
                num_layers=1, batch_first=True, bidirectional=True,
            ))
            self.lstms.append(AdaLayerNorm(sty_dim, d_model))
        self.dropout = dropout
        self.d_model = d_model
        self.sty_dim = sty_dim

    def forward(self, x, style, text_lengths, m):
        """Encode ``x`` with the style vector re-attached after every norm layer.

        Args:
            x: Rank-3 input; permuted with ``(2, 0, 1)`` before use.
                # assumes [B, d_model, T] — TODO confirm against caller
            style: Style tensor, expanded across dimension 0 of the permuted input.
            text_lengths: Per-sequence lengths used to pack LSTM inputs.
            m: Boolean mask; True positions are zeroed.

        Returns:
            The final features with the last two dimensions swapped back.
        """
        masks = m
        # Packing needs CPU lengths; the value is the same for every layer.
        if text_lengths.device == torch.device('cpu'):
            lengths = text_lengths
        else:
            lengths = text_lengths.to('cpu')

        x = x.permute(2, 0, 1)
        s = style.expand(x.shape[0], x.shape[1], -1)
        x = torch.cat([x, s], axis=-1)
        x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
        x = x.transpose(0, 1)
        x = x.transpose(-1, -2)

        for block in self.lstms:
            if isinstance(block, AdaLayerNorm):
                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
                # Re-attach the style channels for the next LSTM.
                x = torch.cat([x, s.permute(1, 2, 0)], axis=1)
                x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
            else:
                x = x.transpose(-1, -2)
                x = nn.utils.rnn.pack_padded_sequence(
                    x, lengths, batch_first=True, enforce_sorted=False)
                block.flatten_parameters()
                x, _ = block(x)
                x, _ = nn.utils.rnn.pad_packed_sequence(
                    x, batch_first=True)
                # training=False makes this a no-op; retained from the original.
                x = F.dropout(x, p=self.dropout, training=False)
                x = x.transpose(-1, -2)
                # Unpacking pads only to the longest sequence; restore full width.
                x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]], device=x.device)
                x_pad[:, :, :x.shape[-1]] = x
                x = x_pad

        return x.transpose(-1, -2)
177
+
178
+
179
+ # https://github.com/yl4579/StyleTTS2/blob/main/Utils/PLBERT/util.py
180
class CustomAlbert(AlbertModel):
    """``AlbertModel`` whose ``forward`` returns only ``last_hidden_state``."""

    def forward(self, *args, **kwargs):
        """Run the parent forward pass and unwrap its ``last_hidden_state``."""
        return super().forward(*args, **kwargs).last_hidden_state
kokoro/pipeline.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import KModel
2
+ from dataclasses import dataclass
3
+ from huggingface_hub import hf_hub_download
4
+ from loguru import logger
5
+ from misaki import en, espeak
6
+ from typing import Callable, Generator, List, Optional, Tuple, Union
7
+ import re
8
+ import torch
9
+ import os
10
+
11
# Long-form language tags accepted by KPipeline, mapped to single-letter codes.
ALIASES = {
    'en-us': 'a', 'en-gb': 'b',
    'es': 'e', 'fr-fr': 'f', 'hi': 'h', 'it': 'i', 'pt-br': 'p',
    'ja': 'j', 'zh': 'z',
}

# Single-letter language codes mapped to the language name / espeak tag.
LANG_CODES = {
    # pip install misaki[en]
    'a': 'American English',
    'b': 'British English',

    # espeak-ng
    'e': 'es',
    'f': 'fr-fr',
    'h': 'hi',
    'i': 'it',
    'p': 'pt-br',

    # pip install misaki[ja]
    'j': 'Japanese',

    # pip install misaki[zh]
    'z': 'Mandarin Chinese',
}
41
+
42
+ class KPipeline:
43
+ '''
44
+ KPipeline is a language-aware support class with 2 main responsibilities:
45
+ 1. Perform language-specific G2P, mapping (and chunking) text -> phonemes
46
+ 2. Manage and store voices, lazily downloaded from HF if needed
47
+
48
+ You are expected to have one KPipeline per language. If you have multiple
49
+ KPipelines, you should reuse one KModel instance across all of them.
50
+
51
+ KPipeline is designed to work with a KModel, but this is not required.
52
+ There are 2 ways to pass an existing model into a pipeline:
53
+ 1. On init: us_pipeline = KPipeline(lang_code='a', model=model)
54
+ 2. On call: us_pipeline(text, voice, model=model)
55
+
56
+ By default, KPipeline will automatically initialize its own KModel. To
57
+ suppress this, construct a "quiet" KPipeline with model=False.
58
+
59
+ A "quiet" KPipeline yields (graphemes, phonemes, None) without generating
60
+ any audio. You can use this to phonemize and chunk your text in advance.
61
+
62
+ A "loud" KPipeline _with_ a model yields (graphemes, phonemes, audio).
63
+ '''
64
def __init__(
    self,
    lang_code: str,
    repo_id: Optional[str] = None,
    model: Union[KModel, bool] = True,
    trf: bool = False,
    en_callable: Optional[Callable[[str], str]] = None,
    device: Optional[str] = None
):
    """Initialize a KPipeline.

    Args:
        lang_code: Language code for G2P processing; long tags in ALIASES
            (e.g. 'en-us') are mapped to their single-letter code.
        repo_id: Model repository. Defaults to 'hexgrad/Kokoro-82M' (with a
            printed warning). When given, the model config is looked up at
            ``os.path.join(repo_id, 'config.json')``.
        model: KModel instance, True to create new model, False for no model
        trf: Whether to use transformer-based G2P
        en_callable: Passed to the Mandarin G2P (lang_code 'z') only.
        device: Override default device selection ('cuda', 'mps' or 'cpu', or None for auto)
            If None, will auto-select cuda if available, then mps when
            PYTORCH_ENABLE_MPS_FALLBACK=1, else cpu
            If 'cuda' or 'mps' is requested but unavailable, raises an error

    Raises:
        AssertionError: If ``lang_code`` is not a known code.
        RuntimeError: If the requested device is unavailable or the model
            fails to initialize.
        ImportError: If the optional misaki extra for 'j' / 'z' is missing.
    """
    if repo_id is None:
        repo_id = 'hexgrad/Kokoro-82M'
        print(f"WARNING: Defaulting repo_id to {repo_id}. Pass repo_id='{repo_id}' to suppress this warning.")
        config=None
    else:
        config = os.path.join(repo_id, 'config.json')
    self.repo_id = repo_id
    lang_code = lang_code.lower()
    lang_code = ALIASES.get(lang_code, lang_code)
    assert lang_code in LANG_CODES, (lang_code, LANG_CODES)
    self.lang_code = lang_code
    self.model = None
    if isinstance(model, KModel):
        self.model = model
    elif model:
        if device == 'cuda' and not torch.cuda.is_available():
            raise RuntimeError("CUDA requested but not available")
        if device == 'mps' and not torch.backends.mps.is_available():
            raise RuntimeError("MPS requested but not available")
        if device == 'mps' and os.environ.get('PYTORCH_ENABLE_MPS_FALLBACK') != '1':
            raise RuntimeError("MPS requested but fallback not enabled")
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif os.environ.get('PYTORCH_ENABLE_MPS_FALLBACK') == '1' and torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'
        try:
            self.model = KModel(repo_id=repo_id, config=config).to(device).eval()
        except RuntimeError as e:
            if device == 'cuda':
                # Chain the original error so its traceback is not lost.
                raise RuntimeError(f"""Failed to initialize model on CUDA: {e}.
                                   Try setting device='cpu' or check CUDA installation.""") from e
            raise
    self.voices = {}
    # Exact membership rather than a substring test against 'ab'.
    if lang_code in ('a', 'b'):
        try:
            fallback = espeak.EspeakFallback(british=lang_code=='b')
        except Exception as e:
            # Best effort: English G2P still works without the espeak fallback.
            logger.warning("EspeakFallback not Enabled: OOD words will be skipped")
            # Log the message itself; the original passed a set literal.
            logger.warning(str(e))
            fallback = None
        self.g2p = en.G2P(trf=trf, british=lang_code=='b', fallback=fallback, unk='')
    elif lang_code == 'j':
        try:
            from misaki import ja
            self.g2p = ja.JAG2P()
        except ImportError:
            logger.error("You need to `pip install misaki[ja]` to use lang_code='j'")
            raise
    elif lang_code == 'z':
        try:
            from misaki import zh
            self.g2p = zh.ZHG2P(
                version=None if repo_id.endswith('/Kokoro-82M') else '1.1',
                en_callable=en_callable
            )
        except ImportError:
            logger.error("You need to `pip install misaki[zh]` to use lang_code='z'")
            raise
    else:
        language = LANG_CODES[lang_code]
        logger.warning(f"Using EspeakG2P(language='{language}'). Chunking logic not yet implemented, so long texts may be truncated unless you split them with '\\n'.")
        self.g2p = espeak.EspeakG2P(language=language)
148
+
149
def load_single_voice(self, voice: str):
    """Load one voice pack, caching it in ``self.voices``.

    Args:
        voice: Either a path ending in '.pt' (loaded directly) or a voice
            name, fetched as ``voices/{voice}.pt`` from ``self.repo_id``.

    Returns:
        The object returned by ``torch.load`` for that voice file.
    """
    try:
        return self.voices[voice]
    except KeyError:
        pass

    if voice.endswith('.pt'):
        f = voice
    else:
        f = hf_hub_download(repo_id=self.repo_id, filename=f'voices/{voice}.pt')
        # Voice names are expected to start with the pipeline's language code.
        if not voice.startswith(self.lang_code):
            v = LANG_CODES.get(voice, voice)
            p = LANG_CODES.get(self.lang_code, self.lang_code)
            logger.warning(f'Language mismatch, loading {v} voice into {p} pipeline.')

    pack = torch.load(f, weights_only=True)
    self.voices[voice] = pack
    return pack
163
+
164
def load_voice(self, voice: Union[str, torch.FloatTensor], delimiter: str = ",") -> torch.FloatTensor:
    """Lazily download and load a voice, blending several if requested.

    A single voice can be requested (e.g. 'af_bella') or multiple voices
    (e.g. 'af_bella,af_jessica'). If multiple voices are requested, they are
    averaged and the blend is cached under the full request string.

    Args:
        voice: Voice name(s), or an already-loaded tensor which is returned
            unchanged. Any ``torch.Tensor`` is accepted, so CUDA / half /
            double tensors are passed through as well.
        delimiter: Separator between voice names; defaults to ','.

    Returns:
        The voice pack tensor.
    """
    # torch.FloatTensor only matches CPU float32 tensors; torch.Tensor
    # matches every tensor type.
    if isinstance(voice, torch.Tensor):
        return voice
    if voice in self.voices:
        return self.voices[voice]
    logger.debug(f"Loading voice: {voice}")
    packs = [self.load_single_voice(v) for v in voice.split(delimiter)]
    if len(packs) == 1:
        return packs[0]
    self.voices[voice] = torch.mean(torch.stack(packs), dim=0)
    return self.voices[voice]
181
+
182
@staticmethod
def tokens_to_ps(tokens: List[en.MToken]) -> str:
    """Concatenate token phonemes, adding a space after tokens that carry whitespace.

    The result is stripped of leading and trailing whitespace.
    """
    pieces = []
    for token in tokens:
        pieces.append(token.phonemes)
        if token.whitespace:
            pieces.append(' ')
    return ''.join(pieces).strip()
185
+
186
@staticmethod
def waterfall_last(
    tokens: List[en.MToken],
    next_count: int,
    waterfall: List[str] = ['!.?…', ':;', ',—'],
    bumps: List[str] = [')', '”']
) -> int:
    """Pick the index at which to cut ``tokens`` into a chunk.

    Punctuation groups in ``waterfall`` are tried in order. For each group the
    last token whose phonemes equal one of its characters is located; the cut
    goes just after it (and after a following ``bumps`` token, if any). The
    cut is accepted when the phonemes left over fit within 510 characters.

    Args:
        tokens: Tokens accumulated so far.
        next_count: Phoneme count including the token about to be added.
        waterfall: Punctuation groups, highest priority first.
        bumps: Closing marks kept together with the preceding punctuation.

    Returns:
        The cut index, or ``len(tokens)`` if no group yields an acceptable cut.
    """
    total = len(tokens)
    for group in waterfall:
        marks = set(group)
        cut = None
        # Scan from the end for the last token matching this group.
        for position in range(total - 1, -1, -1):
            if tokens[position].phonemes in marks:
                cut = position
                break
        if cut is None:
            continue
        cut += 1
        if cut < total and tokens[cut].phonemes in bumps:
            cut += 1
        if next_count - len(KPipeline.tokens_to_ps(tokens[:cut])) <= 510:
            return cut
    return total
203
+
204
@staticmethod
def tokens_to_text(tokens: List[en.MToken]) -> str:
    """Rebuild text from tokens by joining each token's text and whitespace.

    The result is stripped of leading and trailing whitespace.
    """
    fragments = []
    for token in tokens:
        fragments.append(token.text + token.whitespace)
    return ''.join(fragments).strip()
207
+
208
def en_tokenize(
    self,
    tokens: List[en.MToken]
) -> Generator[Tuple[str, str, List[en.MToken]], None, None]:
    """Group tokens into chunks whose phoneme string stays within 510 characters.

    Tokens with ``phonemes is None`` are mutated in place to carry ''.

    Yields:
        ``(text, phonemes, tokens)`` for each chunk, in order.
    """
    pending = []
    pending_count = 0
    for t in tokens:
        # A 'ɾ' => 'T' replacement for American English was once applied
        # here; it is disabled in the original source.
        if t.phonemes is None:
            t.phonemes = ''
        next_ps = t.phonemes + (' ' if t.whitespace else '')
        next_pcount = pending_count + len(next_ps.rstrip())
        if next_pcount > 510:
            # Adding this token would overflow: flush up to a punctuation boundary.
            z = KPipeline.waterfall_last(pending, next_pcount)
            head = pending[:z]
            text = KPipeline.tokens_to_text(head)
            logger.debug(f"Chunking text at {z}: '{text[:30]}{'...' if len(text) > 30 else ''}'")
            ps = KPipeline.tokens_to_ps(head)
            yield text, ps, head
            pending = pending[z:]
            pending_count = len(KPipeline.tokens_to_ps(pending))
            if not pending:
                # A fresh chunk must not start with whitespace.
                next_ps = next_ps.lstrip()
        pending.append(t)
        pending_count += len(next_ps)
    if pending:
        text = KPipeline.tokens_to_text(pending)
        ps = KPipeline.tokens_to_ps(pending)
        yield text.strip(), ps.strip(), pending
235
+
236
@staticmethod
def infer(
    model: KModel,
    ps: str,
    pack: torch.FloatTensor,
    speed: Union[float, Callable[[int], float]] = 1
) -> KModel.Output:
    """Run ``model`` on a phoneme string.

    Args:
        model: Model called as ``model(ps, style, speed, return_output=True)``.
        ps: Phoneme string.
        pack: Voice pack; entry ``len(ps) - 1`` is used as the style input.
        speed: Speed value, or a callable mapping ``len(ps)`` to one.

    Returns:
        Whatever ``model`` returns for ``return_output=True``.
    """
    phoneme_count = len(ps)
    if callable(speed):
        speed = speed(phoneme_count)
    style = pack[phoneme_count - 1]
    return model(ps, style, speed, return_output=True)
246
+
247
def generate_from_tokens(
    self,
    tokens: Union[str, List[en.MToken]],
    voice: str,
    speed: float = 1,
    model: Optional[KModel] = None
) -> Generator['KPipeline.Result', None, None]:
    """Generate audio from either raw phonemes or pre-processed tokens.

    Args:
        tokens: Either a phoneme string or list of pre-processed MTokens
        voice: The voice to use for synthesis
        speed: Speech speed modifier (default: 1)
        model: Optional KModel instance (uses pipeline's model if not provided)

    Yields:
        KPipeline.Result containing the input tokens and generated audio
        (``output`` is None when no model is available).

    Raises:
        ValueError: If a model is present but no voice is given, or a raw
            phoneme string is longer than 510 characters.
    """
    model = model or self.model
    if model and voice is None:
        raise ValueError('Specify a voice: pipeline.generate_from_tokens(..., voice="af_heart")')

    pack = None
    if model:
        pack = self.load_voice(voice).to(model.device)

    # Raw phoneme string: a single result, no chunking and no timestamps.
    if isinstance(tokens, str):
        logger.debug("Processing phonemes from raw string")
        if len(tokens) > 510:
            raise ValueError(f'Phoneme string too long: {len(tokens)} > 510')
        output = KPipeline.infer(model, tokens, pack, speed) if model else None
        yield self.Result(graphemes='', phonemes=tokens, output=output)
        return

    # Pre-processed tokens: chunk, synthesize, then attach timestamps.
    logger.debug("Processing MTokens")
    for gs, ps, tks in self.en_tokenize(tokens):
        if not ps:
            continue
        if len(ps) > 510:
            logger.warning(f"Unexpected len(ps) == {len(ps)} > 510 and ps == '{ps}'")
            logger.warning("Truncating to 510 characters")
            ps = ps[:510]
        output = KPipeline.infer(model, ps, pack, speed) if model else None
        has_durations = output is not None and output.pred_dur is not None
        if has_durations:
            KPipeline.join_timestamps(tks, output.pred_dur)
        yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output)
296
+
297
@staticmethod
def join_timestamps(tokens: List[en.MToken], pred_dur: torch.LongTensor):
    """Write ``start_ts`` / ``end_ts`` onto tokens from predicted durations.

    Tokens are mutated in place. Nothing happens when ``tokens`` is empty or
    ``pred_dur`` has fewer than 3 entries (<bos>, token, <eos>).
    """
    # Multiply by 600 to go from pred_dur frames to sample_rate 24000
    # Equivalent to dividing pred_dur frames by 40 to get timestamp in seconds
    # We will count nice round half-frames, so the divisor is 80
    MAGIC_DIVISOR = 80
    n_durations = len(pred_dur)
    if not tokens or n_durations < 3:
        return
    # Two running counts in half-frames, (left, right), so that the duration
    # of a space can be split between the neighbouring tokens.
    # TODO: Is -3 an appropriate offset?
    left = right = 2 * max(0, pred_dur[0].item() - 3)
    # Updates:
    # left = right + (2 * token_dur) + space_dur
    # right = left + space_dur
    cursor = 1
    for token in tokens:
        if cursor >= n_durations - 1:
            break
        if not token.phonemes:
            # Tokens without phonemes get no timestamps; whitespace still advances time.
            if token.whitespace:
                cursor += 1
                gap = pred_dur[cursor].item()
                left = right + gap
                right = left + gap
                cursor += 1
            continue
        stop = cursor + len(token.phonemes)
        if stop >= n_durations:
            break
        token.start_ts = left / MAGIC_DIVISOR
        token_dur = pred_dur[cursor: stop].sum().item()
        space_dur = pred_dur[stop].item() if token.whitespace else 0
        left = right + (2 * token_dur) + space_dur
        token.end_ts = left / MAGIC_DIVISOR
        right = left + space_dur
        cursor = stop + (1 if token.whitespace else 0)
334
+
335
@dataclass
class Result:
    """One pipeline result: source text, its phonemes and the optional model output."""

    graphemes: str
    phonemes: str
    tokens: Optional[List[en.MToken]] = None
    output: Optional[KModel.Output] = None
    text_index: Optional[int] = None

    @property
    def audio(self) -> Optional[torch.FloatTensor]:
        """Audio from ``output``, or None when there is no output."""
        if self.output is None:
            return None
        return self.output.audio

    @property
    def pred_dur(self) -> Optional[torch.LongTensor]:
        """Predicted durations from ``output``, or None when there is no output."""
        if self.output is None:
            return None
        return self.output.pred_dur

    ### MARK: BEGIN BACKWARD COMPAT ###
    # Lets a Result be unpacked / indexed as (graphemes, phonemes, audio).
    def __iter__(self):
        yield from (self.graphemes, self.phonemes, self.audio)

    def __getitem__(self, index):
        legacy = [self.graphemes, self.phonemes, self.audio]
        return legacy[index]

    def __len__(self):
        return 3
    #### MARK: END BACKWARD COMPAT ####
363
+
364
def __call__(
    self,
    text: Union[str, List[str]],
    voice: Optional[str] = None,
    speed: Union[float, Callable[[int], float]] = 1,
    split_pattern: Optional[str] = r'\n+',
    model: Optional[KModel] = None
) -> Generator['KPipeline.Result', None, None]:
    """Phonemize ``text`` and, when a model is available, synthesize audio.

    Args:
        text: A string (split with ``split_pattern``) or a list of segments.
        voice: Voice to synthesize with; required whenever a model is used.
        speed: Speed value, or a callable mapping the phoneme count to one.
        split_pattern: Regex used to split a string input; falsy disables it.
        model: Optional model overriding the pipeline's own.

    Yields:
        KPipeline.Result per chunk; ``text_index`` is the index of the
        segment the chunk came from.

    Raises:
        ValueError: If a model is present but ``voice`` is None.
    """
    model = model or self.model
    if model and voice is None:
        raise ValueError('Specify a voice: en_us_pipeline(text="Hello world!", voice="af_heart")')
    pack = self.load_voice(voice).to(model.device) if model else None

    # Normalise the input to a list of segments.
    if isinstance(text, str):
        text = re.split(split_pattern, text.strip()) if split_pattern else [text]

    for graphemes_index, graphemes in enumerate(text):
        if not graphemes.strip():  # Skip empty segments
            continue

        # English: token-level chunking with timestamps.
        if self.lang_code in 'ab':
            logger.debug(f"Processing English text: {graphemes[:50]}{'...' if len(graphemes) > 50 else ''}")
            _, tokens = self.g2p(graphemes)
            for gs, ps, tks in self.en_tokenize(tokens):
                if not ps:
                    continue
                if len(ps) > 510:
                    logger.warning(f"Unexpected len(ps) == {len(ps)} > 510 and ps == '{ps}'")
                    ps = ps[:510]
                output = KPipeline.infer(model, ps, pack, speed) if model else None
                if output is not None and output.pred_dur is not None:
                    KPipeline.join_timestamps(tks, output.pred_dur)
                yield self.Result(graphemes=gs, phonemes=ps, tokens=tks, output=output, text_index=graphemes_index)
            continue

        # Non-English: pack sentences into chunks of roughly 400 characters.
        chunk_size = 400
        chunks = []
        current_chunk = ""
        # The capturing group keeps the punctuation, so pieces alternate
        # between sentence body and its terminating punctuation.
        pieces = re.split(r'([.!?]+)', graphemes)
        for start in range(0, len(pieces), 2):
            sentence = ''.join(pieces[start:start + 2])
            if len(current_chunk) + len(sentence) > chunk_size:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk += sentence
        if current_chunk:
            chunks.append(current_chunk.strip())

        # Fallback to fixed-width slicing if sentence packing produced nothing.
        if not chunks:
            chunks = [graphemes[i:i+chunk_size] for i in range(0, len(graphemes), chunk_size)]

        for chunk in chunks:
            if not chunk.strip():
                continue
            ps, _ = self.g2p(chunk)
            if not ps:
                continue
            if len(ps) > 510:
                logger.warning(f'Truncating len(ps) == {len(ps)} > 510')
                ps = ps[:510]
            output = KPipeline.infer(model, ps, pack, speed) if model else None
            yield self.Result(graphemes=chunk, phonemes=ps, output=output, text_index=graphemes_index)
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ opencv-python>=4.9.0.80
2
+ diffusers>=0.31.0
3
+ transformers>=4.49.0
4
+ tokenizers>=0.20.3
5
+ accelerate>=1.1.1
6
+ tqdm
7
+ imageio
8
+ easydict
9
+ ftfy
10
+ dashscope
11
+ imageio-ffmpeg
12
+ scikit-image
13
+ loguru
14
+ gradio>=5.0.0
15
+ numpy>=1.23.5,<2
16
+ xfuser>=0.4.1
17
+ pyloudnorm
18
+ optimum-quanto==0.2.6
19
+ scenedetect
20
+ moviepy==1.0.3
src/audio_analysis/torch_utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+
5
def get_mask_from_lengths(lengths, max_len=None):
    """Build a boolean mask that is True for positions below each length.

    Args:
        lengths: 1-D tensor of sequence lengths (cast to ``torch.long``).
        max_len: Mask width; defaults to the largest value in ``lengths``.

    Returns:
        Bool tensor of shape ``(len(lengths), max_len)`` on ``lengths.device``.
    """
    lengths = lengths.to(torch.long)
    if max_len is None:
        max_len = torch.max(lengths).item()

    positions = torch.arange(0, max_len).to(lengths.device)
    # Broadcast (1, max_len) against (B, 1) instead of expanding both sides.
    return positions.unsqueeze(0) < lengths.unsqueeze(1)
14
+
15
+
16
def linear_interpolation(features, seq_len):
    """Linearly resample ``features`` along dimension 1 to ``seq_len`` steps.

    Dimensions 1 and 2 are swapped around the ``F.interpolate`` call (which
    resamples the last dimension) and swapped back afterwards.
    Interpolation uses ``align_corners=True``.
    """
    channels_first = features.transpose(1, 2)
    resampled = F.interpolate(
        channels_first, size=seq_len, align_corners=True, mode='linear'
    )
    return resampled.transpose(1, 2)
20
+
src/audio_analysis/wav2vec2.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Wav2Vec2Config, Wav2Vec2Model
2
+ from transformers.modeling_outputs import BaseModelOutput
3
+
4
+ from src.audio_analysis.torch_utils import linear_interpolation
5
+
6
+ # the implementation of Wav2Vec2Model is borrowed from
7
+ # https://github.com/huggingface/transformers/blob/HEAD/src/transformers/models/wav2vec2/modeling_wav2vec2.py
8
+ # initialize our encoder with the pre-trained wav2vec 2.0 weights.
9
class Wav2Vec2Model(Wav2Vec2Model):
    """Wav2Vec2 encoder whose extracted features are resampled to ``seq_len`` steps.

    ``forward`` is ``feature_extract`` followed by ``encode``; the two stages
    are also exposed separately.
    """

    def __init__(self, config: Wav2Vec2Config):
        super().__init__(config)

    def forward(
        self,
        input_values,
        seq_len,
        attention_mask=None,
        mask_time_indices=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Extract features, resample them to ``seq_len`` and run the encoder.

        Returns:
            A ``BaseModelOutput`` when ``return_dict`` resolves to true,
            otherwise a tuple starting with the final hidden states.
        """
        extract_features = self.feature_extract(input_values, seq_len)
        return self.encode(
            extract_features,
            attention_mask=attention_mask,
            mask_time_indices=mask_time_indices,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    def feature_extract(
        self,
        input_values,
        seq_len,
    ):
        """Run the feature extractor and linearly resample its output to ``seq_len``."""
        extract_features = self.feature_extractor(input_values)
        extract_features = extract_features.transpose(1, 2)
        extract_features = linear_interpolation(extract_features, seq_len=seq_len)

        return extract_features

    def encode(
        self,
        extract_features,
        attention_mask=None,
        mask_time_indices=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Project, mask and encode already-extracted features.

        Note: every call sets ``self.config.output_attentions = True`` on the
        model's config; the ``output_attentions`` argument itself is passed
        to the encoder unchanged.
        """
        self.config.output_attentions = True

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1], attention_mask, add_adapter=False
            )

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if self.adapter is not None:
            hidden_states = self.adapter(hidden_states)

        if not return_dict:
            return (hidden_states, ) + encoder_outputs[1:]
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
src/utils.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import contextmanager
2
+
3
+ import torch
4
+
5
@contextmanager
def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
    """Context manager that places newly registered module parameters on ``device``.

    While active, ``torch.nn.Module.register_parameter`` is replaced so each
    registered parameter is re-created on ``device``. With
    ``include_buffers=True`` buffers are moved as well, and ``torch.empty``,
    ``torch.zeros``, ``torch.ones`` and ``torch.full`` are wrapped to force
    ``device=device``. All patches are undone on exit, including on error.

    Args:
        device: Target device; defaults to the meta device.
        include_buffers: Also redirect buffers and the tensor constructors.
    """
    module_cls = torch.nn.Module
    old_register_parameter = module_cls.register_parameter
    if include_buffers:
        old_register_buffer = module_cls.register_buffer

    def register_empty_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is None:
            return
        registered = module._parameters[name]
        # Carry over the parameter's own attributes when rebuilding it.
        kwargs = registered.__dict__
        kwargs["requires_grad"] = param.requires_grad
        module._parameters[name] = type(registered)(registered.to(device), **kwargs)

    def register_empty_buffer(module, name, buffer, persistent=True):
        old_register_buffer(module, name, buffer, persistent=persistent)
        if buffer is not None:
            module._buffers[name] = module._buffers[name].to(device)

    def patch_tensor_constructor(fn):
        def wrapper(*args, **kwargs):
            kwargs["device"] = device
            return fn(*args, **kwargs)

        return wrapper

    constructor_names = ["empty", "zeros", "ones", "full"] if include_buffers else []
    tensor_constructors_to_patch = {
        constructor_name: getattr(torch, constructor_name)
        for constructor_name in constructor_names
    }

    try:
        module_cls.register_parameter = register_empty_parameter
        if include_buffers:
            module_cls.register_buffer = register_empty_buffer
        for constructor_name, original_fn in tensor_constructors_to_patch.items():
            setattr(torch, constructor_name, patch_tensor_constructor(original_fn))
        yield
    finally:
        module_cls.register_parameter = old_register_parameter
        if include_buffers:
            module_cls.register_buffer = old_register_buffer
        for constructor_name, original_fn in tensor_constructors_to_patch.items():
            setattr(torch, constructor_name, original_fn)
src/vram_management/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .layers import *
src/vram_management/layers.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+
3
+ import torch
4
+
5
+ from src.utils import init_weights_on_device
6
+ import optimum.quanto.nn.qlinear as qlinear
7
+
8
def cast_to(weight, dtype, device):
    """Return a newly allocated copy of ``weight`` with ``dtype`` on ``device``.

    The destination is created with ``torch.empty_like`` and then filled with
    ``copy_``, so the result never shares storage with ``weight``.
    """
    target = torch.empty_like(weight, dtype=dtype, device=device)
    # ``copy_`` works in place and hands back the destination tensor.
    return target.copy_(weight)
12
+
13
def cast_to_device(weight, device):
    """Return ``weight`` placed on ``device`` without changing its dtype.

    Objects whose class representation mentions ``optimum.quanto`` are moved
    with their own ``.to(device)``. Everything else is copied into a freshly
    allocated tensor on ``device``.
    """
    if 'optimum.quanto' in str(weight.__class__):
        return weight.to(device)

    moved = torch.empty_like(weight, device=device)
    moved.copy_(weight)
    return moved
20
+
21
class AutoWrappedModule(torch.nn.Module):
    """Wrap a module with separate offload, onload and computation placements.

    ``state`` is ``0`` while the wrapped module sits in its offload
    dtype/device and ``1`` after :meth:`onload` has moved it. When the onload
    placement differs from the computation placement, :meth:`forward` runs a
    temporary deep copy converted to the computation placement and leaves the
    stored module untouched.
    """

    def __init__(
        self,
        module: torch.nn.Module,
        offload_dtype,
        offload_device,
        onload_dtype,
        onload_device,
        computation_dtype,
        computation_device,
    ):
        super().__init__()
        # The wrapped module starts out in the offload placement.
        self.module = module.to(dtype=offload_dtype, device=offload_device)
        self.offload_dtype, self.offload_device = offload_dtype, offload_device
        self.onload_dtype, self.onload_device = onload_dtype, onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
        self.state = 0

    def offload(self):
        """Move the module back to the offload placement if it is onloaded."""
        if self.state != 1:
            return
        placement_differs = (
            self.offload_dtype != self.onload_dtype
            or self.offload_device != self.onload_device
        )
        if placement_differs:
            self.module.to(dtype=self.offload_dtype, device=self.offload_device)
            self.state = 0

    def onload(self):
        """Move the module to the onload placement if it is offloaded."""
        if self.state != 0:
            return
        placement_differs = (
            self.offload_dtype != self.onload_dtype
            or self.offload_device != self.onload_device
        )
        if placement_differs:
            self.module.to(dtype=self.onload_dtype, device=self.onload_device)
            self.state = 1

    def forward(self, *args, **kwargs):
        """Call the wrapped module in the computation placement."""
        runs_in_place = (
            self.onload_dtype == self.computation_dtype
            and self.onload_device == self.computation_device
        )
        if runs_in_place:
            return self.module(*args, **kwargs)

        # Convert a throw-away copy so the stored module keeps its placement.
        transient = copy.deepcopy(self.module).to(
            dtype=self.computation_dtype, device=self.computation_device
        )
        return transient(*args, **kwargs)
69
+
70
+
71
+
72
class AutoWrappedQLinear(qlinear.QLinear):
    """``QLinear`` that reuses another layer's weights with device offloading.

    Only devices are managed here: the ``*_dtype`` constructor arguments are
    accepted for signature parity with the other wrappers but are not stored
    or used. ``state`` is ``0`` while offloaded and ``1`` once onloaded.
    """

    def __init__(
        self,
        module: qlinear.QLinear,
        offload_dtype,
        offload_device,
        onload_dtype,
        onload_device,
        computation_dtype,
        computation_device,
    ):
        base_kwargs = dict(
            in_features=module.in_features,
            out_features=module.out_features,
            bias=module.bias is not None,
            device=offload_device,
        )
        # Parameters registered by the base constructor are placed on the
        # meta device; the real ones are taken from ``module`` right after.
        with init_weights_on_device(device=torch.device("meta")):
            super().__init__(**base_kwargs)
        self.weight = module.weight
        self.bias = module.bias

        self.offload_device = offload_device
        self.onload_device = onload_device
        self.computation_device = computation_device
        self.state = 0

    def offload(self):
        """Move the layer to the offload device if it is currently onloaded."""
        if self.state != 1:
            return
        if self.offload_device != self.onload_device:
            self.to(device=self.offload_device)
            self.state = 0

    def onload(self):
        """Move the layer to the onload device if it is currently offloaded."""
        if self.state != 0:
            return
        if self.offload_device != self.onload_device:
            self.to(device=self.onload_device)
            self.state = 1

    def forward(self, x, *args, **kwargs):
        """Apply the linear map; extra positional/keyword args are ignored."""
        if self.onload_device == self.computation_device:
            return torch.nn.functional.linear(x, self.weight, bias=self.bias)

        # Weights live elsewhere: bring per-call copies to the compute device.
        qweight = cast_to_device(self.weight, self.computation_device)
        bias = self.bias
        if bias is not None:
            bias = cast_to_device(bias, self.computation_device)
        return torch.nn.functional.linear(x, qweight, bias)
127
+
128
class AutoWrappedLinear(torch.nn.Linear):
    """``nn.Linear`` that reuses another layer's weights with offloading.

    ``state`` is ``0`` while the layer sits in its offload dtype/device and
    ``1`` after :meth:`onload`. When the onload placement differs from the
    computation placement, :meth:`forward` casts per-call copies of the weight
    and bias instead of moving the stored parameters.
    """

    def __init__(
        self,
        module: torch.nn.Linear,
        offload_dtype,
        offload_device,
        onload_dtype,
        onload_device,
        computation_dtype,
        computation_device,
    ):
        base_kwargs = dict(
            in_features=module.in_features,
            out_features=module.out_features,
            bias=module.bias is not None,
            dtype=offload_dtype,
            device=offload_device,
        )
        # Parameters registered by the base constructor are placed on the
        # meta device; the real ones are taken from ``module`` right after.
        with init_weights_on_device(device=torch.device("meta")):
            super().__init__(**base_kwargs)
        self.weight = module.weight
        self.bias = module.bias

        self.offload_dtype, self.offload_device = offload_dtype, offload_device
        self.onload_dtype, self.onload_device = onload_dtype, onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
        self.state = 0

    def offload(self):
        """Move the layer back to the offload placement if it is onloaded."""
        if self.state != 1:
            return
        placement_differs = (
            self.offload_dtype != self.onload_dtype
            or self.offload_device != self.onload_device
        )
        if placement_differs:
            self.to(dtype=self.offload_dtype, device=self.offload_device)
            self.state = 0

    def onload(self):
        """Move the layer to the onload placement if it is offloaded."""
        if self.state != 0:
            return
        placement_differs = (
            self.offload_dtype != self.onload_dtype
            or self.offload_device != self.onload_device
        )
        if placement_differs:
            self.to(dtype=self.onload_dtype, device=self.onload_device)
            self.state = 1

    def forward(self, x, *args, **kwargs):
        """Apply the linear map; extra positional/keyword args are ignored."""
        runs_in_place = (
            self.onload_dtype == self.computation_dtype
            and self.onload_device == self.computation_device
        )
        if runs_in_place:
            return torch.nn.functional.linear(x, self.weight, self.bias)

        weight = cast_to(
            self.weight, self.computation_dtype, self.computation_device
        )
        bias = self.bias
        if bias is not None:
            bias = cast_to(bias, self.computation_dtype, self.computation_device)
        return torch.nn.functional.linear(x, weight, bias)
189
+
190
+
191
def enable_vram_management_recursively(
    model: torch.nn.Module,
    module_map: dict,
    module_config: dict,
    max_num_param=None,
    overflow_module_config: dict = None,
    total_num_param=0,
):
    """Replace matching sub-modules of ``model`` with their wrapper classes.

    Each direct child is tested against the keys of ``module_map`` in order.
    The first matching entry wins: the child is replaced in place by
    ``target_module(child, **config)`` and is not descended into. Children
    that match nothing are processed recursively.

    Args:
        model: Module whose children are rewritten in place.
        module_map: Maps a source class (anything accepted by ``isinstance``)
            to a callable ``target_module(module, **config)``.
        module_config: Keyword arguments used while within the budget.
        max_num_param: Optional parameter budget. Once wrapping a module
            would push the running total above it, that module is built with
            ``overflow_module_config`` instead.
        overflow_module_config: Keyword arguments used beyond the budget.
            Required whenever the budget can be exceeded.
        total_num_param: Running count of already wrapped parameters.

    Returns:
        The updated running parameter count.

    Raises:
        TypeError: If the budget is exceeded but ``overflow_module_config``
            is ``None``.
    """
    for name, module in model.named_children():
        for source_module, target_module in module_map.items():
            if not isinstance(module, source_module):
                continue
            num_param = sum(p.numel() for p in module.parameters())
            over_budget = (
                max_num_param is not None
                and total_num_param + num_param > max_num_param
            )
            if over_budget:
                if overflow_module_config is None:
                    # Unpacking ``**None`` would fail with an opaque message;
                    # keep the exception type but say what is missing.
                    raise TypeError(
                        "overflow_module_config must be provided when "
                        "max_num_param is set and the parameter budget is "
                        "exceeded"
                    )
                chosen_config = overflow_module_config
            else:
                chosen_config = module_config
            setattr(model, name, target_module(module, **chosen_config))
            total_num_param += num_param
            break
        else:
            # No entry matched this child: look for matches further down.
            total_num_param = enable_vram_management_recursively(
                module,
                module_map,
                module_config,
                max_num_param,
                overflow_module_config,
                total_num_param,
            )
    return total_num_param
226
+
227
+
228
def enable_vram_management(
    model: torch.nn.Module,
    module_map: dict,
    module_config: dict,
    max_num_param=None,
    overflow_module_config: dict = None,
):
    """Wrap the sub-modules of ``model`` and flag the model as managed.

    Delegates to ``enable_vram_management_recursively`` with a running
    parameter count of zero, then sets ``model.vram_management_enabled`` to
    ``True``. The model is modified in place and nothing is returned.
    """
    enable_vram_management_recursively(
        model=model,
        module_map=module_map,
        module_config=module_config,
        max_num_param=max_num_param,
        overflow_module_config=overflow_module_config,
        total_num_param=0,
    )
    model.vram_management_enabled = True
wan/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from . import configs, distributed, modules
2
+ from .first_last_frame2video import WanFLF2V
3
+ from .image2video import WanI2V
4
+ from .text2video import WanT2V
5
+ from .vace import WanVace, WanVaceMP
6
+ from .multitalk import InfiniteTalkPipeline
wan/configs/__init__.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import copy
3
+ import os
4
+
5
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
6
+
7
+ from .wan_i2v_14B import i2v_14B
8
+ from .wan_t2v_1_3B import t2v_1_3B
9
+ from .wan_t2v_14B import t2v_14B
10
+ from .wan_multitalk_14B import multitalk_14B
11
+
12
# t2i_14B uses the same settings as t2v_14B. It is deep-copied so that giving
# it its own display name below does not modify t2v_14B.
t2i_14B = copy.deepcopy(t2v_14B)
t2i_14B.__name__ = 'Config: Wan T2I 14B'

# flf2v_14B uses the same settings as i2v_14B (again via a deep copy), with an
# extra phrase prepended to the negative prompt.
flf2v_14B = copy.deepcopy(i2v_14B)
flf2v_14B.__name__ = 'Config: Wan FLF2V 14B'
flf2v_14B.sample_neg_prompt = "镜头切换," + flf2v_14B.sample_neg_prompt

# Task name -> config object. The 'vace-*' entries point at the very same
# objects as the matching 't2v-*' entries (no copy is made).
WAN_CONFIGS = {
    't2v-14B': t2v_14B,
    't2v-1.3B': t2v_1_3B,
    'i2v-14B': i2v_14B,
    't2i-14B': t2i_14B,
    'flf2v-14B': flf2v_14B,
    'vace-1.3B': t2v_1_3B,
    'vace-14B': t2v_14B,
    'infinitetalk-14B': multitalk_14B,
}

# Size key -> pair of integers.
# NOTE(review): presumably (width, height) - confirm against the callers.
SIZE_CONFIGS = {
    '720*1280': (720, 1280),
    '1280*720': (1280, 720),
    '480*832': (480, 832),
    '832*480': (832, 480),
    '1024*1024': (1024, 1024),
    'infinitetalk-480': (640, 640),
    'infinitetalk-720': (960, 960),
}

# Size key -> product of its two dimensions. Only the four rectangular keys
# are listed here.
MAX_AREA_CONFIGS = {
    '720*1280': 720 * 1280,
    '1280*720': 1280 * 720,
    '480*832': 480 * 832,
    '832*480': 832 * 480,
}

# Task name -> size keys listed for that task.
# NOTE(review): 't2i-14B' takes every key of SIZE_CONFIGS, which includes the
# 'infinitetalk-*' keys - confirm that this is intended.
SUPPORTED_SIZES = {
    't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
    't2v-1.3B': ('480*832', '832*480'),
    'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
    'flf2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
    't2i-14B': tuple(SIZE_CONFIGS.keys()),
    'vace-1.3B': ('480*832', '832*480'),
    'vace-14B': ('720*1280', '1280*720', '480*832', '832*480'),
    'infinitetalk-14B': ('infinitetalk-480', 'infinitetalk-720'),
}
wan/configs/shared_config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
#------------------------ Wan shared config ------------------------#
# Base settings object; fields are attached below as attributes.
wan_shared_cfg = EasyDict()

# t5
wan_shared_cfg.t5_model = 'umt5_xxl'
wan_shared_cfg.t5_dtype = torch.bfloat16
# NOTE(review): presumably the maximum text length in tokens - confirm.
wan_shared_cfg.text_len = 512

# transformer
wan_shared_cfg.param_dtype = torch.bfloat16

# inference
wan_shared_cfg.num_train_timesteps = 1000
wan_shared_cfg.sample_fps = 16
19
+ wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
wan/configs/wan_i2v_14B.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import wan_shared_cfg
6
+
7
#------------------------ Wan I2V 14B ------------------------#

# Start from the shared settings, then prepend an extra phrase to the shared
# negative prompt.
# NOTE(review): no transformer settings (patch_size, dim, ...) are set in
# this block - confirm whether they are supplied elsewhere.
i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
i2v_14B.update(wan_shared_cfg)
i2v_14B.sample_neg_prompt = "镜头晃动," + i2v_14B.sample_neg_prompt

# t5
i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
i2v_14B.t5_tokenizer = 'google/umt5-xxl'

# clip
i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
i2v_14B.clip_dtype = torch.float16
i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
i2v_14B.clip_tokenizer = 'xlm-roberta-large'

# vae
i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
# NOTE(review): presumably (time, height, width) strides - confirm.
i2v_14B.vae_stride = (4, 8, 8)
wan/configs/wan_multitalk_14B.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import wan_shared_cfg
6
+
7
#------------------------ Wan MultiTalk 14B ------------------------#

# Start from the shared settings, then replace the shared negative prompt
# with an English one.
multitalk_14B = EasyDict(__name__='Config: Wan MultiTalk AI2V 14B')
multitalk_14B.update(wan_shared_cfg)
multitalk_14B.sample_neg_prompt = 'bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards'

# t5
multitalk_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
multitalk_14B.t5_tokenizer = 'google/umt5-xxl'

# clip
multitalk_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
multitalk_14B.clip_dtype = torch.float16
multitalk_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
multitalk_14B.clip_tokenizer = 'xlm-roberta-large'

# vae
multitalk_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
# NOTE(review): presumably (time, height, width) strides - confirm.
multitalk_14B.vae_stride = (4, 8, 8)

# transformer
multitalk_14B.patch_size = (1, 2, 2)
multitalk_14B.dim = 5120
multitalk_14B.ffn_dim = 13824
multitalk_14B.freq_dim = 256
multitalk_14B.num_heads = 40
multitalk_14B.num_layers = 40
multitalk_14B.window_size = (-1, -1)
multitalk_14B.qk_norm = True
multitalk_14B.cross_attn_norm = True
multitalk_14B.eps = 1e-6
wan/configs/wan_t2v_14B.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import wan_shared_cfg
5
+
6
#------------------------ Wan T2V 14B ------------------------#

# Start from the shared settings and add the model-specific fields below.
t2v_14B = EasyDict(__name__='Config: Wan T2V 14B')
t2v_14B.update(wan_shared_cfg)

# t5
t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
t2v_14B.t5_tokenizer = 'google/umt5-xxl'

# vae
t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
# NOTE(review): presumably (time, height, width) strides - confirm.
t2v_14B.vae_stride = (4, 8, 8)

# transformer
t2v_14B.patch_size = (1, 2, 2)
t2v_14B.dim = 5120
t2v_14B.ffn_dim = 13824
t2v_14B.freq_dim = 256
t2v_14B.num_heads = 40
t2v_14B.num_layers = 40
t2v_14B.window_size = (-1, -1)
t2v_14B.qk_norm = True
t2v_14B.cross_attn_norm = True
t2v_14B.eps = 1e-6
wan/configs/wan_t2v_1_3B.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import wan_shared_cfg
5
+
6
#------------------------ Wan T2V 1.3B ------------------------#

# Start from the shared settings and add the model-specific fields below.
t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B')
t2v_1_3B.update(wan_shared_cfg)

# t5
t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'

# vae
t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
# NOTE(review): presumably (time, height, width) strides - confirm.
t2v_1_3B.vae_stride = (4, 8, 8)

# transformer
t2v_1_3B.patch_size = (1, 2, 2)
t2v_1_3B.dim = 1536
t2v_1_3B.ffn_dim = 8960
t2v_1_3B.freq_dim = 256
t2v_1_3B.num_heads = 12
t2v_1_3B.num_layers = 30
t2v_1_3B.window_size = (-1, -1)
t2v_1_3B.qk_norm = True
t2v_1_3B.cross_attn_norm = True
t2v_1_3B.eps = 1e-6
wan/distributed/__init__.py ADDED
File without changes
wan/distributed/fsdp.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ from functools import partial
4
+
5
+ import torch
6
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
8
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
9
+ from torch.distributed.utils import _free_storage
10
+
11
+
12
def shard_model(
    model,
    device_id,
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.float32,
    buffer_dtype=torch.float32,
    process_group=None,
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    sync_module_states=True,
):
    """Wrap ``model`` in FSDP, auto-wrapping each entry of ``model.blocks``.

    NOTE: mixed precision is currently not passed to FSDP, so ``param_dtype``,
    ``reduce_dtype`` and ``buffer_dtype`` are accepted but have no effect.

    Args:
        model: Module to shard; must expose a ``blocks`` container.
        device_id: Forwarded to FSDP as ``device_id``.
        process_group: Forwarded to FSDP as ``process_group``.
        sharding_strategy: Forwarded to FSDP as ``sharding_strategy``.
        sync_module_states: Forwarded to FSDP as ``sync_module_states``.

    Returns:
        The FSDP-wrapped model.
    """

    def _belongs_to_blocks(submodule):
        # Only the members of ``model.blocks`` get their own FSDP unit.
        return submodule in model.blocks

    wrap_policy = partial(lambda_auto_wrap_policy, lambda_fn=_belongs_to_blocks)
    sharded = FSDP(
        module=model,
        process_group=process_group,
        sharding_strategy=sharding_strategy,
        auto_wrap_policy=wrap_policy,
        device_id=device_id,
        sync_module_states=sync_module_states,
    )
    return sharded
35
+
36
+
37
+ def free_model(model):
38
+ for m in model.modules():
39
+ if isinstance(m, FSDP):
40
+ _free_storage(m._handle.flat_param.data)
41
+ del model
42
+ gc.collect()
43
+ torch.cuda.empty_cache()
wan/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.cuda.amp as amp
6
+ from xfuser.core.distributed import (
7
+ get_sequence_parallel_rank,
8
+ get_sequence_parallel_world_size,
9
+ get_sp_group,
10
+ )
11
+ from einops import rearrange
12
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
13
+ import xformers.ops
14
+
15
+ from ..modules.model import sinusoidal_embedding_1d
16
+ from ..utils.multitalk_utils import get_attn_map_with_target, split_token_counts_and_frame_ids, normalize_and_scale
17
+ from ..modules.attention import SingleStreamAttention, SingleStreamMutiAttention
18
+
19
+
20
def pad_freqs(original_tensor, target_len):
    """Pad a 3-D tensor with ones along dim 0 up to ``target_len`` rows.

    The padding has the same dtype and device as ``original_tensor``.
    """
    current_len, dim1, dim2 = original_tensor.shape
    filler = original_tensor.new_ones(target_len - current_len, dim1, dim2)
    return torch.cat((original_tensor, filler), dim=0)
31
+
32
+
33
@amp.autocast(enabled=False)
def rope_apply(x, grid_sizes, freqs):
    """
    Apply rotary position embedding to this rank's slice of the sequence.

    x:          [B, L, N, C].
    grid_sizes: [B, 3].
    freqs:      [M, C // 2].
    """
    local_len, num_heads, half_dim = x.size(1), x.size(2), x.size(3) // 2
    third = half_dim // 3
    # Split the frequency table along dim 1 into the parts used for the
    # frame, height and width axes (complex-valued rotation factors).
    freqs_f, freqs_h, freqs_w = freqs.split(
        [half_dim - 2 * third, third, third], dim=1)

    rotated = []
    for idx, (f, h, w) in enumerate(grid_sizes.tolist()):
        full_len = f * h * w

        # View consecutive channel pairs as complex numbers: [L, N, C/2].
        x_complex = torch.view_as_complex(
            x[idx, :local_len].to(torch.float64).reshape(
                local_len, num_heads, -1, 2))
        # One rotation factor per (frame, row, column) position, flattened
        # to [full_len, 1, C/2].
        grid_freqs = torch.cat(
            [
                freqs_f[:f].view(f, 1, 1, -1).expand(f, h, w, -1),
                freqs_h[:h].view(1, h, 1, -1).expand(f, h, w, -1),
                freqs_w[:w].view(1, 1, w, -1).expand(f, h, w, -1),
            ],
            dim=-1).reshape(full_len, 1, -1)

        # Pad to the length covered by all ranks together, then keep only
        # the window that belongs to this rank.
        sp_size = get_sequence_parallel_world_size()
        sp_rank = get_sequence_parallel_rank()
        grid_freqs = pad_freqs(grid_freqs, local_len * sp_size)
        start = sp_rank * local_len
        rank_freqs = grid_freqs[start:start + local_len, :, :]

        x_rot = torch.view_as_real(x_complex * rank_freqs).flatten(2)
        rotated.append(torch.cat([x_rot, x[idx, local_len:]]))
    return torch.stack(rotated).float()
72
+
73
+
74
def usp_dit_forward_vace(self, x, vace_context, seq_len, kwargs):
    """Run the VACE blocks on this rank's slice and collect their hints.

    Each entry of ``vace_context`` is patch-embedded, flattened to a token
    sequence and zero-padded to ``seq_len``. The batch is then split along
    the sequence dimension across the sequence-parallel ranks. Every block
    in ``self.vace_blocks`` is called with ``x`` plus ``kwargs``.

    Returns:
        The list of per-block skip outputs.
    """
    # embeddings
    embedded = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
    padded = []
    for u in embedded:
        tokens = u.flatten(2).transpose(1, 2)
        zeros = tokens.new_zeros(1, seq_len - tokens.size(1), tokens.size(2))
        padded.append(torch.cat([tokens, zeros], dim=1))
    c = torch.cat(padded)

    # arguments: entries of ``kwargs`` override ``x`` on a name clash
    block_kwargs = dict(x=x)
    block_kwargs.update(kwargs)

    # Context Parallel
    c = torch.chunk(
        c, get_sequence_parallel_world_size(),
        dim=1)[get_sequence_parallel_rank()]

    hints = []
    for vace_block in self.vace_blocks:
        c, c_skip = vace_block(c, **block_kwargs)
        hints.append(c_skip)
    return hints
97
+
98
+
99
def usp_dit_forward(
    self,
    x,
    t,
    context,
    seq_len,
    vace_context=None,
    vace_context_scale=1.0,
    clip_fea=None,
    y=None,
):
    """
    Sequence-parallel forward pass of the diffusion transformer.

    x:              A list of videos each with shape [C, T, H, W].
    t:              [B].
    context:        A list of text embeddings each with shape [L, C].

    ``vace_context`` and ``vace_context_scale`` are accepted but not used by
    this implementation. Returns a list of float32 tensors, one per video.
    """
    is_vace = self.model_type == 'vace'
    if self.model_type == 'i2v':
        assert clip_fea is not None and y is not None

    # params
    device = self.patch_embedding.weight.device
    if self.freqs.device != device:
        self.freqs = self.freqs.to(device)

    if not is_vace and y is not None:
        x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]

    # embeddings
    x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
    grid_sizes = torch.stack(
        [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
    x = [u.flatten(2).transpose(1, 2) for u in x]
    seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
    assert seq_lens.max() <= seq_len
    # Zero-pad every token sequence to ``seq_len`` and batch them.
    padded_tokens = []
    for u in x:
        zeros = u.new_zeros(1, seq_len - u.size(1), u.size(2))
        padded_tokens.append(torch.cat([u, zeros], dim=1))
    x = torch.cat(padded_tokens)

    # time embeddings
    with amp.autocast(dtype=torch.float32):
        e = self.time_embedding(
            sinusoidal_embedding_1d(self.freq_dim, t).float())
        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
        assert e.dtype == torch.float32 and e0.dtype == torch.float32

    # context: zero-pad every text embedding to ``self.text_len`` rows
    context_lens = None
    padded_context = []
    for u in context:
        zeros = u.new_zeros(self.text_len - u.size(0), u.size(1))
        padded_context.append(torch.cat([u, zeros]))
    context = self.text_embedding(torch.stack(padded_context))

    if not is_vace and clip_fea is not None:
        context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
        context = torch.concat([context_clip, context], dim=1)

    # arguments
    kwargs = dict(
        e=e0,
        seq_lens=seq_lens,
        grid_sizes=grid_sizes,
        freqs=self.freqs,
        context=context,
        context_lens=context_lens)

    # Context Parallel: keep only this rank's slice of the sequence
    x = torch.chunk(
        x, get_sequence_parallel_world_size(),
        dim=1)[get_sequence_parallel_rank()]

    for block in self.blocks:
        x = block(x, **kwargs)

    # head
    x = self.head(x, e)

    # Context Parallel: reassemble the full sequence from all ranks
    x = get_sp_group().all_gather(x, dim=1)

    # unpatchify
    x = self.unpatchify(x, grid_sizes)
    return [u.float() for u in x]
182
+
183
+
184
def usp_attn_forward(self,
                     x,
                     seq_lens,
                     grid_sizes,
                     freqs,
                     dtype=torch.bfloat16):
    """Self-attention over a sequence-parallel slice via xFuser.

    Query and key are normalised and rotary-embedded. q/k/v that are not
    already float16/bfloat16 are cast to ``dtype`` before attention.
    ``seq_lens`` is currently unused.
    """
    b, s = x.shape[:2]
    n, d = self.num_heads, self.head_dim

    def to_half(tensor):
        if tensor.dtype in (torch.float16, torch.bfloat16):
            return tensor
        return tensor.to(dtype)

    # query, key, value
    q = self.norm_q(self.q(x)).view(b, s, n, d)
    k = self.norm_k(self.k(x)).view(b, s, n, d)
    v = self.v(x).view(b, s, n, d)

    q = rope_apply(q, grid_sizes, freqs)
    k = rope_apply(k, grid_sizes, freqs)

    # TODO: use unpadded q, k, v for attention (trim each sample to
    # seq_lens // world_size) and pad the output again afterwards.

    attention = xFuserLongContextAttention()
    x = attention(
        None,
        query=to_half(q),
        key=to_half(k),
        value=to_half(v),
        window_size=self.window_size)

    # output
    return self.o(x.flatten(2))
228
+
229
+
230
+
231
+
232
# TeaCache keeps separate state for the three passes that alternate on
# ``cnt % 3``: conditional, text-dropped and unconditional.
_TEACACHE_BRANCHES = ("cond", "drop_text", "uncond")


def _teacache_should_compute(model, modulated_inp, branch):
    """Decide whether the transformer blocks must run for this call.

    Reads and updates ``accumulated_rel_l1_distance_<branch>`` and
    ``previous_e0_<branch>`` on ``model``. Returns ``True`` when the blocks
    must be evaluated and ``False`` when the cached residual can be reused.
    """
    acc_name = f"accumulated_rel_l1_distance_{branch}"
    prev_name = f"previous_e0_{branch}"

    if model.cnt < model.ret_steps or model.cnt >= model.cutoff_steps:
        # Outside the cacheable range of steps: always compute.
        should_calc = True
        setattr(model, acc_name, 0)
    else:
        rescale_func = np.poly1d(model.coefficients)
        previous = getattr(model, prev_name)
        rel_l1 = ((modulated_inp - previous).abs().mean()
                  / previous.abs().mean()).cpu().item()
        accumulated = getattr(model, acc_name) + rescale_func(rel_l1)
        should_calc = not (accumulated < model.teacache_thresh)
        # The accumulator restarts whenever a real computation happens.
        setattr(model, acc_name, 0 if should_calc else accumulated)

    setattr(model, prev_name, modulated_inp.clone())
    return should_calc


def usp_dit_forward_multitalk(
    self,
    x,
    t,
    context,
    seq_len,
    clip_fea=None,
    y=None,
    audio=None,
    ref_target_masks=None,
):
    """
    Sequence-parallel forward pass of the audio-conditioned transformer.

    x:              A list of videos each with shape [C, T, H, W].
    t:              [B].
    context:        A list of text embeddings each with shape [L, C].

    ``clip_fea`` and ``y`` are required (asserted). ``audio`` is used
    unconditionally and therefore must be given as well.
    ``ref_target_masks`` is optional; when it is omitted the blocks receive
    ``ref_target_masks=None``.

    Returns the stacked, un-patchified output as a float32 tensor.
    """

    assert clip_fea is not None and y is not None
    # params
    device = self.patch_embedding.weight.device
    if self.freqs.device != device:
        self.freqs = self.freqs.to(device)

    # Token grid size per frame, used to resize the reference masks.
    _, _, H, W = x[0].shape
    N_h = H // self.patch_size[1]
    N_w = W // self.patch_size[2]

    if y is not None:
        x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
    x[0] = x[0].to(context[0].dtype)

    # embeddings
    x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
    grid_sizes = torch.stack(
        [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
    x = [u.flatten(2).transpose(1, 2) for u in x]
    seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
    assert seq_lens.max() <= seq_len
    x = torch.cat([
        torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
        for u in x
    ])

    # time embeddings
    with amp.autocast(dtype=torch.float32):
        e = self.time_embedding(
            sinusoidal_embedding_1d(self.freq_dim, t).float())
        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
        assert e.dtype == torch.float32 and e0.dtype == torch.float32

    # context
    context_lens = None
    context = self.text_embedding(
        torch.stack([
            torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
            for u in context
        ]))

    if clip_fea is not None:
        context_clip = self.img_emb(clip_fea)
        context = torch.concat([context_clip, context], dim=1)

    # get audio token
    audio_cond = audio.to(device=x.device, dtype=x.dtype)
    # The first entry along dim 1 is handled separately; the remaining
    # entries are grouped in runs of ``self.vae_scale``.
    first_frame_audio_emb_s = audio_cond[:, :1, ...]
    latter_frame_audio_emb = audio_cond[:, 1:, ...]
    latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=self.vae_scale)
    middle_index = self.audio_window // 2
    # Per group: the first entry keeps the window up to and including the
    # middle, the last entry keeps it from the middle onwards, and entries
    # in between keep only the middle position.
    latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...]
    latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
    latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...]
    latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
    latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...]
    latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
    latter_frame_audio_emb_s = torch.concat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2)
    audio_embedding = self.audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s)
    human_num = len(audio_embedding)
    audio_embedding = torch.concat(audio_embedding.split(1), dim=2).to(x.dtype)

    # convert ref_target_masks to token_ref_target_masks
    # Defaults to None so that the keyword arguments below can always be
    # built, even when no masks were supplied.
    token_ref_target_masks = None
    if ref_target_masks is not None:
        ref_target_masks = ref_target_masks.unsqueeze(0).to(torch.float32)
        token_ref_target_masks = nn.functional.interpolate(ref_target_masks, size=(N_h, N_w), mode='nearest')
        token_ref_target_masks = token_ref_target_masks.squeeze(0)
        token_ref_target_masks = (token_ref_target_masks > 0)
        token_ref_target_masks = token_ref_target_masks.view(token_ref_target_masks.shape[0], -1)
        token_ref_target_masks = token_ref_target_masks.to(x.dtype)

    # TeaCache: decide whether this call may reuse its cached residual.
    should_calc = True
    residual_name = None
    if self.enable_teacache:
        modulated_inp = e0 if self.use_ret_steps else e
        branch = _TEACACHE_BRANCHES[self.cnt % 3]
        residual_name = f"previous_residual_{branch}"
        should_calc = _teacache_should_compute(self, modulated_inp, branch)

    # Context Parallel
    x = torch.chunk(
        x, get_sequence_parallel_world_size(),
        dim=1)[get_sequence_parallel_rank()]

    # arguments
    kwargs = dict(
        e=e0,
        seq_lens=seq_lens,
        grid_sizes=grid_sizes,
        freqs=self.freqs,
        context=context,
        context_lens=context_lens,
        audio_embedding=audio_embedding,
        ref_target_masks=token_ref_target_masks,
        human_num=human_num,
    )

    if self.enable_teacache:
        if should_calc:
            ori_x = x.clone()
            for block in self.blocks:
                x = block(x, **kwargs)
            # Remember what the blocks added so a later call can replay it.
            setattr(self, residual_name, x - ori_x)
        else:
            x += getattr(self, residual_name)
    else:
        for block in self.blocks:
            x = block(x, **kwargs)

    # head
    x = self.head(x, e)

    # Context Parallel
    x = get_sp_group().all_gather(x, dim=1)

    # unpatchify
    x = self.unpatchify(x, grid_sizes)
    if self.enable_teacache:
        self.cnt += 1
        if self.cnt >= self.num_steps:
            self.cnt = 0

    return torch.stack(x).float()
426
+
427
+
428
def usp_attn_forward_multitalk(self,
                               x,
                               seq_lens,
                               grid_sizes,
                               freqs,
                               dtype=torch.bfloat16,
                               ref_target_masks=None):
    """Sequence-parallel self-attention forward that also returns a reference attention map.

    Intended to be bound onto a self-attention module (``self``) that
    provides ``q``/``k``/``v``/``o`` projections, ``norm_q``/``norm_k``,
    ``num_heads``, ``head_dim`` and ``window_size``.

    Args:
        x: Hidden states; the first two dimensions are used as (batch, sequence).
        seq_lens: Accepted for signature compatibility; not read by this function.
        grid_sizes: Forwarded to ``rope_apply``; ``grid_sizes[0]`` is forwarded
            to ``get_attn_map_with_target``.
        freqs: Rotary frequencies forwarded to ``rope_apply``.
        dtype: Dtype that q/k/v are cast to when they are not already
            float16 or bfloat16.
        ref_target_masks: Forwarded to ``get_attn_map_with_target``.

    Returns:
        Tuple ``(x, x_ref_attn_map)``: the output-projected attention result
        and the map produced by ``get_attn_map_with_target``.
    """
    batch, seq = x.shape[:2]
    heads, head_dim = self.num_heads, self.head_dim

    def to_half(tensor):
        # Leave tensors that are already half precision untouched.
        if tensor.dtype in (torch.float16, torch.bfloat16):
            return tensor
        return tensor.to(dtype)

    # Project to per-head query / key / value.
    query = self.norm_q(self.q(x)).view(batch, seq, heads, head_dim)
    key = self.norm_k(self.k(x)).view(batch, seq, heads, head_dim)
    value = self.v(x).view(batch, seq, heads, head_dim)

    # Rotary position embedding on query and key only.
    query = rope_apply(query, grid_sizes, freqs)
    key = rope_apply(key, grid_sizes, freqs)

    attn_out = xFuserLongContextAttention()(
        None,
        query=to_half(query),
        key=to_half(key),
        value=to_half(value),
        window_size=self.window_size)

    # Merge heads and apply the output projection.
    out = self.o(attn_out.flatten(2))

    # The reference attention map is computed without tracking gradients.
    with torch.no_grad():
        x_ref_attn_map = get_attn_map_with_target(
            query.type_as(out),
            key.type_as(out),
            grid_sizes[0],
            ref_target_masks=ref_target_masks,
            enable_sp=True)

    return out, x_ref_attn_map
470
+
471
+
472
+
473
+
474
def usp_crossattn_multi_forward_multitalk(self,
                                          x: torch.Tensor,
                                          encoder_hidden_states: torch.Tensor,
                                          shape=None,
                                          x_ref_attn_map=None,
                                          human_num=None) -> torch.Tensor:
    """Sequence-parallel audio cross-attention with per-person 1D rotary positions.

    Intended to be bound onto an audio cross-attention module (``self``).
    When ``human_num == 1`` the call is delegated to the parent class of
    ``SingleStreamMutiAttention``. Otherwise each visual token receives a
    1D rotary position derived from ``x_ref_attn_map`` and the audio tokens
    receive the midpoint of the matching person's rotary range.

    Args:
        x: Visual hidden states, unpacked as ``(B, N, C)``.
        encoder_hidden_states: Audio embeddings indexed as ``B T N C``
            (the original annotation gave ``1, 21, 64, C`` -- TODO confirm).
        shape: Triple ``(N_t, N_h, N_w)`` describing the latent grid.
        x_ref_attn_map: Reference attention map; rows 0 and 1 are read as the
            two speakers in the multi-person path.
        human_num: Number of speakers. NOTE(review): the multi-person path
            below only reads two rows of ``x_ref_attn_map`` -- confirm that
            values above 2 are never passed.

    Returns:
        Tensor of shape ``(B, N, C)`` after the output projection and dropout.
    """
    n_frames, n_rows, n_cols = shape
    world_size = get_sequence_parallel_world_size()
    rank = get_sequence_parallel_rank()
    tokens_per_frame = 32

    # Keep only the audio frames that belong to this rank's visual tokens.
    visual_seqlen, frame_ids = split_token_counts_and_frame_ids(
        n_frames, n_rows * n_cols, world_size, rank)
    encoder_hidden_states = encoder_hidden_states[
        :, min(frame_ids):max(frame_ids) + 1, ...]
    encoder_hidden_states = rearrange(
        encoder_hidden_states, "B T N C -> B (T N) C")
    num_audio_frames = len(frame_ids)
    kv_seq = [tokens_per_frame * human_num] * num_audio_frames

    # Single speaker: no person-specific positions are needed.
    if human_num == 1:
        return super(SingleStreamMutiAttention, self).forward(
            x, encoder_hidden_states, shape, enable_sp=True, kv_seq=kv_seq)

    # Query projection for the visual hidden states.
    B, N, C = x.shape
    query = self.q_linear(x)
    query = query.view(
        (B, N, self.num_heads, self.head_dim)).permute((0, 2, 1, 3))
    if self.qk_norm:
        query = self.q_norm(query)

    # Per-row extrema of the reference map, gathered across all ranks so
    # every rank normalizes with the same range.
    local_max = x_ref_attn_map.max(1).values[:, None, None]
    local_min = x_ref_attn_map.min(1).values[:, None, None]
    extrema = torch.cat([local_max, local_min], dim=2)
    extrema = get_sp_group().all_gather(extrema, dim=1)

    h1_max = extrema[0, :, 0].max()
    h1_min = extrema[0, :, 1].min()
    h2_max = extrema[1, :, 0].max()
    h2_min = extrema[1, :, 1].min()

    num_tokens = x_ref_attn_map.size(1)
    human1 = normalize_and_scale(
        x_ref_attn_map[0], (h1_min, h1_max),
        (self.rope_h1[0], self.rope_h1[1]))
    human2 = normalize_and_scale(
        x_ref_attn_map[1], (h2_min, h2_max),
        (self.rope_h2[0], self.rope_h2[1]))
    background = torch.full(
        (num_tokens,), self.rope_bak, dtype=human1.dtype).to(human1.device)

    # For each token pick the position of whichever row dominates the map.
    winner = x_ref_attn_map.argmax(dim=0)
    candidates = torch.stack([human1, human2, background], dim=1)
    token_pos = candidates[range(num_tokens), winner]  # N
    query = self.rope_1d(query, token_pos)

    # Key / value projection for the audio embeddings.
    kv = self.kv_linear(encoder_hidden_states)
    kv = kv.view(
        (B, encoder_hidden_states.size(1), 2, self.num_heads,
         self.head_dim)).permute((2, 0, 3, 1, 4))
    audio_k, audio_v = kv.unbind(0)  # B H N C
    if self.qk_norm:
        audio_k = self.add_k_norm(audio_k)

    # Position embedding for the audio tokens: midpoint of each person's range.
    frame_pos = torch.zeros(
        tokens_per_frame * human_num, dtype=audio_k.dtype).to(audio_k.device)
    frame_pos[:tokens_per_frame] = (self.rope_h1[0] + self.rope_h1[1]) / 2
    frame_pos[tokens_per_frame:] = (self.rope_h2[0] + self.rope_h2[1]) / 2
    audio_pos = torch.concat([frame_pos] * num_audio_frames, dim=0)
    audio_k = self.rope_1d(audio_k, audio_pos)

    # Block-diagonal attention built from the per-block sequence lengths.
    query = rearrange(query, "B H M K -> B M H K")
    audio_k = rearrange(audio_k, "B H M K -> B M H K")
    audio_v = rearrange(audio_v, "B H M K -> B M H K")
    attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(
        visual_seqlen, kv_seq)
    attended = xformers.ops.memory_efficient_attention(
        query, audio_k, audio_v, attn_bias=attn_bias, op=None)
    attended = rearrange(attended, "B M H K -> B H M K")

    # Merge heads, then output projection and dropout.
    attended = attended.transpose(1, 2).reshape((B, N, C))
    return self.proj_drop(self.proj(attended))
wan/first_last_frame2video.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ import logging
4
+ import math
5
+ import os
6
+ import random
7
+ import sys
8
+ import types
9
+ from contextlib import contextmanager
10
+ from functools import partial
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.cuda.amp as amp
15
+ import torch.distributed as dist
16
+ import torchvision.transforms.functional as TF
17
+ from tqdm import tqdm
18
+
19
+ from .distributed.fsdp import shard_model
20
+ from .modules.clip import CLIPModel
21
+ from .modules.model import WanModel
22
+ from .modules.t5 import T5EncoderModel
23
+ from .modules.vae import WanVAE
24
+ from .utils.fm_solvers import (
25
+ FlowDPMSolverMultistepScheduler,
26
+ get_sampling_sigmas,
27
+ retrieve_timesteps,
28
+ )
29
+ from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
30
+
31
+
32
class WanFLF2V:
    """First-and-last-frame-to-video pipeline.

    Holds the T5 text encoder, the CLIP image encoder, the VAE and the
    ``WanModel`` diffusion transformer, and exposes :meth:`generate` to
    sample a video conditioned on a prompt, a first frame and a last frame.
    """

    def __init__(
        self,
        config,
        checkpoint_dir,
        device_id=0,
        rank=0,
        t5_fsdp=False,
        dit_fsdp=False,
        use_usp=False,
        t5_cpu=False,
        init_on_cpu=True,
    ):
        r"""
        Initializes the first-last-frame-to-video generation model components.

        Args:
            config (EasyDict):
                Object containing model parameters initialized from config.py
            checkpoint_dir (`str`):
                Path to directory containing model checkpoints
            device_id (`int`,  *optional*, defaults to 0):
                Id of target GPU device
            rank (`int`,  *optional*, defaults to 0):
                Process rank for distributed training
            t5_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for T5 model
            dit_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for DiT model
            use_usp (`bool`, *optional*, defaults to False):
                Enable distribution strategy of USP.
            t5_cpu (`bool`, *optional*, defaults to False):
                Whether to place T5 model on CPU. Only works without t5_fsdp.
            init_on_cpu (`bool`, *optional*, defaults to True):
                Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
        """
        self.device = torch.device(f"cuda:{device_id}")
        self.config = config
        self.rank = rank
        self.use_usp = use_usp
        self.t5_cpu = t5_cpu

        self.num_train_timesteps = config.num_train_timesteps
        self.param_dtype = config.param_dtype

        shard_fn = partial(shard_model, device_id=device_id)
        # The text encoder is always created on CPU; generate() moves it to
        # the GPU on demand unless t5_cpu is set.
        self.text_encoder = T5EncoderModel(
            text_len=config.text_len,
            dtype=config.t5_dtype,
            device=torch.device('cpu'),
            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
            shard_fn=shard_fn if t5_fsdp else None,
        )

        self.vae_stride = config.vae_stride
        self.patch_size = config.patch_size
        self.vae = WanVAE(
            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
            device=self.device)

        self.clip = CLIPModel(
            dtype=config.clip_dtype,
            device=self.device,
            checkpoint_path=os.path.join(checkpoint_dir,
                                         config.clip_checkpoint),
            tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))

        logging.info(f"Creating WanModel from {checkpoint_dir}")
        self.model = WanModel.from_pretrained(checkpoint_dir)
        self.model.eval().requires_grad_(False)

        # Any distributed strategy overrides the request to stay on CPU.
        if t5_fsdp or dit_fsdp or use_usp:
            init_on_cpu = False

        if use_usp:
            from xfuser.core.distributed import get_sequence_parallel_world_size

            from .distributed.xdit_context_parallel import (
                usp_attn_forward,
                usp_dit_forward,
            )
            # Swap in the sequence-parallel forward implementations.
            for block in self.model.blocks:
                block.self_attn.forward = types.MethodType(
                    usp_attn_forward, block.self_attn)
            self.model.forward = types.MethodType(usp_dit_forward, self.model)
            self.sp_size = get_sequence_parallel_world_size()
        else:
            self.sp_size = 1

        if dist.is_initialized():
            dist.barrier()
        if dit_fsdp:
            self.model = shard_fn(self.model)
        elif not init_on_cpu:
            self.model.to(self.device)

        self.sample_neg_prompt = config.sample_neg_prompt

    def generate(self,
                 input_prompt,
                 first_frame,
                 last_frame,
                 max_area=720 * 1280,
                 frame_num=81,
                 shift=16,
                 sample_solver='unipc',
                 sampling_steps=50,
                 guide_scale=5.5,
                 n_prompt="",
                 seed=-1,
                 offload_model=True):
        r"""
        Generates video frames from input first-last frame and text prompt using diffusion process.

        Args:
            input_prompt (`str`):
                Text prompt for content generation.
            first_frame (PIL.Image.Image):
                First frame of the video, as a PIL image.
            last_frame (PIL.Image.Image):
                Last frame of the video, as a PIL image.
                [NOTE] If the sizes of first_frame and last_frame are mismatched, last_frame will be
                resized (aspect ratio preserved) and center-cropped to match first_frame.
            max_area (`int`, *optional*, defaults to 720*1280):
                Maximum pixel area for latent space calculation. Controls video resolution scaling
            frame_num (`int`, *optional*, defaults to 81):
                How many frames to sample from a video. The number should be 4n+1
            shift (`float`, *optional*, defaults to 16):
                Noise schedule shift parameter. Affects temporal dynamics
                [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
            sample_solver (`str`, *optional*, defaults to 'unipc'):
                Solver used to sample the video. Either 'unipc' or 'dpm++'.
            sampling_steps (`int`, *optional*, defaults to 50):
                Number of diffusion sampling steps. Higher values improve quality but slow generation
            guide_scale (`float`, *optional*, defaults 5.5):
                Classifier-free guidance scale. Controls prompt adherence vs. creativity
            n_prompt (`str`, *optional*, defaults to ""):
                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
            seed (`int`, *optional*, defaults to -1):
                Random seed for noise generation. If -1, use random seed
            offload_model (`bool`, *optional*, defaults to True):
                If True, offloads models to CPU during generation to save VRAM

        Returns:
            torch.Tensor:
                Generated video frames tensor on rank 0, `None` on every other rank.
                Dimensions: (C, N, H, W) where:
                - C: Color channels (3 for RGB)
                - N: Number of frames (`frame_num`)
                - H: Frame height (from max_area)
                - W: Frame width (from max_area)

        Raises:
            NotImplementedError: If `sample_solver` is neither 'unipc' nor 'dpm++'.
        """
        # PIL reports size as (width, height).
        first_frame_size = first_frame.size
        last_frame_size = last_frame.size
        first_frame = TF.to_tensor(first_frame).sub_(0.5).div_(0.5).to(
            self.device)
        last_frame = TF.to_tensor(last_frame).sub_(0.5).div_(0.5).to(
            self.device)

        F = frame_num
        first_frame_h, first_frame_w = first_frame.shape[1:]
        aspect_ratio = first_frame_h / first_frame_w
        lat_h = round(
            np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
            self.patch_size[1] * self.patch_size[1])
        lat_w = round(
            np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
            self.patch_size[2] * self.patch_size[2])
        # From here on these hold the pixel size the VAE input is resized to.
        first_frame_h = lat_h * self.vae_stride[1]
        first_frame_w = lat_w * self.vae_stride[2]
        if first_frame_size != last_frame_size:
            src_w, src_h = last_frame_size
            dst_w, dst_h = first_frame_size
            # 1. resize so the last frame fully covers the first frame while
            #    keeping its aspect ratio. torchvision sizes are (height, width).
            last_frame_resize_ratio = max(dst_w / src_w, dst_h / src_h)
            last_frame = TF.resize(last_frame, [
                round(src_h * last_frame_resize_ratio),
                round(src_w * last_frame_resize_ratio),
            ])
            # 2. center crop to the first frame's pixel size
            last_frame = TF.center_crop(last_frame, [dst_h, dst_w])

        max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
            self.patch_size[1] * self.patch_size[2])
        # Round up to a multiple of the sequence-parallel world size.
        max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size

        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)
        noise = torch.randn(
            16, (F - 1) // 4 + 1,
            lat_h,
            lat_w,
            dtype=torch.float32,
            generator=seed_g,
            device=self.device)

        # Conditioning mask: 1 on the given first and last frames, 0 elsewhere.
        # It spans `frame_num` frames so that it lines up with the VAE latent
        # built from `frame_num` frames below.
        msk = torch.ones(1, F, lat_h, lat_w, device=self.device)
        msk[:, 1:-1] = 0
        # Repeat the first frame 4x so the frame axis folds into groups of 4.
        msk = torch.concat([
            torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
        ],
                           dim=1)
        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
        msk = msk.transpose(1, 2)[0]

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt

        # preprocess
        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
            if offload_model:
                self.text_encoder.model.cpu()
        else:
            context = self.text_encoder([input_prompt], torch.device('cpu'))
            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
            context = [t.to(self.device) for t in context]
            context_null = [t.to(self.device) for t in context_null]

        self.clip.model.to(self.device)
        clip_context = self.clip.visual(
            [first_frame[:, None, :, :], last_frame[:, None, :, :]])
        if offload_model:
            self.clip.model.cpu()

        # VAE input: first frame, F - 2 blank frames, last frame.
        y = self.vae.encode([
            torch.concat([
                torch.nn.functional.interpolate(
                    first_frame[None].cpu(),
                    size=(first_frame_h, first_frame_w),
                    mode='bicubic').transpose(0, 1),
                torch.zeros(3, F - 2, first_frame_h, first_frame_w),
                torch.nn.functional.interpolate(
                    last_frame[None].cpu(),
                    size=(first_frame_h, first_frame_w),
                    mode='bicubic').transpose(0, 1),
            ],
                         dim=1).to(self.device)
        ])[0]
        y = torch.concat([msk, y])

        @contextmanager
        def noop_no_sync():
            yield

        # Use the model's own no_sync context when it provides one.
        no_sync = getattr(self.model, 'no_sync', noop_no_sync)

        # Model outputs and latents are kept here between model calls.
        pred_device = torch.device('cpu') if offload_model else self.device

        # evaluation mode
        with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():

            if sample_solver == 'unipc':
                sample_scheduler = FlowUniPCMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sample_scheduler.set_timesteps(
                    sampling_steps, device=self.device, shift=shift)
                timesteps = sample_scheduler.timesteps
            elif sample_solver == 'dpm++':
                sample_scheduler = FlowDPMSolverMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
                timesteps, _ = retrieve_timesteps(
                    sample_scheduler,
                    device=self.device,
                    sigmas=sampling_sigmas)
            else:
                raise NotImplementedError("Unsupported solver.")

            # sample videos
            latent = noise

            arg_c = {
                'context': [context[0]],
                'clip_fea': clip_context,
                'seq_len': max_seq_len,
                'y': [y],
            }

            arg_null = {
                'context': context_null,
                'clip_fea': clip_context,
                'seq_len': max_seq_len,
                'y': [y],
            }

            if offload_model:
                torch.cuda.empty_cache()

            self.model.to(self.device)
            for t in tqdm(timesteps):
                latent_model_input = [latent.to(self.device)]
                timestep = torch.stack([t]).to(self.device)

                noise_pred_cond = self.model(
                    latent_model_input, t=timestep, **arg_c)[0].to(pred_device)
                if offload_model:
                    torch.cuda.empty_cache()
                noise_pred_uncond = self.model(
                    latent_model_input, t=timestep,
                    **arg_null)[0].to(pred_device)
                if offload_model:
                    torch.cuda.empty_cache()
                # Classifier-free guidance.
                noise_pred = noise_pred_uncond + guide_scale * (
                    noise_pred_cond - noise_pred_uncond)

                latent = latent.to(pred_device)

                temp_x0 = sample_scheduler.step(
                    noise_pred.unsqueeze(0),
                    t,
                    latent.unsqueeze(0),
                    return_dict=False,
                    generator=seed_g)[0]
                latent = temp_x0.squeeze(0)

                x0 = [latent.to(self.device)]
                del latent_model_input, timestep

            if offload_model:
                self.model.cpu()
                torch.cuda.empty_cache()

            # Only rank 0 decodes the latent into frames.
            if self.rank == 0:
                videos = self.vae.decode(x0)

        del noise, latent
        del sample_scheduler
        if offload_model:
            gc.collect()
            torch.cuda.synchronize()
        if dist.is_initialized():
            dist.barrier()

        return videos[0] if self.rank == 0 else None
wan/image2video.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ import logging
4
+ import math
5
+ import os
6
+ import random
7
+ import sys
8
+ import types
9
+ from contextlib import contextmanager
10
+ from functools import partial
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.cuda.amp as amp
15
+ import torch.distributed as dist
16
+ import torchvision.transforms.functional as TF
17
+ from tqdm import tqdm
18
+
19
+ from .distributed.fsdp import shard_model
20
+ from .modules.clip import CLIPModel
21
+ from .modules.model import WanModel
22
+ from .modules.t5 import T5EncoderModel
23
+ from .modules.vae import WanVAE
24
+ from .utils.fm_solvers import (
25
+ FlowDPMSolverMultistepScheduler,
26
+ get_sampling_sigmas,
27
+ retrieve_timesteps,
28
+ )
29
+ from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
30
+
31
+
32
class WanI2V:
    """Image-to-video pipeline.

    Holds the T5 text encoder, the CLIP image encoder, the VAE and the
    ``WanModel`` diffusion transformer, and exposes :meth:`generate` to
    sample a video conditioned on a prompt and a single input image.
    """

    def __init__(
        self,
        config,
        checkpoint_dir,
        device_id=0,
        rank=0,
        t5_fsdp=False,
        dit_fsdp=False,
        use_usp=False,
        t5_cpu=False,
        init_on_cpu=True,
    ):
        r"""
        Initializes the image-to-video generation model components.

        Args:
            config (EasyDict):
                Object containing model parameters initialized from config.py
            checkpoint_dir (`str`):
                Path to directory containing model checkpoints
            device_id (`int`,  *optional*, defaults to 0):
                Id of target GPU device
            rank (`int`,  *optional*, defaults to 0):
                Process rank for distributed training
            t5_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for T5 model
            dit_fsdp (`bool`, *optional*, defaults to False):
                Enable FSDP sharding for DiT model
            use_usp (`bool`, *optional*, defaults to False):
                Enable distribution strategy of USP.
            t5_cpu (`bool`, *optional*, defaults to False):
                Whether to place T5 model on CPU. Only works without t5_fsdp.
            init_on_cpu (`bool`, *optional*, defaults to True):
                Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
        """
        self.device = torch.device(f"cuda:{device_id}")
        self.config = config
        self.rank = rank
        self.use_usp = use_usp
        self.t5_cpu = t5_cpu

        self.num_train_timesteps = config.num_train_timesteps
        self.param_dtype = config.param_dtype

        shard_fn = partial(shard_model, device_id=device_id)
        # The text encoder is always created on CPU; generate() moves it to
        # the GPU on demand unless t5_cpu is set.
        self.text_encoder = T5EncoderModel(
            text_len=config.text_len,
            dtype=config.t5_dtype,
            device=torch.device('cpu'),
            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
            shard_fn=shard_fn if t5_fsdp else None,
        )

        self.vae_stride = config.vae_stride
        self.patch_size = config.patch_size
        self.vae = WanVAE(
            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
            device=self.device)

        self.clip = CLIPModel(
            dtype=config.clip_dtype,
            device=self.device,
            checkpoint_path=os.path.join(checkpoint_dir,
                                         config.clip_checkpoint),
            tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))

        logging.info(f"Creating WanModel from {checkpoint_dir}")
        self.model = WanModel.from_pretrained(checkpoint_dir)
        self.model.eval().requires_grad_(False)

        # Any distributed strategy overrides the request to stay on CPU.
        if t5_fsdp or dit_fsdp or use_usp:
            init_on_cpu = False

        if use_usp:
            from xfuser.core.distributed import get_sequence_parallel_world_size

            from .distributed.xdit_context_parallel import (
                usp_attn_forward,
                usp_dit_forward,
            )
            # Swap in the sequence-parallel forward implementations.
            for block in self.model.blocks:
                block.self_attn.forward = types.MethodType(
                    usp_attn_forward, block.self_attn)
            self.model.forward = types.MethodType(usp_dit_forward, self.model)
            self.sp_size = get_sequence_parallel_world_size()
        else:
            self.sp_size = 1

        if dist.is_initialized():
            dist.barrier()
        if dit_fsdp:
            self.model = shard_fn(self.model)
        elif not init_on_cpu:
            self.model.to(self.device)

        self.sample_neg_prompt = config.sample_neg_prompt

    def generate(self,
                 input_prompt,
                 img,
                 max_area=720 * 1280,
                 frame_num=81,
                 shift=5.0,
                 sample_solver='unipc',
                 sampling_steps=40,
                 guide_scale=5.0,
                 n_prompt="",
                 seed=-1,
                 offload_model=True):
        r"""
        Generates video frames from input image and text prompt using diffusion process.

        Args:
            input_prompt (`str`):
                Text prompt for content generation.
            img (PIL.Image.Image):
                Input image; it is converted to a [3, H, W] tensor internally.
            max_area (`int`, *optional*, defaults to 720*1280):
                Maximum pixel area for latent space calculation. Controls video resolution scaling
            frame_num (`int`, *optional*, defaults to 81):
                How many frames to sample from a video. The number should be 4n+1
            shift (`float`, *optional*, defaults to 5.0):
                Noise schedule shift parameter. Affects temporal dynamics
                [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
            sample_solver (`str`, *optional*, defaults to 'unipc'):
                Solver used to sample the video. Either 'unipc' or 'dpm++'.
            sampling_steps (`int`, *optional*, defaults to 40):
                Number of diffusion sampling steps. Higher values improve quality but slow generation
            guide_scale (`float`, *optional*, defaults 5.0):
                Classifier-free guidance scale. Controls prompt adherence vs. creativity
            n_prompt (`str`, *optional*, defaults to ""):
                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
            seed (`int`, *optional*, defaults to -1):
                Random seed for noise generation. If -1, use random seed
            offload_model (`bool`, *optional*, defaults to True):
                If True, offloads models to CPU during generation to save VRAM

        Returns:
            torch.Tensor:
                Generated video frames tensor on rank 0, `None` on every other rank.
                Dimensions: (C, N, H, W) where:
                - C: Color channels (3 for RGB)
                - N: Number of frames (`frame_num`)
                - H: Frame height (from max_area)
                - W: Frame width (from max_area)

        Raises:
            NotImplementedError: If `sample_solver` is neither 'unipc' nor 'dpm++'.
        """
        img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device)

        F = frame_num
        h, w = img.shape[1:]
        aspect_ratio = h / w
        lat_h = round(
            np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
            self.patch_size[1] * self.patch_size[1])
        lat_w = round(
            np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
            self.patch_size[2] * self.patch_size[2])
        # From here on h / w hold the pixel size the VAE input is resized to.
        h = lat_h * self.vae_stride[1]
        w = lat_w * self.vae_stride[2]

        max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
            self.patch_size[1] * self.patch_size[2])
        # Round up to a multiple of the sequence-parallel world size.
        max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size

        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
        seed_g = torch.Generator(device=self.device)
        seed_g.manual_seed(seed)
        noise = torch.randn(
            16, (F - 1) // 4 + 1,
            lat_h,
            lat_w,
            dtype=torch.float32,
            generator=seed_g,
            device=self.device)

        # Conditioning mask: 1 on the given first frame, 0 elsewhere.
        # It spans `frame_num` frames so that it lines up with the VAE latent
        # built from `frame_num` frames below.
        msk = torch.ones(1, F, lat_h, lat_w, device=self.device)
        msk[:, 1:] = 0
        # Repeat the first frame 4x so the frame axis folds into groups of 4.
        msk = torch.concat([
            torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
        ],
                           dim=1)
        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
        msk = msk.transpose(1, 2)[0]

        if n_prompt == "":
            n_prompt = self.sample_neg_prompt

        # preprocess
        if not self.t5_cpu:
            self.text_encoder.model.to(self.device)
            context = self.text_encoder([input_prompt], self.device)
            context_null = self.text_encoder([n_prompt], self.device)
            if offload_model:
                self.text_encoder.model.cpu()
        else:
            context = self.text_encoder([input_prompt], torch.device('cpu'))
            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
            context = [t.to(self.device) for t in context]
            context_null = [t.to(self.device) for t in context_null]

        self.clip.model.to(self.device)
        clip_context = self.clip.visual([img[:, None, :, :]])
        if offload_model:
            self.clip.model.cpu()

        # VAE input: the resized image followed by F - 1 blank frames.
        y = self.vae.encode([
            torch.concat([
                torch.nn.functional.interpolate(
                    img[None].cpu(), size=(h, w), mode='bicubic').transpose(
                        0, 1),
                torch.zeros(3, F - 1, h, w)
            ],
                         dim=1).to(self.device)
        ])[0]
        y = torch.concat([msk, y])

        @contextmanager
        def noop_no_sync():
            yield

        # Use the model's own no_sync context when it provides one.
        no_sync = getattr(self.model, 'no_sync', noop_no_sync)

        # Model outputs and latents are kept here between model calls.
        pred_device = torch.device('cpu') if offload_model else self.device

        # evaluation mode
        with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():

            if sample_solver == 'unipc':
                sample_scheduler = FlowUniPCMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sample_scheduler.set_timesteps(
                    sampling_steps, device=self.device, shift=shift)
                timesteps = sample_scheduler.timesteps
            elif sample_solver == 'dpm++':
                sample_scheduler = FlowDPMSolverMultistepScheduler(
                    num_train_timesteps=self.num_train_timesteps,
                    shift=1,
                    use_dynamic_shifting=False)
                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
                timesteps, _ = retrieve_timesteps(
                    sample_scheduler,
                    device=self.device,
                    sigmas=sampling_sigmas)
            else:
                raise NotImplementedError("Unsupported solver.")

            # sample videos
            latent = noise

            arg_c = {
                'context': [context[0]],
                'clip_fea': clip_context,
                'seq_len': max_seq_len,
                'y': [y],
            }

            arg_null = {
                'context': context_null,
                'clip_fea': clip_context,
                'seq_len': max_seq_len,
                'y': [y],
            }

            if offload_model:
                torch.cuda.empty_cache()

            self.model.to(self.device)
            for t in tqdm(timesteps):
                latent_model_input = [latent.to(self.device)]
                timestep = torch.stack([t]).to(self.device)

                noise_pred_cond = self.model(
                    latent_model_input, t=timestep, **arg_c)[0].to(pred_device)
                if offload_model:
                    torch.cuda.empty_cache()
                noise_pred_uncond = self.model(
                    latent_model_input, t=timestep,
                    **arg_null)[0].to(pred_device)
                if offload_model:
                    torch.cuda.empty_cache()
                # Classifier-free guidance.
                noise_pred = noise_pred_uncond + guide_scale * (
                    noise_pred_cond - noise_pred_uncond)

                latent = latent.to(pred_device)

                temp_x0 = sample_scheduler.step(
                    noise_pred.unsqueeze(0),
                    t,
                    latent.unsqueeze(0),
                    return_dict=False,
                    generator=seed_g)[0]
                latent = temp_x0.squeeze(0)

                x0 = [latent.to(self.device)]
                del latent_model_input, timestep

            if offload_model:
                self.model.cpu()
                torch.cuda.empty_cache()

            # Only rank 0 decodes the latent into frames.
            if self.rank == 0:
                videos = self.vae.decode(x0)

        del noise, latent
        del sample_scheduler
        if offload_model:
            gc.collect()
            torch.cuda.synchronize()
        if dist.is_initialized():
            dist.barrier()

        return videos[0] if self.rank == 0 else None
wan/modules/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .attention import flash_attention
2
+ from .model import WanModel
3
+ from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model
4
+ from .tokenizers import HuggingfaceTokenizer
5
+ from .vace_model import VaceWanModel
6
+ from .vae import WanVAE
7
+
8
+ __all__ = [
9
+ 'WanVAE',
10
+ 'WanModel',
11
+ 'VaceWanModel',
12
+ 'T5Model',
13
+ 'T5Encoder',
14
+ 'T5Decoder',
15
+ 'T5EncoderModel',
16
+ 'HuggingfaceTokenizer',
17
+ 'flash_attention',
18
+ ]
wan/modules/attention.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange, repeat
5
+ from ..utils.multitalk_utils import RotaryPositionalEmbedding1D, normalize_and_scale, split_token_counts_and_frame_ids
6
+ from xfuser.core.distributed import (
7
+ get_sequence_parallel_rank,
8
+ get_sequence_parallel_world_size,
9
+ get_sp_group,
10
+ )
11
+ import xformers.ops
12
+
13
+ try:
14
+ import flash_attn_interface
15
+ FLASH_ATTN_3_AVAILABLE = True
16
+ except ModuleNotFoundError:
17
+ FLASH_ATTN_3_AVAILABLE = False
18
+
19
+ try:
20
+ import flash_attn
21
+ FLASH_ATTN_2_AVAILABLE = True
22
+ except ModuleNotFoundError:
23
+ FLASH_ATTN_2_AVAILABLE = False
24
+
25
+ import warnings
26
+
27
+ __all__ = [
28
+ 'flash_attention',
29
+ 'attention',
30
+ ]
31
+
32
+
33
def flash_attention(
    q,
    k,
    v,
    q_lens=None,
    k_lens=None,
    dropout_p=0.,
    softmax_scale=None,
    q_scale=None,
    causal=False,
    window_size=(-1, -1),
    deterministic=False,
    dtype=torch.bfloat16,
    version=None,
):
    """
    Variable-length attention backed by FlashAttention 3 or FlashAttention 2.

    q:              [B, Lq, Nq, C1].
    k:              [B, Lk, Nk, C1].
    v:              [B, Lk, Nk, C2]. Nq must be divisible by Nk.
    q_lens:         [B]. Valid query length per sample; full length when None.
    k_lens:         [B]. Valid key/value length per sample; full length when None.
    dropout_p:      float. Dropout probability (only forwarded to FA2).
    softmax_scale:  float. The scaling of QK^T before applying softmax.
    q_scale:        optional factor multiplied into q before attention.
    causal:         bool. Whether to apply causal attention mask.
    window_size:    (left right). If not (-1, -1), apply sliding window local
                    attention (only forwarded to FA2).
    deterministic:  bool. If True, slightly slower and uses more memory.
    dtype:          torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
    version:        None or 3 selects FA3 when it is importable; any other
                    value, or FA3 being unavailable, selects FA2.

    Returns the attention output unflattened to [B, Lq, ...] and cast back to
    the dtype q had on entry.
    """
    supported = (torch.float16, torch.bfloat16)
    assert dtype in supported
    assert q.device.type == 'cuda' and q.size(-1) <= 256

    # remember the shapes/dtype of the inputs before they are packed
    batch, len_q, len_k = q.size(0), q.size(1), k.size(1)
    result_dtype = q.dtype

    def to_half(t):
        # leave fp16/bf16 tensors alone, cast everything else to `dtype`
        if t.dtype in supported:
            return t
        return t.to(dtype)

    def pack(t, lens):
        # drop the padding of each sample and concatenate along the token axis
        return torch.cat([sample[:n] for sample, n in zip(t, lens)])

    def cumulative(lens):
        # [0, l0, l0 + l1, ...] as int32 on the query device
        return torch.cat([lens.new_zeros([1]), lens]).cumsum(
            0, dtype=torch.int32).to(q.device, non_blocking=True)

    # queries
    if q_lens is None:
        q = to_half(q.flatten(0, 1))
        q_lens = torch.tensor(
            [len_q] * batch, dtype=torch.int32).to(
                device=q.device, non_blocking=True)
    else:
        q = to_half(pack(q, q_lens))

    # keys and values
    if k_lens is None:
        k = to_half(k.flatten(0, 1))
        v = to_half(v.flatten(0, 1))
        k_lens = torch.tensor(
            [len_k] * batch, dtype=torch.int32).to(
                device=k.device, non_blocking=True)
    else:
        k = to_half(pack(k, k_lens))
        v = to_half(pack(v, k_lens))

    # q and k follow the dtype of v
    q = q.to(v.dtype)
    k = k.to(v.dtype)

    if q_scale is not None:
        q = q * q_scale

    if version == 3 and not FLASH_ATTN_3_AVAILABLE:
        warnings.warn(
            'Flash attention 3 is not available, use flash attention 2 instead.'
        )

    if FLASH_ATTN_3_AVAILABLE and (version is None or version == 3):
        # Note: dropout_p, window_size are not supported in FA3 now.
        packed_out = flash_attn_interface.flash_attn_varlen_func(
            q=q,
            k=k,
            v=v,
            cu_seqlens_q=cumulative(q_lens),
            cu_seqlens_k=cumulative(k_lens),
            seqused_q=None,
            seqused_k=None,
            max_seqlen_q=len_q,
            max_seqlen_k=len_k,
            softmax_scale=softmax_scale,
            causal=causal,
            deterministic=deterministic)[0]
    else:
        assert FLASH_ATTN_2_AVAILABLE
        packed_out = flash_attn.flash_attn_varlen_func(
            q=q,
            k=k,
            v=v,
            cu_seqlens_q=cumulative(q_lens),
            cu_seqlens_k=cumulative(k_lens),
            max_seqlen_q=len_q,
            max_seqlen_k=len_k,
            dropout_p=dropout_p,
            softmax_scale=softmax_scale,
            causal=causal,
            window_size=window_size,
            deterministic=deterministic)

    # restore the batch axis and the caller's dtype
    return packed_out.unflatten(0, (batch, len_q)).type(result_dtype)
140
+
141
+
142
def attention(
    q,
    k,
    v,
    q_lens=None,
    k_lens=None,
    dropout_p=0.,
    softmax_scale=None,
    q_scale=None,
    causal=False,
    window_size=(-1, -1),
    deterministic=False,
    dtype=torch.bfloat16,
    fa_version=None,
):
    """
    Attention entry point: FlashAttention when installed, PyTorch SDPA otherwise.

    q, k, v:        [B, L, N, C] tensors (same layout as `flash_attention`).
    q_lens, k_lens: [B] valid lengths. Only honoured by the flash path; the
                    SDPA fallback warns and attends over the padding as well.
    dropout_p:      float. Dropout probability.
    softmax_scale:  float. The scaling of QK^T before applying softmax.
    q_scale:        optional factor multiplied into q before attention.
    causal:         bool. Whether to apply causal attention mask.
    window_size, deterministic, fa_version: forwarded to `flash_attention`
                    only; the SDPA fallback does not use them.
    dtype:          flash path: cast target for non-half inputs;
                    fallback: q/k/v are cast to it unconditionally.

    Returns a [B, Lq, N, C] tensor. The fallback output keeps `dtype`.
    """
    if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
        return flash_attention(
            q=q,
            k=k,
            v=v,
            q_lens=q_lens,
            k_lens=k_lens,
            dropout_p=dropout_p,
            softmax_scale=softmax_scale,
            q_scale=q_scale,
            causal=causal,
            window_size=window_size,
            deterministic=deterministic,
            dtype=dtype,
            version=fa_version,
        )

    if q_lens is not None or k_lens is not None:
        warnings.warn(
            'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.'
        )
    attn_mask = None

    # SDPA expects [B, N, L, C]
    q = q.transpose(1, 2).to(dtype)
    k = k.transpose(1, 2).to(dtype)
    v = v.transpose(1, 2).to(dtype)

    # Mirror the flash path, which scales q before attention; previously the
    # fallback silently ignored q_scale.
    if q_scale is not None:
        q = q * q_scale

    # Forward the softmax scale only when the caller asked for one, so the
    # default call does not depend on the `scale` keyword being supported.
    extra = {}
    if softmax_scale is not None:
        extra['scale'] = softmax_scale

    out = torch.nn.functional.scaled_dot_product_attention(
        q,
        k,
        v,
        attn_mask=attn_mask,
        is_causal=causal,
        dropout_p=dropout_p,
        **extra)

    # back to [B, L, N, C]
    out = out.transpose(1, 2).contiguous()
    return out
189
+
190
+
191
class SingleStreamAttention(nn.Module):
    """
    Cross-attention where `x` provides the queries and `encoder_hidden_states`
    provides keys and values, evaluated with xformers memory-efficient attention.

    Without sequence parallelism the tokens of `x` are regrouped per frame
    (``B (N_t S) C -> (B N_t) S C``) before attention and restored afterwards.
    """

    def __init__(
        self,
        dim: int,
        encoder_hidden_states_dim: int,
        num_heads: int,
        qkv_bias: bool,
        qk_norm: bool,
        norm_layer: nn.Module,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        eps: float = 1e-6,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"

        head_dim = dim // num_heads
        self.dim = dim
        self.encoder_hidden_states_dim = encoder_hidden_states_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = head_dim**-0.5
        self.qk_norm = qk_norm

        def make_norm(**norm_kwargs):
            # per-head normalisation, or a pass-through when qk_norm is off
            if qk_norm:
                return norm_layer(head_dim, **norm_kwargs)
            return nn.Identity()

        # NOTE: sub-modules are created in the original order so that
        # parameter registration and random initialisation are unchanged.
        self.q_linear = nn.Linear(dim, dim, bias=qkv_bias)

        self.q_norm = make_norm(eps=eps)
        self.k_norm = make_norm(eps=eps)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.kv_linear = nn.Linear(encoder_hidden_states_dim, dim * 2, bias=qkv_bias)

        # these two use the norm layer's default eps
        self.add_q_norm = make_norm()
        self.add_k_norm = make_norm()

    def forward(self, x: torch.Tensor, encoder_hidden_states: torch.Tensor, shape=None, enable_sp=False, kv_seq=None) -> torch.Tensor:
        """
        x:                      [B, N_t * S, C] visual tokens.
        encoder_hidden_states:  [B', N_a, encoder_hidden_states_dim]; B' must
                                match the batch of `x` after the optional
                                per-frame regrouping.
        shape:                  (N_t, N_h, N_w); required despite the None default.
        enable_sp:              use a block-diagonal mask for sequence parallelism
                                instead of regrouping tokens per frame.
        kv_seq:                 key/value sequence lengths; required when
                                enable_sp is True.
        """
        N_t, N_h, N_w = shape
        if not enable_sp:
            x = rearrange(x, "B (N_t S) C -> (B N_t) S C", N_t=N_t)

        # queries from the visual stream: [B, H, N, K]
        B, N, C = x.shape
        heads, head_dim = self.num_heads, self.head_dim
        q = self.q_linear(x).view((B, N, heads, head_dim)).permute((0, 2, 1, 3))
        if self.qk_norm:
            q = self.q_norm(q)

        # keys / values from the encoder stream: each [B, H, N_a, K]
        _, N_a, _ = encoder_hidden_states.shape
        kv = self.kv_linear(encoder_hidden_states)
        kv = kv.view((B, N_a, 2, heads, head_dim)).permute((2, 0, 3, 1, 4))
        encoder_k, encoder_v = kv.unbind(0)
        if self.qk_norm:
            encoder_k = self.add_k_norm(encoder_k)

        # xformers expects [B, M, H, K]
        q = rearrange(q, "B H M K -> B M H K")
        encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
        encoder_v = rearrange(encoder_v, "B H M K -> B M H K")

        attn_bias = None
        if enable_sp:
            # context parallel: restrict every visual chunk to its own kv chunk
            sp_size = get_sequence_parallel_world_size()
            sp_rank = get_sequence_parallel_rank()
            visual_seqlen, _ = split_token_counts_and_frame_ids(N_t, N_h * N_w, sp_size, sp_rank)
            assert kv_seq is not None, "kv_seq should not be None."
            attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(visual_seqlen, kv_seq)

        out = xformers.ops.memory_efficient_attention(q, encoder_k, encoder_v, attn_bias=attn_bias, op=None,)
        out = rearrange(out, "B M H K -> B H M K")

        # merge heads and project
        out = out.transpose(1, 2).reshape((B, N, C))
        out = self.proj_drop(self.proj(out))

        if not enable_sp:
            # undo the per-frame regrouping
            out = rearrange(out, "(B N_t) S C -> B (N_t S) C", N_t=N_t)

        return out
281
+
282
class SingleStreamMutiAttention(SingleStreamAttention):
    """
    Multi-person variant of `SingleStreamAttention`.

    For a single person it defers to the parent implementation. Otherwise a 1-D
    rotary embedding is applied to queries and keys: query positions come from
    `x_ref_attn_map`, key positions are fixed per half of the encoder tokens.
    """

    def __init__(
        self,
        dim: int,
        encoder_hidden_states_dim: int,
        num_heads: int,
        qkv_bias: bool,
        qk_norm: bool,
        norm_layer: nn.Module,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        eps: float = 1e-6,
        class_range: int = 24,
        class_interval: int = 4,
    ) -> None:
        super().__init__(
            dim=dim,
            encoder_hidden_states_dim=encoder_hidden_states_dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            norm_layer=norm_layer,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            eps=eps,
        )
        self.class_interval = class_interval
        self.class_range = class_range
        # position ranges: first person at the low end, second person at the
        # high end, fallback position in the middle
        self.rope_h1 = (0, self.class_interval)
        self.rope_h2 = (self.class_range - self.class_interval, self.class_range)
        self.rope_bak = int(self.class_range // 2)

        self.rope_1d = RotaryPositionalEmbedding1D(self.head_dim)

    def forward(self,
                x: torch.Tensor,
                encoder_hidden_states: torch.Tensor,
                shape=None,
                x_ref_attn_map=None,
                human_num=None) -> torch.Tensor:
        """
        x:                      [B, N_t * S, C] visual tokens.
        encoder_hidden_states:  encoder tokens with a leading singleton axis
                                that is squeezed away here.
        shape:                  (N_t, N_h, N_w).
        x_ref_attn_map:         per-token reference attention scores, indexed as
                                [person, token] — TODO confirm the exact row
                                layout against the caller.
        human_num:              number of persons; 1 selects the parent path.
        """
        encoder_hidden_states = encoder_hidden_states.squeeze(0)
        if human_num == 1:
            return super().forward(x, encoder_hidden_states, shape)

        N_t, _, _ = shape
        x = rearrange(x, "B (N_t S) C -> (B N_t) S C", N_t=N_t)

        # queries from the visual stream: [(B N_t), H, S, K]
        B, N, C = x.shape
        heads, head_dim = self.num_heads, self.head_dim
        q = self.q_linear(x).view((B, N, heads, head_dim)).permute((0, 2, 1, 3))
        if self.qk_norm:
            q = self.q_norm(q)

        # per-person value range of the reference map
        row_max = x_ref_attn_map.max(1).values[:, None, None]
        row_min = x_ref_attn_map.min(1).values[:, None, None]
        extremes = torch.cat([row_max, row_min], dim=2)

        human1_max_value, human1_min_value = extremes[0, :, 0].max(), extremes[0, :, 1].min()
        human2_max_value, human2_min_value = extremes[1, :, 0].max(), extremes[1, :, 1].min()

        # map each person's scores into that person's position range
        human1 = normalize_and_scale(x_ref_attn_map[0], (human1_min_value, human1_max_value), (self.rope_h1[0], self.rope_h1[1]))
        human2 = normalize_and_scale(x_ref_attn_map[1], (human2_min_value, human2_max_value), (self.rope_h2[0], self.rope_h2[1]))
        num_tokens = x_ref_attn_map.size(1)
        back = torch.full((num_tokens,), self.rope_bak, dtype=human1.dtype).to(human1.device)

        # every token takes the position of the row it responds to most
        max_indices = x_ref_attn_map.argmax(dim=0)
        normalized_map = torch.stack([human1, human2, back], dim=1)
        normalized_pos = normalized_map[range(num_tokens), max_indices]  # N

        # rotary embedding over the full token sequence of the queries
        q = rearrange(q, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t)
        q = self.rope_1d(q, normalized_pos)
        q = rearrange(q, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t)

        # keys / values from the encoder stream
        _, N_a, _ = encoder_hidden_states.shape
        kv = self.kv_linear(encoder_hidden_states)
        kv = kv.view((B, N_a, 2, heads, head_dim)).permute((2, 0, 3, 1, 4))
        encoder_k, encoder_v = kv.unbind(0)
        if self.qk_norm:
            encoder_k = self.add_k_norm(encoder_k)

        # first half of the encoder tokens sits at the centre of the first
        # range, second half at the centre of the second range
        per_frame = torch.zeros(N_a, dtype=encoder_k.dtype).to(encoder_k.device)
        half_len = per_frame.size(0) // 2
        per_frame[:half_len] = (self.rope_h1[0] + self.rope_h1[1]) / 2
        per_frame[half_len:] = (self.rope_h2[0] + self.rope_h2[1]) / 2
        encoder_pos = torch.concat([per_frame] * N_t, dim=0)

        encoder_k = rearrange(encoder_k, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t)
        encoder_k = self.rope_1d(encoder_k, encoder_pos)
        encoder_k = rearrange(encoder_k, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t)

        # xformers expects [B, M, H, K]
        q = rearrange(q, "B H M K -> B M H K")
        encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
        encoder_v = rearrange(encoder_v, "B H M K -> B M H K")
        out = xformers.ops.memory_efficient_attention(q, encoder_k, encoder_v, attn_bias=None, op=None,)
        out = rearrange(out, "B M H K -> B H M K")

        # merge heads and project
        out = out.transpose(1, 2).reshape((B, N, C))
        out = self.proj_drop(self.proj(out))

        # restore the original token layout
        out = rearrange(out, "(B N_t) S C -> B (N_t S) C", N_t=N_t)

        return out
wan/modules/clip.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip''
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torchvision.transforms as T
10
+
11
+ from .attention import flash_attention
12
+ from .tokenizers import HuggingfaceTokenizer
13
+ from .xlm_roberta import XLMRoberta
14
+
15
+ __all__ = [
16
+ 'XLMRobertaCLIP',
17
+ 'clip_xlm_roberta_vit_h_14',
18
+ 'CLIPModel',
19
+ ]
20
+
21
+
22
def pos_interpolate(pos, seq_len):
    """
    Resize a positional embedding table to `seq_len` entries.

    pos:     [1, L, C]. The trailing src_grid * src_grid entries form a square
             grid; any leading entries (e.g. a class token) are kept as is.
    seq_len: target number of entries.

    Returns `pos` itself when the length already matches; otherwise the grid
    part is bicubically resampled (in float32) to the target grid size.
    """
    current_len = pos.size(1)
    if current_len == seq_len:
        return pos

    src_grid = int(math.sqrt(current_len))
    tar_grid = int(math.sqrt(seq_len))
    # entries that are not part of the square grid
    n = current_len - src_grid * src_grid

    extra_tokens = pos[:, :n]
    grid = pos[:, n:].float().reshape(1, src_grid, src_grid, -1).permute(0, 3, 1, 2)
    resized = F.interpolate(
        grid,
        size=(tar_grid, tar_grid),
        mode='bicubic',
        align_corners=False)
    grid_tokens = resized.flatten(2).transpose(1, 2)
    return torch.cat([extra_tokens, grid_tokens], dim=1)
39
+
40
+
41
class QuickGELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
45
+
46
+
47
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's tensor type."""

    def forward(self, x):
        normalized = super().forward(x.float())
        return normalized.type_as(x)
51
+
52
+
53
class SelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection, run through `flash_attention`."""

    def __init__(self,
                 dim,
                 num_heads,
                 causal=False,
                 attn_dropout=0.0,
                 proj_dropout=0.0):
        assert dim % num_heads == 0
        super().__init__()
        self.dim, self.num_heads = dim, num_heads
        self.head_dim = dim // num_heads
        self.causal = causal
        self.attn_dropout = attn_dropout
        self.proj_dropout = proj_dropout

        # projections: fused q/k/v, then the output projection
        self.to_qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        """
        x:   [B, L, C].
        """
        b, s, c = x.size()
        n, d = self.num_heads, self.head_dim

        # split the fused projection into q, k, v of shape [B, L, N, D]
        qkv = self.to_qkv(x).view(b, s, 3, n, d)
        q, k, v = qkv.unbind(2)

        # attention dropout is only active in training mode
        drop_p = self.attn_dropout if self.training else 0.0
        attended = flash_attention(q, k, v, dropout_p=drop_p, causal=self.causal, version=2)
        attended = attended.reshape(b, s, c)

        # output projection followed by dropout
        projected = self.proj(attended)
        return F.dropout(projected, self.proj_dropout, self.training)
92
+
93
+
94
class SwiGLU(nn.Module):
    """Gated feed-forward block: ``fc3(silu(fc1(x)) * fc2(x))``."""

    def __init__(self, dim, mid_dim):
        super().__init__()
        self.dim, self.mid_dim = dim, mid_dim

        # gate branch, value branch, output projection
        self.fc1 = nn.Linear(dim, mid_dim)
        self.fc2 = nn.Linear(dim, mid_dim)
        self.fc3 = nn.Linear(mid_dim, dim)

    def forward(self, x):
        gate = F.silu(self.fc1(x))
        return self.fc3(gate * self.fc2(x))
110
+
111
+
112
class AttentionBlock(nn.Module):
    """
    Transformer block: self-attention and an MLP, each with a residual
    connection. `post_norm` selects whether the norms are applied to the
    sub-layer output (True) or to its input (False).
    """

    def __init__(self,
                 dim,
                 mlp_ratio,
                 num_heads,
                 post_norm=False,
                 causal=False,
                 activation='quick_gelu',
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 norm_eps=1e-5):
        assert activation in ['quick_gelu', 'gelu', 'swi_glu']
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.post_norm = post_norm
        self.causal = causal
        self.norm_eps = norm_eps

        hidden_dim = int(dim * mlp_ratio)

        # attention sub-layer
        self.norm1 = LayerNorm(dim, eps=norm_eps)
        self.attn = SelfAttention(dim, num_heads, causal, attn_dropout,
                                  proj_dropout)

        # feed-forward sub-layer
        self.norm2 = LayerNorm(dim, eps=norm_eps)
        if activation == 'swi_glu':
            self.mlp = SwiGLU(dim, hidden_dim)
        else:
            act = QuickGELU() if activation == 'quick_gelu' else nn.GELU()
            self.mlp = nn.Sequential(
                nn.Linear(dim, hidden_dim),
                act,
                nn.Linear(hidden_dim, dim),
                nn.Dropout(proj_dropout))

    def forward(self, x):
        if self.post_norm:
            # normalise each sub-layer's output before the residual add
            x = x + self.norm1(self.attn(x))
            return x + self.norm2(self.mlp(x))

        # normalise each sub-layer's input
        x = x + self.attn(self.norm1(x))
        return x + self.mlp(self.norm2(x))
154
+
155
+
156
class AttentionPool(nn.Module):
    """
    Attention pooling: a learned class embedding queries the token sequence and
    the single attended token is refined by a residual MLP.
    """

    def __init__(self,
                 dim,
                 mlp_ratio,
                 num_heads,
                 activation='gelu',
                 proj_dropout=0.0,
                 norm_eps=1e-5):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.proj_dropout = proj_dropout
        self.norm_eps = norm_eps

        hidden_dim = int(dim * mlp_ratio)

        # learned query token and projections
        gain = 1.0 / math.sqrt(dim)
        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
        self.to_q = nn.Linear(dim, dim)
        self.to_kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)

        # residual MLP applied to the pooled token
        self.norm = LayerNorm(dim, eps=norm_eps)
        act = QuickGELU() if activation == 'quick_gelu' else nn.GELU()
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            act,
            nn.Linear(hidden_dim, dim),
            nn.Dropout(proj_dropout))

    def forward(self, x):
        """
        x:  [B, L, C].

        Returns the pooled representation of shape [B, C].
        """
        b, s, c = x.size()
        n, d = self.num_heads, self.head_dim

        # one shared query per sample, keys/values from the tokens
        query = self.to_q(self.cls_embedding).view(1, 1, n, d)
        q = query.expand(b, -1, -1, -1)
        k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)

        # attend and merge heads
        pooled = flash_attention(q, k, v, version=2).reshape(b, 1, c)

        # output projection and dropout
        pooled = F.dropout(self.proj(pooled), self.proj_dropout, self.training)

        # residual mlp, then drop the length-1 token axis
        pooled = pooled + self.mlp(self.norm(pooled))
        return pooled[:, 0]
207
+
208
+
209
class VisionTransformer(nn.Module):
    """
    ViT image encoder: patch embedding, optional class token, positional
    embedding and a stack of `AttentionBlock`s.

    `forward` returns the token sequence produced by the transformer stack; the
    `post_norm` layer and `head` built here are not applied inside `forward`.
    """

    def __init__(self,
                 image_size=224,
                 patch_size=16,
                 dim=768,
                 mlp_ratio=4,
                 out_dim=512,
                 num_heads=12,
                 num_layers=12,
                 pool_type='token',
                 pre_norm=True,
                 post_norm=False,
                 activation='quick_gelu',
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 embedding_dropout=0.0,
                 norm_eps=1e-5):
        if image_size % patch_size != 0:
            print(
                '[WARNING] image_size is not divisible by patch_size',
                flush=True)
        assert pool_type in ('token', 'token_fc', 'attn_pool')
        out_dim = out_dim or dim
        super().__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size)**2
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.pool_type = pool_type
        # NOTE: this flag is replaced further down by the LayerNorm module
        # stored under the same attribute name.
        self.post_norm = post_norm
        self.norm_eps = norm_eps

        has_cls_token = pool_type in ('token', 'token_fc')
        gain = 1.0 / math.sqrt(dim)

        # embeddings (creation order kept: patch, class token, positions)
        self.patch_embedding = nn.Conv2d(
            3,
            dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=not pre_norm)
        if has_cls_token:
            self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
        num_positions = self.num_patches + (1 if has_cls_token else 0)
        self.pos_embedding = nn.Parameter(
            gain * torch.randn(1, num_positions, dim))
        self.dropout = nn.Dropout(embedding_dropout)

        # transformer
        self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
        blocks = [
            AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False,
                           activation, attn_dropout, proj_dropout, norm_eps)
            for _ in range(num_layers)
        ]
        self.transformer = nn.Sequential(*blocks)
        self.post_norm = LayerNorm(dim, eps=norm_eps)

        # head
        if pool_type == 'token':
            self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
        elif pool_type == 'token_fc':
            self.head = nn.Linear(dim, out_dim)
        elif pool_type == 'attn_pool':
            self.head = AttentionPool(dim, mlp_ratio, num_heads, activation,
                                      proj_dropout, norm_eps)

    def forward(self, x, interpolation=False, use_31_block=False):
        """
        x:             [B, 3, H, W] images.
        interpolation: resize the positional embedding to the token count.
        use_31_block:  skip the last transformer block.
        """
        b = x.size(0)

        # patches -> tokens [B, L, C], optionally prefixed by the class token
        tokens = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
        if self.pool_type in ('token', 'token_fc'):
            cls_token = self.cls_embedding.expand(b, -1, -1)
            tokens = torch.cat([cls_token, tokens], dim=1)

        # positional embedding
        if interpolation:
            e = pos_interpolate(self.pos_embedding, tokens.size(1))
        else:
            e = self.pos_embedding
        tokens = self.dropout(tokens + e)
        if self.pre_norm is not None:
            tokens = self.pre_norm(tokens)

        # transformer, optionally without its final block
        stack = self.transformer[:-1] if use_31_block else self.transformer
        return stack(tokens)
301
+
302
+
303
class XLMRobertaWithHead(XLMRoberta):
    """XLM-RoBERTa encoder followed by masked mean pooling and a two-layer projection head."""

    def __init__(self, **kwargs):
        # `out_dim` belongs to this subclass; the rest goes to the base class
        self.out_dim = kwargs.pop('out_dim')
        super().__init__(**kwargs)

        # head: dim -> (dim + out_dim) // 2 -> out_dim, without biases
        mid_dim = (self.dim + self.out_dim) // 2
        self.head = nn.Sequential(
            nn.Linear(self.dim, mid_dim, bias=False),
            nn.GELU(),
            nn.Linear(mid_dim, self.out_dim, bias=False))

    def forward(self, ids):
        # token features from xlm-roberta
        features = super().forward(ids)

        # mean over the non-padding positions
        keep = ids.ne(self.pad_id).unsqueeze(-1).to(features)
        pooled = (features * keep).sum(dim=1) / keep.sum(dim=1)

        # projection head
        return self.head(pooled)
326
+
327
+
328
class XLMRobertaCLIP(nn.Module):
    """CLIP model pairing a `VisionTransformer` image tower with an `XLMRobertaWithHead` text tower."""

    def __init__(self,
                 embed_dim=1024,
                 image_size=224,
                 patch_size=14,
                 vision_dim=1280,
                 vision_mlp_ratio=4,
                 vision_heads=16,
                 vision_layers=32,
                 vision_pool='token',
                 vision_pre_norm=True,
                 vision_post_norm=False,
                 activation='gelu',
                 vocab_size=250002,
                 max_text_len=514,
                 type_size=1,
                 pad_id=1,
                 text_dim=1024,
                 text_heads=16,
                 text_layers=24,
                 text_post_norm=True,
                 text_dropout=0.1,
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 embedding_dropout=0.0,
                 norm_eps=1e-5):
        super().__init__()

        # shared / vision configuration
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.patch_size = patch_size
        self.vision_dim = vision_dim
        self.vision_mlp_ratio = vision_mlp_ratio
        self.vision_heads = vision_heads
        self.vision_layers = vision_layers
        self.vision_pre_norm = vision_pre_norm
        self.vision_post_norm = vision_post_norm
        self.activation = activation

        # text configuration
        self.vocab_size = vocab_size
        self.max_text_len = max_text_len
        self.type_size = type_size
        self.pad_id = pad_id
        self.text_dim = text_dim
        self.text_heads = text_heads
        self.text_layers = text_layers
        self.text_post_norm = text_post_norm
        self.norm_eps = norm_eps

        # image tower
        vision_cfg = dict(
            image_size=image_size,
            patch_size=patch_size,
            dim=vision_dim,
            mlp_ratio=vision_mlp_ratio,
            out_dim=embed_dim,
            num_heads=vision_heads,
            num_layers=vision_layers,
            pool_type=vision_pool,
            pre_norm=vision_pre_norm,
            post_norm=vision_post_norm,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps)
        self.visual = VisionTransformer(**vision_cfg)

        # text tower
        text_cfg = dict(
            vocab_size=vocab_size,
            max_seq_len=max_text_len,
            type_size=type_size,
            pad_id=pad_id,
            dim=text_dim,
            out_dim=embed_dim,
            num_heads=text_heads,
            num_layers=text_layers,
            post_norm=text_post_norm,
            dropout=text_dropout)
        self.textual = XLMRobertaWithHead(**text_cfg)

        # learnable scalar initialised to log(1 / 0.07)
        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))

    def forward(self, imgs, txt_ids):
        """
        imgs:       [B, 3, H, W] of torch.float32.
        - mean:     [0.48145466, 0.4578275, 0.40821073]
        - std:      [0.26862954, 0.26130258, 0.27577711]
        txt_ids:    [B, L] of torch.long.
                    Encoded by data.CLIPTokenizer.
        """
        image_features = self.visual(imgs)
        text_features = self.textual(txt_ids)
        return image_features, text_features

    def param_groups(self):
        """Split parameters into a no-weight-decay group (norms and biases) and the rest."""
        no_decay, regular = [], []
        for name, param in self.named_parameters():
            if 'norm' in name or name.endswith('bias'):
                no_decay.append(param)
            else:
                regular.append(param)
        return [{
            'params': no_decay,
            'weight_decay': 0.0
        }, {
            'params': regular
        }]
432
+
433
+
434
def _clip(pretrained=False,
          pretrained_name=None,
          model_cls=XLMRobertaCLIP,
          return_transforms=False,
          return_tokenizer=False,
          tokenizer_padding='eos',
          dtype=torch.float32,
          device='cpu',
          **kwargs):
    """
    Build a CLIP model and, optionally, its image preprocessing pipeline.

    pretrained, return_tokenizer, tokenizer_padding: accepted for interface
        compatibility; this function does not use them.
    pretrained_name: only inspected to choose normalisation statistics
        ('siglip' names use mean/std 0.5); may be None.
    model_cls: model class, instantiated with **kwargs.
    return_transforms: also return the torchvision transform pipeline.
    dtype, device: where and in which precision the model is created.

    Returns the model alone, or ``(model, transforms)`` when
    `return_transforms` is True.
    """
    # init a model on device
    with torch.device(device):
        model = model_cls(**kwargs)

    # set device
    model = model.to(dtype=dtype, device=device)
    if not return_transforms:
        return model

    # mean and std; `pretrained_name` defaults to None, which used to raise
    # AttributeError on `.lower()`, so a missing name means the CLIP statistics
    name = (pretrained_name or '').lower()
    if 'siglip' in name:
        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
    else:
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]

    # transforms
    transforms = T.Compose([
        T.Resize((model.image_size, model.image_size),
                 interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std)
    ])
    return model, transforms
469
+
470
+
471
def clip_xlm_roberta_vit_h_14(
        pretrained=False,
        pretrained_name='open-clip-xlm-roberta-large-vit-huge-14',
        **kwargs):
    """
    Build the XLM-RoBERTa-large / ViT-H/14 CLIP configuration through `_clip`.

    Keyword arguments override the defaults below; entries that are not model
    hyper-parameters (e.g. `dtype`, `device`, `return_transforms`) reach
    `_clip` unchanged.
    """
    defaults = {
        'embed_dim': 1024,
        'image_size': 224,
        'patch_size': 14,
        'vision_dim': 1280,
        'vision_mlp_ratio': 4,
        'vision_heads': 16,
        'vision_layers': 32,
        'vision_pool': 'token',
        'activation': 'gelu',
        'vocab_size': 250002,
        'max_text_len': 514,
        'type_size': 1,
        'pad_id': 1,
        'text_dim': 1024,
        'text_heads': 16,
        'text_layers': 24,
        'text_post_norm': True,
        'text_dropout': 0.1,
        'attn_dropout': 0.0,
        'proj_dropout': 0.0,
        'embedding_dropout': 0.0,
    }
    # caller-supplied values win over the defaults
    cfg = {**defaults, **kwargs}
    return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
499
+
500
+
501
class CLIPModel:
    """
    Inference wrapper around the XLM-RoBERTa / ViT-H/14 CLIP model.

    Builds the model in eval mode with gradients disabled, loads its weights
    from `checkpoint_path` and creates the matching tokenizer.
    """

    def __init__(self, dtype, device, checkpoint_path, tokenizer_path):
        """
        dtype:           parameter dtype, also used as the autocast dtype.
        device:          device the model is created on.
        checkpoint_path: state-dict file passed to `torch.load`.
        tokenizer_path:  name/path handed to `HuggingfaceTokenizer`.
        """
        self.dtype = dtype
        self.device = device
        self.checkpoint_path = checkpoint_path
        self.tokenizer_path = tokenizer_path

        # init model
        self.model, self.transforms = clip_xlm_roberta_vit_h_14(
            pretrained=False,
            return_transforms=True,
            return_tokenizer=False,
            dtype=dtype,
            device=device)
        self.model = self.model.eval().requires_grad_(False)
        logging.info(f'loading {checkpoint_path}')
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources (consider `weights_only=True` where supported).
        self.model.load_state_dict(
            torch.load(checkpoint_path, map_location='cpu'))

        # init tokenizer; the sequence length is the model's text length minus 2
        self.tokenizer = HuggingfaceTokenizer(
            name=tokenizer_path,
            seq_len=self.model.max_text_len - 2,
            clean='whitespace')

    def visual(self, videos):
        """
        Encode frames with the vision tower, skipping its last block.

        videos: iterable of tensors; each is transposed on its first two axes
                and resized to the model's image size — presumably [C, T, H, W]
                with values in [-1, 1]; TODO confirm against the caller.
        """
        # preprocess: resize every clip to the square model resolution
        size = (self.model.image_size,) * 2
        videos = torch.cat([
            F.interpolate(
                u.transpose(0, 1),
                size=size,
                mode='bicubic',
                align_corners=False) for u in videos
        ])
        # x * 0.5 + 0.5, then the Normalize step of the transform pipeline
        normalize = self.transforms.transforms[-1]
        videos = normalize(videos.mul_(0.5).add_(0.5))

        # forward; torch.autocast replaces the deprecated torch.cuda.amp.autocast
        with torch.autocast(device_type='cuda', dtype=self.dtype):
            out = self.model.visual(videos, use_31_block=True)
            return out
wan/modules/model.py ADDED
@@ -0,0 +1,631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import math
3
+
4
+ import torch
5
+ import torch.cuda.amp as amp
6
+ import torch.nn as nn
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+
10
+ from .attention import flash_attention
11
+
12
+ __all__ = ['WanModel']
13
+
14
+ T5_CONTEXT_TOKEN_NUMBER = 512
15
+ FIRST_LAST_FRAME_CONTEXT_TOKEN_NUMBER = 257 * 2
16
+
17
+
18
def sinusoidal_embedding_1d(dim, position):
    """Return a float64 sinusoidal embedding of shape [len(position), dim].

    The first ``dim // 2`` columns hold cosines and the remaining columns
    hold sines of ``position * 10000 ** (-k / half)`` for ``k`` in
    ``range(half)``. ``dim`` must be even.
    """
    assert dim % 2 == 0
    half = dim // 2
    pos = position.type(torch.float64)

    # Inverse frequencies live on the same device/dtype as the positions.
    exponents = -torch.arange(half).to(pos).div(half)
    inv_freq = torch.pow(10000, exponents)
    angles = torch.outer(pos, inv_freq)
    return torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
29
+
30
+
31
@amp.autocast(enabled=False)
def rope_params(max_seq_len, dim, theta=10000):
    """Precompute unit-magnitude complex rotary factors.

    Returns a complex tensor of shape [max_seq_len, dim // 2] whose entry
    (p, k) is ``exp(i * p / theta ** (2k / dim))``. ``dim`` must be even.
    Autocast is disabled so the table is built in float64.
    """
    assert dim % 2 == 0
    exponents = torch.arange(0, dim, 2).to(torch.float64).div(dim)
    inv_freq = 1.0 / torch.pow(theta, exponents)
    angles = torch.outer(torch.arange(max_seq_len), inv_freq)
    return torch.polar(torch.ones_like(angles), angles)
40
+
41
+
42
+ @amp.autocast(enabled=False)
43
+ def rope_apply(x, grid_sizes, freqs):
44
+ n, c = x.size(2), x.size(3) // 2
45
+
46
+ # split freqs
47
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
48
+
49
+ # loop over samples
50
+ output = []
51
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
52
+ seq_len = f * h * w
53
+
54
+ # precompute multipliers
55
+ x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
56
+ seq_len, n, -1, 2))
57
+ freqs_i = torch.cat([
58
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
59
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
60
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
61
+ ],
62
+ dim=-1).reshape(seq_len, 1, -1)
63
+
64
+ # apply rotary embedding
65
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
66
+ x_i = torch.cat([x_i, x[i, seq_len:]])
67
+
68
+ # append to collection
69
+ output.append(x_i)
70
+ return torch.stack(output).float()
71
+
72
+
73
class WanRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale."""

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.dim, self.eps = dim, eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Divide ``x`` by the RMS of its last dimension (stabilised by ``eps``)."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
        """
        # Statistics are computed in float32, then cast back before scaling.
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight
90
+
91
+
92
class WanLayerNorm(nn.LayerNorm):
    """LayerNorm that always computes in float32 and returns the input dtype."""

    def __init__(self, dim, eps=1e-6, elementwise_affine=False):
        super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(self, x):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
        """
        # Cast the optional affine parameters as well as the input: passing a
        # float32 input together with half-precision weights only works when
        # an autocast context happens to be active.
        weight = None if self.weight is None else self.weight.float()
        bias = None if self.bias is None else self.bias.float()
        out = nn.functional.layer_norm(
            x.float(), self.normalized_shape, weight, bias, self.eps)
        return out.type_as(x)
103
+
104
+
105
class WanSelfAttention(nn.Module):
    """Multi-head self-attention with rotary embeddings and optional q/k RMS norm."""

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.eps = eps

        # Projections first, then the optional query/key norms.
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        if qk_norm:
            self.norm_q = WanRMSNorm(dim, eps=eps)
            self.norm_k = WanRMSNorm(dim, eps=eps)
        else:
            self.norm_q = nn.Identity()
            self.norm_k = nn.Identity()

    def forward(self, x, seq_lens, grid_sizes, freqs):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
            seq_lens(Tensor): Shape [B]
            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        """
        batch, length = x.shape[:2]
        split = (batch, length, self.num_heads, self.head_dim)

        # Project and split into heads.
        query = self.norm_q(self.q(x)).view(*split)
        key = self.norm_k(self.k(x)).view(*split)
        value = self.v(x).view(*split)

        # Rotary embedding is applied to queries and keys only.
        attended = flash_attention(
            q=rope_apply(query, grid_sizes, freqs),
            k=rope_apply(key, grid_sizes, freqs),
            v=value,
            k_lens=seq_lens,
            window_size=self.window_size)

        # Merge heads and project out.
        return self.o(attended.flatten(2))
160
+
161
+
162
class WanT2VCrossAttention(WanSelfAttention):
    """Cross-attention from visual tokens (queries) to a context sequence."""

    def forward(self, x, context, context_lens):
        r"""
        Args:
            x(Tensor): Shape [B, L1, C]
            context(Tensor): Shape [B, L2, C]
            context_lens(Tensor): Shape [B]
        """
        batch = x.size(0)
        heads, head_dim = self.num_heads, self.head_dim

        # Queries come from x; keys and values come from the context.
        query = self.norm_q(self.q(x)).view(batch, -1, heads, head_dim)
        key = self.norm_k(self.k(context)).view(batch, -1, heads, head_dim)
        value = self.v(context).view(batch, -1, heads, head_dim)

        attended = flash_attention(query, key, value, k_lens=context_lens)

        # Merge heads and project out.
        return self.o(attended.flatten(2))
185
+
186
+
187
class WanI2VCrossAttention(WanSelfAttention):
    """Cross-attention over a context made of image tokens followed by text tokens.

    The last ``T5_CONTEXT_TOKEN_NUMBER`` context tokens are treated as text;
    everything before them is treated as image tokens and gets its own
    key/value projections. The two attention results are summed before the
    shared output projection.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        super().__init__(dim, num_heads, window_size, qk_norm, eps)

        self.k_img = nn.Linear(dim, dim)
        self.v_img = nn.Linear(dim, dim)
        # self.alpha = nn.Parameter(torch.zeros((1, )))
        if qk_norm:
            self.norm_k_img = WanRMSNorm(dim, eps=eps)
        else:
            self.norm_k_img = nn.Identity()

    def forward(self, x, context, context_lens):
        r"""
        Args:
            x(Tensor): Shape [B, L1, C]
            context(Tensor): Shape [B, L2, C]
            context_lens(Tensor): Shape [B]
        """
        # Split the context: leading image tokens, trailing text tokens.
        num_image_tokens = context.shape[1] - T5_CONTEXT_TOKEN_NUMBER
        image_context = context[:, :num_image_tokens]
        text_context = context[:, num_image_tokens:]

        batch = x.size(0)
        heads, head_dim = self.num_heads, self.head_dim

        def split_heads(t):
            return t.view(batch, -1, heads, head_dim)

        query = split_heads(self.norm_q(self.q(x)))
        key = split_heads(self.norm_k(self.k(text_context)))
        value = split_heads(self.v(text_context))
        key_img = split_heads(self.norm_k_img(self.k_img(image_context)))
        value_img = split_heads(self.v_img(image_context))

        # Image branch attends over all image tokens (no length mask).
        image_out = flash_attention(query, key_img, value_img, k_lens=None)
        text_out = flash_attention(query, key, value, k_lens=context_lens)

        # Sum both branches, then apply the shared output projection.
        merged = text_out.flatten(2) + image_out.flatten(2)
        return self.o(merged)
230
+
231
+
232
+ WAN_CROSSATTENTION_CLASSES = {
233
+ 't2v_cross_attn': WanT2VCrossAttention,
234
+ 'i2v_cross_attn': WanI2VCrossAttention,
235
+ }
236
+
237
+
238
class WanAttentionBlock(nn.Module):
    """Transformer block: modulated self-attention, cross-attention, modulated FFN."""

    def __init__(self,
                 cross_attn_type,
                 dim,
                 ffn_dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=False,
                 eps=1e-6):
        super().__init__()
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        # Self-attention branch.
        self.norm1 = WanLayerNorm(dim, eps)
        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
                                          eps)

        # Cross-attention branch; its pre-norm is optional and affine.
        if cross_attn_norm:
            self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True)
        else:
            self.norm3 = nn.Identity()
        cross_attn_cls = WAN_CROSSATTENTION_CLASSES[cross_attn_type]
        self.cross_attn = cross_attn_cls(dim, num_heads, (-1, -1), qk_norm,
                                         eps)

        # Feed-forward branch.
        self.norm2 = WanLayerNorm(dim, eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
            nn.Linear(ffn_dim, dim))

        # Learned offset added to the six modulation vectors.
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)

    def forward(
        self,
        x,
        e,
        seq_lens,
        grid_sizes,
        freqs,
        context,
        context_lens,
    ):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
            e(Tensor): Shape [B, 6, C]
            seq_lens(Tensor): Shape [B], length of each sequence in batch
            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        """
        assert e.dtype == torch.float32
        with amp.autocast(dtype=torch.float32):
            mod = (self.modulation.to(e.device) + e).chunk(6, dim=1)
        assert mod[0].dtype == torch.float32
        # Chunks 0-2 drive self-attention (shift, scale, gate); 3-5 drive the FFN.
        sa_shift, sa_scale, sa_gate, ffn_shift, ffn_scale, ffn_gate = mod

        # Modulated self-attention with a gated residual.
        attn_in = self.norm1(x).float() * (1 + sa_scale) + sa_shift
        attn_out = self.self_attn(attn_in, seq_lens, grid_sizes, freqs)
        with amp.autocast(dtype=torch.float32):
            x = x + attn_out * sa_gate

        # Cross-attention with a plain residual.
        x = x + self.cross_attn(self.norm3(x), context, context_lens)

        # Modulated feed-forward with a gated residual.
        ffn_out = self.ffn(self.norm2(x).float() * (1 + ffn_scale) + ffn_shift)
        with amp.autocast(dtype=torch.float32):
            x = x + ffn_out * ffn_gate
        return x
318
+
319
+
320
class Head(nn.Module):
    """Output head: modulated LayerNorm followed by a linear projection to patches."""

    def __init__(self, dim, out_dim, patch_size, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.out_dim = out_dim
        self.patch_size = patch_size
        self.eps = eps

        # Each token is projected to one full patch of out_dim channels.
        projected_dim = math.prod(patch_size) * out_dim
        self.norm = WanLayerNorm(dim, eps)
        self.head = nn.Linear(dim, projected_dim)

        # Learned offset added to the (shift, scale) modulation pair.
        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)

    def forward(self, x, e):
        r"""
        Args:
            x(Tensor): Shape [B, L1, C]
            e(Tensor): Shape [B, C]
        """
        assert e.dtype == torch.float32
        with amp.autocast(dtype=torch.float32):
            shift, scale = (self.modulation.to(e.device) +
                            e.unsqueeze(1)).chunk(2, dim=1)
            x = self.head(self.norm(x) * (1 + scale) + shift)
        return x
348
+
349
+
350
class MLPProj(torch.nn.Module):
    """Project image embeddings to the model width with a LayerNorm-MLP-LayerNorm stack.

    Args:
        in_dim: Channel width of the incoming image embeddings.
        out_dim: Channel width of the projected tokens.
        flf_pos_emb: When True, a learned positional embedding covering
            ``FIRST_LAST_FRAME_CONTEXT_TOKEN_NUMBER`` tokens is added before
            the projection (used for the `flf2v` variant).
    """

    def __init__(self, in_dim, out_dim, flf_pos_emb=False):
        super().__init__()

        self.proj = torch.nn.Sequential(
            torch.nn.LayerNorm(in_dim), torch.nn.Linear(in_dim, in_dim),
            torch.nn.GELU(), torch.nn.Linear(in_dim, out_dim),
            torch.nn.LayerNorm(out_dim))
        if flf_pos_emb:  # NOTE: we only use this for `flf2v`
            # Sized by in_dim (previously hard-coded to 1280) so the embedding
            # always broadcasts against the input it is added to.
            self.emb_pos = nn.Parameter(
                torch.zeros(1, FIRST_LAST_FRAME_CONTEXT_TOKEN_NUMBER, in_dim))

    def forward(self, image_embeds):
        """Return the projected tokens for ``image_embeds`` of shape [B, N, in_dim].

        With the positional embedding enabled, consecutive pairs of batch
        entries are first merged into one sequence of ``2 * N`` tokens.
        """
        if hasattr(self, 'emb_pos'):
            _, n, d = image_embeds.shape
            image_embeds = image_embeds.view(-1, 2 * n, d)
            image_embeds = image_embeds + self.emb_pos
        clip_extra_context_tokens = self.proj(image_embeds)
        return clip_extra_context_tokens
370
+
371
+
372
class WanModel(ModelMixin, ConfigMixin):
    r"""
    Wan diffusion backbone supporting both text-to-video and image-to-video.
    """

    ignore_for_config = [
        'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
    ]
    _no_split_modules = ['WanAttentionBlock']

    @register_to_config
    def __init__(self,
                 model_type='t2v',
                 patch_size=(1, 2, 2),
                 text_len=512,
                 in_dim=16,
                 dim=2048,
                 ffn_dim=8192,
                 freq_dim=256,
                 text_dim=4096,
                 out_dim=16,
                 num_heads=16,
                 num_layers=32,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=True,
                 eps=1e-6):
        r"""
        Initialize the diffusion model backbone.

        Args:
            model_type (`str`, *optional*, defaults to 't2v'):
                Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video) or 'flf2v' (first-last-frame-to-video) or 'vace'
            patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
                3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
            text_len (`int`, *optional*, defaults to 512):
                Fixed length for text embeddings
            in_dim (`int`, *optional*, defaults to 16):
                Input video channels (C_in)
            dim (`int`, *optional*, defaults to 2048):
                Hidden dimension of the transformer
            ffn_dim (`int`, *optional*, defaults to 8192):
                Intermediate dimension in feed-forward network
            freq_dim (`int`, *optional*, defaults to 256):
                Dimension for sinusoidal time embeddings
            text_dim (`int`, *optional*, defaults to 4096):
                Input dimension for text embeddings
            out_dim (`int`, *optional*, defaults to 16):
                Output video channels (C_out)
            num_heads (`int`, *optional*, defaults to 16):
                Number of attention heads
            num_layers (`int`, *optional*, defaults to 32):
                Number of transformer blocks
            window_size (`tuple`, *optional*, defaults to (-1, -1)):
                Window size for local attention (-1 indicates global attention)
            qk_norm (`bool`, *optional*, defaults to True):
                Enable query/key normalization
            cross_attn_norm (`bool`, *optional*, defaults to True):
                Enable cross-attention normalization
            eps (`float`, *optional*, defaults to 1e-6):
                Epsilon value for normalization layers
        """

        super().__init__()

        assert model_type in ['t2v', 'i2v', 'flf2v', 'vace']
        self.model_type = model_type

        self.patch_size = patch_size
        self.text_len = text_len
        self.in_dim = in_dim
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.freq_dim = freq_dim
        self.text_dim = text_dim
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        # embeddings
        self.patch_embedding = nn.Conv3d(
            in_dim, dim, kernel_size=patch_size, stride=patch_size)
        self.text_embedding = nn.Sequential(
            nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
            nn.Linear(dim, dim))

        self.time_embedding = nn.Sequential(
            nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))

        # blocks: only the pure text-to-video variant uses text-only cross-attention
        if model_type == 't2v':
            cross_attn_type = 't2v_cross_attn'
        else:
            cross_attn_type = 'i2v_cross_attn'
        layers = []
        for _ in range(num_layers):
            layers.append(
                WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
                                  window_size, qk_norm, cross_attn_norm, eps))
        self.blocks = nn.ModuleList(layers)

        # head
        self.head = Head(dim, out_dim, patch_size, eps)

        # buffers (don't use register_buffer otherwise dtype will be changed in to())
        assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
        head_dim = dim // num_heads
        axis_dim = 2 * (head_dim // 6)
        self.freqs = torch.cat(
            [
                rope_params(1024, head_dim - 2 * axis_dim),
                rope_params(1024, axis_dim),
                rope_params(1024, axis_dim),
            ],
            dim=1)

        if model_type in ('i2v', 'flf2v'):
            self.img_emb = MLPProj(1280, dim, flf_pos_emb=model_type == 'flf2v')

        # initialize weights
        self.init_weights()

    def forward(
        self,
        x,
        t,
        context,
        seq_len,
        clip_fea=None,
        y=None,
    ):
        r"""
        Forward pass through the diffusion model

        Args:
            x (List[Tensor]):
                List of input video tensors, each with shape [C_in, F, H, W]
            t (Tensor):
                Diffusion timesteps tensor of shape [B]
            context (List[Tensor]):
                List of text embeddings each with shape [L, C]
            seq_len (`int`):
                Maximum sequence length for positional encoding
            clip_fea (Tensor, *optional*):
                CLIP image features for image-to-video mode or first-last-frame-to-video mode
            y (List[Tensor], *optional*):
                Conditional video inputs for image-to-video mode, same shape as x

        Returns:
            List[Tensor]:
                List of denoised float32 video tensors, one per input sample
        """
        if self.model_type in ('i2v', 'flf2v'):
            assert clip_fea is not None and y is not None

        # Keep the rope table on the same device as the weights.
        device = self.patch_embedding.weight.device
        if self.freqs.device != device:
            self.freqs = self.freqs.to(device)

        # Conditioning frames are concatenated along the channel axis.
        if y is not None:
            x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]

        # Patchify every sample, remember its grid, and flatten to tokens.
        patches = [self.patch_embedding(u.unsqueeze(0)) for u in x]
        grid_sizes = torch.stack(
            [torch.tensor(p.shape[2:], dtype=torch.long) for p in patches])
        tokens = [p.flatten(2).transpose(1, 2) for p in patches]
        seq_lens = torch.tensor([tok.size(1) for tok in tokens],
                                dtype=torch.long)
        assert seq_lens.max() <= seq_len

        # Zero-pad every sample to seq_len and batch them.
        x = torch.cat([
            torch.cat(
                [tok, tok.new_zeros(1, seq_len - tok.size(1), tok.size(2))],
                dim=1) for tok in tokens
        ])

        # time embeddings
        with amp.autocast(dtype=torch.float32):
            e = self.time_embedding(
                sinusoidal_embedding_1d(self.freq_dim, t).float())
            e0 = self.time_projection(e).unflatten(1, (6, self.dim))
            assert e.dtype == torch.float32 and e0.dtype == torch.float32

        # Text context: zero-pad each prompt to text_len, then embed.
        context_lens = None
        padded_text = [
            torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
            for u in context
        ]
        context = self.text_embedding(torch.stack(padded_text))

        # Image tokens, when present, are prepended to the text tokens.
        if clip_fea is not None:
            context_clip = self.img_emb(clip_fea)  # bs x 257 (x2) x dim
            context = torch.concat([context_clip, context], dim=1)

        for block in self.blocks:
            x = block(
                x,
                e=e0,
                seq_lens=seq_lens,
                grid_sizes=grid_sizes,
                freqs=self.freqs,
                context=context,
                context_lens=context_lens)

        # head
        x = self.head(x, e)

        # unpatchify
        videos = self.unpatchify(x, grid_sizes)
        return [video.float() for video in videos]

    def unpatchify(self, x, grid_sizes):
        r"""
        Reconstruct video tensors from patch embeddings.

        Args:
            x (Tensor or List[Tensor]):
                Patchified features iterated per sample, each with shape
                [L, C_out * prod(patch_size)]
            grid_sizes (Tensor):
                Original spatial-temporal grid dimensions before patching,
                shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)

        Returns:
            List[Tensor]:
                Reconstructed video tensors with shape
                [C_out, F_patches * p_t, H_patches * p_h, W_patches * p_w]
        """

        channels = self.out_dim
        videos = []
        for sample, grid in zip(x, grid_sizes.tolist()):
            # Drop padding tokens, then expose grid and patch axes.
            valid = sample[:math.prod(grid)]
            valid = valid.view(*grid, *self.patch_size, channels)
            # Interleave every grid axis with its patch axis.
            valid = torch.einsum('fhwpqrc->cfphqwr', valid)
            full_size = [g * p for g, p in zip(grid, self.patch_size)]
            videos.append(valid.reshape(channels, *full_size))
        return videos

    def init_weights(self):
        r"""
        Initialize model parameters using Xavier initialization.
        """

        # basic init
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        # init embeddings (flatten(1) is a view, so the conv weight is updated in place)
        nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
        for embedding in (self.text_embedding, self.time_embedding):
            for module in embedding.modules():
                if isinstance(module, nn.Linear):
                    nn.init.normal_(module.weight, std=.02)

        # init output layer
        nn.init.zeros_(self.head.head.weight)
wan/modules/multitalk_model.py ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import math
3
+ import numpy as np
4
+ import os
5
+ import torch
6
+ import torch.cuda.amp as amp
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from einops import rearrange
11
+ from diffusers import ModelMixin
12
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
13
+
14
+ from .attention import flash_attention, SingleStreamMutiAttention
15
+ from ..utils.multitalk_utils import get_attn_map_with_target
16
import logging

# sageattention is an optional accelerator. When it cannot be imported the
# attention layers in this module use flash_attention instead.
try:
    from sageattention import sageattn
    USE_SAGEATTN = True
    logging.info("Using sageattn")
except Exception:
    # Deliberately broad: a broken install may fail with errors other than
    # ImportError. Unlike a bare `except:`, this still lets KeyboardInterrupt
    # and SystemExit propagate.
    USE_SAGEATTN = False
23
+
24
+ __all__ = ['WanModel']
25
+
26
+
27
+
28
def sinusoidal_embedding_1d(dim, position):
    """Embed 1-D positions as [cos | sin] features in float64.

    Output shape is [len(position), dim]; column ``k`` (and ``k + dim // 2``)
    uses the frequency ``10000 ** (-k / (dim // 2))``. ``dim`` must be even.
    """
    assert dim % 2 == 0
    half = dim // 2
    position = position.type(torch.float64)

    # Frequencies are created on the positions' device and dtype.
    frequencies = torch.pow(10000, -torch.arange(half).to(position).div(half))
    phase = torch.outer(position, frequencies)
    cos_part = torch.cos(phase)
    sin_part = torch.sin(phase)
    return torch.cat([cos_part, sin_part], dim=1)
39
+
40
+
41
@amp.autocast(enabled=False)
def rope_params(max_seq_len, dim, theta=10000):
    """Build the complex rotary table of shape [max_seq_len, dim // 2].

    Every entry has magnitude one and phase ``p / theta ** (2k / dim)`` for
    position ``p`` and frequency index ``k``. ``dim`` must be even.
    """
    assert dim % 2 == 0
    positions = torch.arange(max_seq_len)
    scaled_index = torch.arange(0, dim, 2).to(torch.float64).div(dim)
    phase = torch.outer(positions, 1.0 / torch.pow(theta, scaled_index))
    magnitude = torch.ones_like(phase)
    return torch.polar(magnitude, phase)
51
+
52
+
53
@amp.autocast(enabled=False)
def rope_apply(x, grid_sizes, freqs):
    """Apply 3-axis rotary position embedding to the leading tokens of each sample.

    Args:
        x: Tensor indexed as [B, L, num_heads, head_dim].
        grid_sizes: Tensor of shape [B, 3] holding (F, H, W) per sample.
        freqs: Complex table whose columns are split into frame / height /
            width groups of sizes ``c - 2 * (c // 3)``, ``c // 3``, ``c // 3``
            with ``c = head_dim // 2``.

    Returns:
        float32 tensor shaped like ``x``. Only the first ``F * H * W`` tokens
        of each sample are rotated; any padding tokens after them are passed
        through unchanged.
    """
    n, c = x.size(2), x.size(3) // 2

    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

    output = []
    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
        seq_len = f * h * w

        # Rotate only the tokens covered by the grid. Slicing by the padded
        # length instead would mismatch the seq_len-sized frequency table.
        x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
            seq_len, n, -1, 2))
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ],
                            dim=-1).reshape(seq_len, 1, -1)
        # The table may live on a different device than the activations.
        freqs_i = freqs_i.to(device=x_i.device)
        x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
        # Re-attach the untouched padding tail (empty when L == seq_len).
        x_i = torch.cat([x_i, x[i, seq_len:]])

        output.append(x_i)
    return torch.stack(output).float()
77
+
78
+
79
class WanRMSNorm(nn.Module):
    """RMS normalisation along the last axis, followed by a learned per-channel scale."""

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Scale ``x`` by the reciprocal RMS of its last dimension."""
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * inv_rms

    def forward(self, x):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
        """
        # Normalise in float32, cast back to the input dtype, then scale.
        unit = self._norm(x.float()).type_as(x)
        return unit * self.weight
96
+
97
+
98
class WanLayerNorm(nn.LayerNorm):
    """LayerNorm evaluated in float32 and cast back to the input dtype."""

    def __init__(self, dim, eps=1e-6, elementwise_affine=False):
        super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Normalise ``inputs`` over the trailing ``normalized_shape`` dims."""
        # Affine parameters are absent when elementwise_affine is False.
        weight = self.weight.float() if self.weight is not None else None
        bias = self.bias.float() if self.bias is not None else None
        normed = F.layer_norm(
            inputs.float(),
            self.normalized_shape,
            weight,
            bias,
            self.eps,
        )
        return normed.to(inputs.dtype)
113
+
114
+
115
class WanSelfAttention(nn.Module):
    """Rotary self-attention that also returns a reference attention map."""

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.eps = eps

        # Projections first, then the optional query/key norms.
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        if qk_norm:
            self.norm_q = WanRMSNorm(dim, eps=eps)
            self.norm_k = WanRMSNorm(dim, eps=eps)
        else:
            self.norm_q = nn.Identity()
            self.norm_k = nn.Identity()

    def forward(self, x, seq_lens, grid_sizes, freqs, ref_target_masks=None):
        """Run self-attention on ``x`` (indexed as [B, L, C]).

        Returns:
            A tuple ``(out, x_ref_attn_map)``: ``out`` is the attention output
            after the output projection, and ``x_ref_attn_map`` is the result
            of ``get_attn_map_with_target`` on the rotated queries/keys using
            the first sample's grid size.
        """
        batch, length = x.shape[:2]
        split = (batch, length, self.num_heads, self.head_dim)

        # Project, split into heads, and rotate queries/keys.
        query = self.norm_q(self.q(x)).view(*split)
        key = self.norm_k(self.k(x)).view(*split)
        value = self.v(x).view(*split)
        query = rope_apply(query, grid_sizes, freqs)
        key = rope_apply(key, grid_sizes, freqs)

        if not USE_SAGEATTN:
            attended = flash_attention(
                q=query,
                k=key,
                v=value,
                k_lens=seq_lens,
                window_size=self.window_size,
            ).type_as(x)
        else:
            # NOTE(review): this path ignores seq_lens and window_size.
            attended = sageattn(
                query.to(torch.bfloat16),
                key.to(torch.bfloat16),
                value,
                tensor_layout='NHD')

        # Merge heads and project out.
        x = self.o(attended.flatten(2))

        # The attention map is auxiliary and must not carry gradients.
        with torch.no_grad():
            x_ref_attn_map = get_attn_map_with_target(
                query.type_as(x),
                key.type_as(x),
                grid_sizes[0],
                ref_target_masks=ref_target_masks)

        return x, x_ref_attn_map
173
+
174
+
175
class WanI2VCrossAttention(WanSelfAttention):
    """Cross-attention over a context whose first 257 tokens are image tokens.

    The image tokens get dedicated key/value projections; the remaining
    context tokens use the inherited ones. Both attention results are summed
    before the shared output projection.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 eps=1e-6):
        super().__init__(dim, num_heads, window_size, qk_norm, eps)

        self.k_img = nn.Linear(dim, dim)
        self.v_img = nn.Linear(dim, dim)
        if qk_norm:
            self.norm_k_img = WanRMSNorm(dim, eps=eps)
        else:
            self.norm_k_img = nn.Identity()

    def forward(self, x, context, context_lens):
        """Attend from ``x`` to the image and text parts of ``context``."""
        image_context = context[:, :257]
        text_context = context[:, 257:]

        batch = x.size(0)
        heads, head_dim = self.num_heads, self.head_dim

        def split_heads(t):
            return t.view(batch, -1, heads, head_dim)

        query = split_heads(self.norm_q(self.q(x)))
        key = split_heads(self.norm_k(self.k(text_context)))
        value = split_heads(self.v(text_context))
        key_img = split_heads(self.norm_k_img(self.k_img(image_context)))
        value_img = split_heads(self.v_img(image_context))

        if not USE_SAGEATTN:
            image_out = flash_attention(query, key_img, value_img, k_lens=None)
            text_out = flash_attention(query, key, value, k_lens=context_lens)
        else:
            # NOTE(review): context_lens is not used on this path.
            image_out = sageattn(query, key_img, value_img, tensor_layout='NHD')
            text_out = sageattn(query, key, value, tensor_layout='NHD')

        # Sum both branches, then apply the shared output projection.
        merged = text_out.flatten(2) + image_out.flatten(2)
        return self.o(merged)
214
+
215
+
216
class WanAttentionBlock(nn.Module):
    """Transformer block with self-, text/image cross-, and audio cross-attention plus FFN."""

    def __init__(self,
                 cross_attn_type,
                 dim,
                 ffn_dim,
                 num_heads,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=False,
                 eps=1e-6,
                 output_dim=768,
                 norm_input_visual=True,
                 class_range=24,
                 class_interval=4):
        super().__init__()
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        # Self-attention branch.
        self.norm1 = WanLayerNorm(dim, eps)
        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)

        # Cross-attention branch; `cross_attn_type` is accepted but this block
        # always builds the image+text variant.
        if cross_attn_norm:
            self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True)
        else:
            self.norm3 = nn.Identity()
        self.cross_attn = WanI2VCrossAttention(dim, num_heads, (-1, -1),
                                               qk_norm, eps)

        # Feed-forward branch.
        self.norm2 = WanLayerNorm(dim, eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
            nn.Linear(ffn_dim, dim))

        # Learned offset added to the six modulation vectors.
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)

        # Audio cross-attention and the optional norm applied to its visual input.
        self.audio_cross_attn = SingleStreamMutiAttention(
            dim=dim,
            encoder_hidden_states_dim=output_dim,
            num_heads=num_heads,
            qk_norm=False,
            qkv_bias=True,
            eps=eps,
            norm_layer=WanRMSNorm,
            class_range=class_range,
            class_interval=class_interval,
        )
        if norm_input_visual:
            self.norm_x = WanLayerNorm(dim, eps, elementwise_affine=True)
        else:
            self.norm_x = nn.Identity()

    def forward(
        self,
        x,
        e,
        seq_lens,
        grid_sizes,
        freqs,
        context,
        context_lens,
        audio_embedding=None,
        ref_target_masks=None,
        human_num=None,
    ):
        """Apply the block to ``x`` and return a tensor in ``x``'s original dtype.

        ``e`` must be float32 and is split into six modulation chunks along
        dim 1 (assumes shape [B, 6, C] -- confirm against the caller).
        """
        in_dtype = x.dtype
        assert e.dtype == torch.float32
        with amp.autocast(dtype=torch.float32):
            mod = (self.modulation.to(e.device) + e).chunk(6, dim=1)
        assert mod[0].dtype == torch.float32
        # Chunks 0-2 drive self-attention (shift, scale, gate); 3-5 drive the FFN.
        sa_shift, sa_scale, sa_gate, ffn_shift, ffn_scale, ffn_gate = mod

        # Modulated self-attention; it also yields the reference attention map.
        attn_in = (self.norm1(x).float() * (1 + sa_scale) + sa_shift).type_as(x)
        attn_out, x_ref_attn_map = self.self_attn(
            attn_in, seq_lens, grid_sizes, freqs,
            ref_target_masks=ref_target_masks)
        with amp.autocast(dtype=torch.float32):
            x = x + attn_out * sa_gate

        # The float32 gate promotes x; restore the working dtype.
        x = x.to(in_dtype)

        # Text/image cross-attention.
        x = x + self.cross_attn(self.norm3(x), context, context_lens)

        # Audio cross-attention, guided by the reference attention map.
        audio_out = self.audio_cross_attn(
            self.norm_x(x),
            encoder_hidden_states=audio_embedding,
            shape=grid_sizes[0],
            x_ref_attn_map=x_ref_attn_map,
            human_num=human_num)
        x = x + audio_out

        # Modulated feed-forward with a gated residual.
        ffn_in = (self.norm2(x).float() * (1 + ffn_scale) + ffn_shift).to(in_dtype)
        ffn_out = self.ffn(ffn_in)
        with amp.autocast(dtype=torch.float32):
            x = x + ffn_out * ffn_gate

        return x.to(in_dtype)
319
+
320
+
321
class Head(nn.Module):
    """Output head: modulated layer norm followed by a linear projection."""

    def __init__(self, dim, out_dim, patch_size, eps=1e-6):
        """Build the norm, the projection and the learned modulation.

        Args:
            dim: Input feature width.
            out_dim: Output channels per patch element.
            patch_size: Patch extents; the linear layer emits
                ``prod(patch_size) * out_dim`` values per token.
            eps: Epsilon passed to ``WanLayerNorm``.
        """
        super().__init__()
        self.dim = dim
        self.out_dim = out_dim
        self.patch_size = patch_size
        self.eps = eps

        # layers
        flat_out_dim = math.prod(patch_size) * out_dim
        self.norm = WanLayerNorm(dim, eps)
        self.head = nn.Linear(dim, flat_out_dim)

        # modulation
        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)

    def forward(self, x, e):
        r"""
        Args:
            x(Tensor): Shape [B, L1, C]
            e(Tensor): Shape [B, C], must be float32
        """
        assert e.dtype == torch.float32
        with amp.autocast(dtype=torch.float32):
            shift_scale = (self.modulation.to(e.device) + e.unsqueeze(1)).chunk(2, dim=1)
            modulated = self.norm(x) * (1 + shift_scale[1]) + shift_scale[0]
            x = self.head(modulated)
        return x
349
+
350
+
351
class MLPProj(torch.nn.Module):
    """Two-layer MLP with layer norm on both the input and the output."""

    def __init__(self, in_dim, out_dim):
        """Create ``LayerNorm -> Linear -> GELU -> Linear -> LayerNorm``.

        Args:
            in_dim: Width of the incoming embeddings (also the hidden width).
            out_dim: Width of the projected embeddings.
        """
        super().__init__()

        layers = [
            torch.nn.LayerNorm(in_dim),
            torch.nn.Linear(in_dim, in_dim),
            torch.nn.GELU(),
            torch.nn.Linear(in_dim, out_dim),
            torch.nn.LayerNorm(out_dim),
        ]
        self.proj = torch.nn.Sequential(*layers)

    def forward(self, image_embeds):
        """Project ``image_embeds`` and return the resulting context tokens."""
        return self.proj(image_embeds)
364
+
365
+
366
class AudioProjModel(ModelMixin, ConfigMixin):
    """Project windowed audio features into per-frame context tokens.

    Two inputs are handled with separate first-layer projections (``proj1``
    for ``audio_embeds`` and ``proj1_vf`` for ``audio_embeds_vf``) because
    their window lengths differ (``seq_len`` vs ``seq_len_vf``); the remaining
    layers are shared.
    """

    def __init__(
        self,
        seq_len=5,
        seq_len_vf=12,
        blocks=12,
        channels=768,
        intermediate_dim=512,
        output_dim=768,
        context_tokens=32,
        norm_output_audio=False,
    ):
        """
        Args:
            seq_len: Window length of ``audio_embeds``.
            seq_len_vf: Window length of ``audio_embeds_vf``.
            blocks: Size of the ``b`` axis of the inputs.
            channels: Size of the ``c`` axis of the inputs.
            intermediate_dim: Hidden width of the projection MLP.
            output_dim: Width of each emitted context token.
            context_tokens: Number of context tokens emitted per frame.
            norm_output_audio: Apply ``nn.LayerNorm`` to the tokens when true,
                otherwise ``nn.Identity``.
        """
        super().__init__()

        self.seq_len = seq_len
        self.blocks = blocks
        self.channels = channels
        self.input_dim = seq_len * blocks * channels
        self.input_dim_vf = seq_len_vf * blocks * channels
        self.intermediate_dim = intermediate_dim
        self.context_tokens = context_tokens
        self.output_dim = output_dim

        # define multiple linear layers
        self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
        self.proj1_vf = nn.Linear(self.input_dim_vf, intermediate_dim)
        self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
        self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
        if norm_output_audio:
            self.norm = nn.LayerNorm(output_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, audio_embeds, audio_embeds_vf):
        """Return context tokens of shape ``[bz, f, context_tokens, output_dim]``.

        Args:
            audio_embeds: 5-D tensor laid out as ``bz f w b c``.
            audio_embeds_vf: 5-D tensor laid out as ``bz f w b c``; its ``f``
                frames are appended after those of ``audio_embeds``.
        """
        total_frames = audio_embeds.shape[1] + audio_embeds_vf.shape[1]
        bz, _, _, _, _ = audio_embeds.shape

        # flatten every frame's (window, blocks, channels) into one vector
        first = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
        n_first, w_first, b_first, c_first = first.shape
        first = first.view(n_first, w_first * b_first * c_first)

        later = rearrange(audio_embeds_vf, "bz f w b c -> (bz f) w b c")
        n_later, w_later, b_later, c_later = later.shape
        later = later.view(n_later, w_later * b_later * c_later)

        # first projection (separate weights per input), then merge along frames
        first = torch.relu(self.proj1(first))
        later = torch.relu(self.proj1_vf(later))
        first = rearrange(first, "(bz f) c -> bz f c", bz=bz)
        later = rearrange(later, "(bz f) c -> bz f c", bz=bz)
        merged = torch.concat([first, later], dim=1)
        n_batch, n_frames, n_feat = merged.shape
        merged = merged.view(n_batch * n_frames, n_feat)

        # second projection
        merged = torch.relu(self.proj2(merged))

        tokens = self.proj3(merged).reshape(
            n_batch * n_frames, self.context_tokens, self.output_dim)

        # normalization and reshape
        with amp.autocast(dtype=torch.float32):
            tokens = self.norm(tokens)
        return rearrange(tokens, "(bz f) m c -> bz f m c", f=total_frames)
430
+
431
+
432
class WanModel(ModelMixin, ConfigMixin):
    r"""
    Wan diffusion backbone for audio-conditioned image-to-video generation.

    Only ``model_type='i2v'`` is accepted. On top of the text/CLIP context the
    model projects audio features with ``AudioProjModel`` and feeds them to
    every ``WanAttentionBlock``. An optional TeaCache mode (see
    ``teacache_init``) skips the transformer blocks on steps whose time
    embedding changed little and re-applies the last computed residual.
    """

    ignore_for_config = [
        'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
    ]
    _no_split_modules = ['WanAttentionBlock']

    # With TeaCache enabled, consecutive forward() calls are attributed to
    # these passes in round-robin order (cnt % 3); each keeps its own cache.
    _TEACACHE_BRANCHES = ('cond', 'drop_text', 'uncond')

    # Rescaling polynomial coefficients keyed by (use_ret_steps, model_scale).
    _TEACACHE_COEFFICIENTS = {
        (True, 'infinitetalk-480'): [2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01],
        (True, 'infinitetalk-720'): [8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02],
        (False, 'infinitetalk-480'): [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01],
        (False, 'infinitetalk-720'): [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683],
    }

    @register_to_config
    def __init__(self,
                 model_type='i2v',
                 patch_size=(1, 2, 2),
                 text_len=512,
                 in_dim=16,
                 dim=2048,
                 ffn_dim=8192,
                 freq_dim=256,
                 text_dim=4096,
                 out_dim=16,
                 num_heads=16,
                 num_layers=32,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=True,
                 eps=1e-6,
                 # audio params
                 audio_window=5,
                 intermediate_dim=512,
                 output_dim=768,
                 context_tokens=32,
                 vae_scale=4,  # vae timedownsample scale
                 norm_input_visual=True,
                 norm_output_audio=True,
                 weight_init=True):
        r"""
        Build the embeddings, transformer blocks, head and audio adapter.

        Args:
            model_type: Must be ``'i2v'``.
            patch_size: Kernel and stride of the 3-D patch embedding.
            text_len: Length every text context is zero-padded to.
            in_dim / out_dim: Input / output channel counts.
            dim / ffn_dim / num_heads / num_layers: Transformer sizes.
            freq_dim: Width of the sinusoidal time embedding.
            text_dim: Width of the incoming text features.
            window_size, qk_norm, cross_attn_norm, eps: Forwarded to every
                ``WanAttentionBlock``.
            audio_window: Audio window length per frame.
            intermediate_dim, output_dim, context_tokens, norm_output_audio:
                Forwarded to ``AudioProjModel``.
            vae_scale: Number of audio frames grouped per latent frame.
            norm_input_visual: Forwarded to every ``WanAttentionBlock``.
            weight_init: Run ``init_weights`` at the end of construction.
        """
        super().__init__()

        assert model_type == 'i2v', 'MultiTalk model requires your model_type is i2v.'
        self.model_type = model_type

        self.patch_size = patch_size
        self.text_len = text_len
        self.in_dim = in_dim
        self.dim = dim
        self.ffn_dim = ffn_dim
        self.freq_dim = freq_dim
        self.text_dim = text_dim
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.window_size = window_size
        self.qk_norm = qk_norm
        self.cross_attn_norm = cross_attn_norm
        self.eps = eps

        self.norm_output_audio = norm_output_audio
        self.audio_window = audio_window
        self.intermediate_dim = intermediate_dim
        self.vae_scale = vae_scale

        # TeaCache is off until teacache_init() is called; without this
        # default forward() failed with AttributeError on a fresh model.
        self.enable_teacache = False

        # embeddings
        self.patch_embedding = nn.Conv3d(
            in_dim, dim, kernel_size=patch_size, stride=patch_size)
        self.text_embedding = nn.Sequential(
            nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
            nn.Linear(dim, dim))

        self.time_embedding = nn.Sequential(
            nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))

        # blocks
        cross_attn_type = 'i2v_cross_attn'
        self.blocks = nn.ModuleList([
            WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
                              window_size, qk_norm, cross_attn_norm, eps,
                              output_dim=output_dim, norm_input_visual=norm_input_visual)
            for _ in range(num_layers)
        ])

        # head
        self.head = Head(dim, out_dim, patch_size, eps)

        # rotary position table (plain tensor attribute, not a buffer)
        assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
        self.init_freqs()

        if model_type == 'i2v':
            self.img_emb = MLPProj(1280, dim)
        else:
            raise NotImplementedError('Not supported model type.')

        # init audio adapter
        self.audio_proj = AudioProjModel(
            seq_len=audio_window,
            seq_len_vf=audio_window + vae_scale - 1,
            intermediate_dim=intermediate_dim,
            output_dim=output_dim,
            context_tokens=context_tokens,
            norm_output_audio=norm_output_audio,
        )

        # initialize weights
        if weight_init:
            self.init_weights()

    def init_freqs(self):
        """(Re)build ``self.freqs`` from three ``rope_params`` tables joined on dim 1."""
        d = self.dim // self.num_heads
        self.freqs = torch.cat([
            rope_params(1024, d - 4 * (d // 6)),
            rope_params(1024, 2 * (d // 6)),
            rope_params(1024, 2 * (d // 6))
        ],
                               dim=1)

    def teacache_init(
        self,
        use_ret_steps=True,
        teacache_thresh=0.2,
        sample_steps=40,
        model_scale='infinitetalk-480',
    ):
        """Enable TeaCache and reset all of its state.

        Args:
            use_ret_steps: Selects the coefficient set and the warm-up/cut-off
                window; also decides whether ``forward`` compares the
                projected (``e0``) or the raw (``e``) time embedding.
            teacache_thresh: Accumulated rescaled distance below which the
                transformer blocks are skipped.
            sample_steps: Number of sampling steps; ``forward`` is counted
                three times per step.
            model_scale: ``'infinitetalk-480'`` or ``'infinitetalk-720'``.

        Raises:
            ValueError: If ``model_scale`` has no coefficient set. Previously
                this silently left ``coefficients`` unset or stale.
        """
        print("teacache_init")
        key = (bool(use_ret_steps), model_scale)
        if key not in self._TEACACHE_COEFFICIENTS:
            known = sorted({scale for _, scale in self._TEACACHE_COEFFICIENTS})
            raise ValueError(
                f'Unsupported model_scale {model_scale!r} for TeaCache; expected one of {known}')

        self.enable_teacache = True
        passes = len(self._TEACACHE_BRANCHES)

        # Configuration stays on the class, as before, so existing readers of
        # these class attributes keep working.
        cls = self.__class__
        cls.cnt = 0
        cls.num_steps = sample_steps * passes
        cls.teacache_thresh = teacache_thresh
        # Legacy attributes: kept for compatibility, not read by forward().
        cls.accumulated_rel_l1_distance_even = 0
        cls.accumulated_rel_l1_distance_odd = 0
        cls.previous_e0_even = None
        cls.previous_e0_odd = None
        cls.previous_residual_even = None
        cls.previous_residual_odd = None
        cls.use_ret_steps = use_ret_steps
        cls.coefficients = list(self._TEACACHE_COEFFICIENTS[key])
        if use_ret_steps:
            cls.ret_steps = 5 * passes
            cls.cutoff_steps = sample_steps * passes
        else:
            cls.ret_steps = 1 * passes
            cls.cutoff_steps = sample_steps * passes - passes

        # forward() writes its running state on the instance, which shadows
        # the class attributes; reset it here so re-initialising really
        # restarts the schedule and drops caches from a previous run.
        self.cnt = 0
        for branch in self._TEACACHE_BRANCHES:
            setattr(self, f'accumulated_rel_l1_distance_{branch}', 0)
            setattr(self, f'previous_e0_{branch}', None)
            setattr(self, f'previous_residual_{branch}', None)
        print("teacache_init done")

    def disable_teacache(self):
        """Turn TeaCache off; every forward pass then runs all blocks."""
        self.enable_teacache = False

    def _teacache_should_compute(self, branch, modulated_inp):
        """Update ``branch``'s TeaCache accumulator; return True if blocks must run."""
        acc_name = f'accumulated_rel_l1_distance_{branch}'
        prev_name = f'previous_e0_{branch}'
        if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
            # warm-up and final steps are always computed
            should_calc = True
            accumulated = 0
        else:
            rescale_func = np.poly1d(self.coefficients)
            previous = getattr(self, prev_name)
            rel_l1 = ((modulated_inp - previous).abs().mean() / previous.abs().mean()).cpu().item()
            accumulated = getattr(self, acc_name) + rescale_func(rel_l1)
            should_calc = not (accumulated < self.teacache_thresh)
            if should_calc:
                accumulated = 0
        setattr(self, acc_name, accumulated)
        setattr(self, prev_name, modulated_inp.clone())
        return should_calc

    def _run_blocks(self, x, kwargs):
        """Run every transformer block in order and return the result."""
        for block in self.blocks:
            x = block(x, **kwargs)
        return x

    def forward(
        self,
        x,
        t,
        context,
        seq_len,
        clip_fea=None,
        y=None,
        audio=None,
        ref_target_masks=None,
    ):
        r"""
        Run one denoising forward pass.

        Args:
            x (List[Tensor]): Latents, each 4-D ``[C, T, H, W]``.
            t (Tensor): Timesteps fed to ``sinusoidal_embedding_1d``.
            context (List[Tensor]): Text features, each ``[L, C]``; zero-padded
                to ``text_len`` rows.
            seq_len (int): Token length every sample is zero-padded to.
            clip_fea (Tensor): Image features for ``img_emb``; required.
            y (List[Tensor]): Conditioning latents concatenated to ``x`` along
                dim 0; required.
            audio (Tensor): Audio features; the first frame is split off and
                the rest is grouped in chunks of ``vae_scale`` frames
                (presumably ``[num_people, frames, window, blocks, channels]``
                — TODO confirm against the caller). Required.
            ref_target_masks (Tensor, optional): Reference masks, resized to
                the token grid and flattened; ``None`` is passed through to
                the blocks as ``None``.

        Returns:
            Tensor: Stacked float32 outputs of ``unpatchify``.
        """
        assert clip_fea is not None and y is not None
        if audio is None:
            raise ValueError('audio features are required by WanModel.forward')

        _, _, H, W = x[0].shape
        N_h = H // self.patch_size[1]
        N_w = W // self.patch_size[2]

        x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
        # NOTE(review): only the first sample is cast, as in the original;
        # this looks like it assumes a batch of one — confirm with callers.
        x[0] = x[0].to(context[0].dtype)

        # embeddings
        x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
        grid_sizes = torch.stack(
            [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
        x = [u.flatten(2).transpose(1, 2) for u in x]
        seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
        assert seq_lens.max() <= seq_len
        x = torch.cat([
            torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
                      dim=1) for u in x
        ])

        # time embeddings
        with amp.autocast(dtype=torch.float32):
            e = self.time_embedding(
                sinusoidal_embedding_1d(self.freq_dim, t).float())
            e0 = self.time_projection(e).unflatten(1, (6, self.dim))
            assert e.dtype == torch.float32 and e0.dtype == torch.float32

        # text embedding
        context_lens = None
        context = self.text_embedding(
            torch.stack([
                torch.cat(
                    [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
                for u in context
            ]))

        # clip embedding, prepended to the text context
        context_clip = self.img_emb(clip_fea)
        context = torch.concat([context_clip, context], dim=1).to(x.dtype)

        # audio embedding: the first frame keeps its full window; the later
        # frames are grouped per latent frame and re-windowed
        audio_cond = audio.to(device=x.device, dtype=x.dtype)
        first_frame_audio_emb_s = audio_cond[:, :1, ...]
        latter_frame_audio_emb = audio_cond[:, 1:, ...]
        latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=self.vae_scale)
        middle_index = self.audio_window // 2
        latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...]
        latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
        latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...]
        latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
        latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...]
        latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
        latter_frame_audio_emb_s = torch.concat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2)
        audio_embedding = self.audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s)
        human_num = len(audio_embedding)
        audio_embedding = torch.concat(audio_embedding.split(1), dim=2).to(x.dtype)

        # convert ref_target_masks to token_ref_target_masks; stays None when
        # no masks are given (previously an UnboundLocalError)
        token_ref_target_masks = None
        if ref_target_masks is not None:
            ref_target_masks = ref_target_masks.unsqueeze(0).to(torch.float32)
            token_ref_target_masks = nn.functional.interpolate(ref_target_masks, size=(N_h, N_w), mode='nearest')
            token_ref_target_masks = token_ref_target_masks.squeeze(0)
            token_ref_target_masks = (token_ref_target_masks > 0)
            token_ref_target_masks = token_ref_target_masks.view(token_ref_target_masks.shape[0], -1)
            token_ref_target_masks = token_ref_target_masks.to(x.dtype)

        # teacache: decide whether this pass may reuse its cached residual
        branch = None
        should_calc = True
        if self.enable_teacache:
            modulated_inp = e0 if self.use_ret_steps else e
            branch = self._TEACACHE_BRANCHES[self.cnt % 3]
            should_calc = self._teacache_should_compute(branch, modulated_inp)

        # arguments
        kwargs = dict(
            e=e0,
            seq_lens=seq_lens,
            grid_sizes=grid_sizes,
            freqs=self.freqs,
            context=context,
            context_lens=context_lens,
            audio_embedding=audio_embedding,
            ref_target_masks=token_ref_target_masks,
            human_num=human_num,
        )
        if self.enable_teacache:
            residual_name = f'previous_residual_{branch}'
            if not should_calc:
                x += getattr(self, residual_name)
            else:
                ori_x = x.clone()
                x = self._run_blocks(x, kwargs)
                setattr(self, residual_name, x - ori_x)
        else:
            x = self._run_blocks(x, kwargs)

        # head
        x = self.head(x, e)

        # unpatchify
        x = self.unpatchify(x, grid_sizes)
        if self.enable_teacache:
            self.cnt += 1
            if self.cnt >= self.num_steps:
                self.cnt = 0

        return torch.stack(x).float()

    def unpatchify(self, x, grid_sizes):
        r"""
        Reconstruct video tensors from patch embeddings.

        Args:
            x (List[Tensor]):
                List of patchified features, each with shape [L, C_out * prod(patch_size)]
            grid_sizes (Tensor):
                Original spatial-temporal grid dimensions before patching,
                    shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)

        Returns:
            List[Tensor]:
                Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
        """

        c = self.out_dim
        out = []
        for u, v in zip(x, grid_sizes.tolist()):
            # drop padding tokens, then interleave grid and patch axes
            u = u[:math.prod(v)].view(*v, *self.patch_size, c)
            u = torch.einsum('fhwpqrc->cfphqwr', u)
            u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
            out.append(u)
        return out

    def init_weights(self):
        r"""
        Initialize model parameters using Xavier initialization.

        Text and time embedding weights are then re-drawn from a normal
        distribution (std 0.02) and the output projection weight is zeroed.
        """

        # basic init
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

        # init embeddings
        nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
        for m in self.text_embedding.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=.02)
        for m in self.time_embedding.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=.02)

        # init output layer
        nn.init.zeros_(self.head.head.weight)
wan/modules/t5.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Modified from transformers.models.t5.modeling_t5
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+ import json
6
+ import os
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ from safetensors.torch import load_file
13
+ from optimum.quanto import quantize, freeze, qint8,requantize
14
+
15
+ from .tokenizers import HuggingfaceTokenizer
16
+
17
+ __all__ = [
18
+ 'T5Model',
19
+ 'T5Encoder',
20
+ 'T5Decoder',
21
+ 'T5EncoderModel',
22
+ ]
23
+
24
+
25
+ def fp16_clamp(x):
26
+ if x.dtype == torch.float16 and torch.isinf(x).any():
27
+ clamp = torch.finfo(x.dtype).max - 1000
28
+ x = torch.clamp(x, min=-clamp, max=clamp)
29
+ return x
30
+
31
+
32
def init_weights(m):
    """Initialise one T5 submodule in place; intended for ``Module.apply``.

    Modules of any other type are left untouched.
    """
    if isinstance(m, T5LayerNorm):
        nn.init.ones_(m.weight)
        return
    if isinstance(m, T5Model):
        nn.init.normal_(m.token_embedding.weight, std=1.0)
        return
    if isinstance(m, T5FeedForward):
        in_std = m.dim**-0.5
        nn.init.normal_(m.gate[0].weight, std=in_std)
        nn.init.normal_(m.fc1.weight, std=in_std)
        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
        return
    if isinstance(m, T5Attention):
        kv_std = m.dim**-0.5
        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
        nn.init.normal_(m.k.weight, std=kv_std)
        nn.init.normal_(m.v.weight, std=kv_std)
        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
        return
    if isinstance(m, T5RelativeEmbedding):
        emb_std = (2 * m.num_buckets * m.num_heads)**-0.5
        nn.init.normal_(m.embedding.weight, std=emb_std)
49
+
50
+
51
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))
56
+
57
+
58
class T5LayerNorm(nn.Module):
    """Scale-only normalisation: divide by the root mean square, then scale."""

    def __init__(self, dim, eps=1e-6):
        """
        Args:
            dim: Size of the last axis; the learned scale has this length.
            eps: Value added to the mean square before the reciprocal root.
        """
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Normalise ``x`` over its last axis and multiply by ``self.weight``."""
        # the statistic is always computed in float32
        mean_square = x.float().pow(2).mean(dim=-1, keepdim=True)
        x = x * torch.rsqrt(mean_square + self.eps)
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            # go back to the parameter's half precision before scaling
            x = x.type_as(self.weight)
        return self.weight * x
72
+
73
+
74
class T5Attention(nn.Module):
    """Multi-head attention without score scaling and without biases."""

    def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
        """
        Args:
            dim: Input and output feature width.
            dim_attn: Total width of the heads; must be divisible by
                ``num_heads``.
            num_heads: Number of attention heads.
            dropout: Dropout probability applied to the output.
        """
        assert dim_attn % num_heads == 0
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.num_heads = num_heads
        self.head_dim = dim_attn // num_heads

        # layers
        self.q = nn.Linear(dim, dim_attn, bias=False)
        self.k = nn.Linear(dim, dim_attn, bias=False)
        self.v = nn.Linear(dim, dim_attn, bias=False)
        self.o = nn.Linear(dim_attn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None, mask=None, pos_bias=None):
        """
        x:          [B, L1, C].
        context:    [B, L2, C] or None (self-attention when None).
        mask:       [B, L2] or [B, L1, L2] or None; zeros mark masked keys.
        pos_bias:   additive bias broadcastable to [B, N, L1, L2], or None.
        """
        if context is None:
            context = x
        batch, heads, head_dim = x.size(0), self.num_heads, self.head_dim

        def split_heads(t):
            return t.view(batch, -1, heads, head_dim)

        # compute query, key, value
        q = split_heads(self.q(x))
        k = split_heads(self.k(context))
        v = split_heads(self.v(context))

        # additive bias: relative positions plus a large negative on masked keys
        bias = x.new_zeros(batch, heads, q.size(1), k.size(1))
        if pos_bias is not None:
            bias += pos_bias
        if mask is not None:
            assert mask.ndim in [2, 3]
            if mask.ndim == 2:
                mask = mask.view(batch, 1, 1, -1)
            else:
                mask = mask.unsqueeze(1)
            bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)

        # compute attention (T5 does not use scaling); softmax in float32
        scores = torch.einsum('binc,bjnc->bnij', q, k) + bias
        probs = F.softmax(scores.float(), dim=-1).type_as(scores)
        merged = torch.einsum('bnij,bjnc->binc', probs, v)

        # output
        merged = merged.reshape(batch, -1, heads * head_dim)
        return self.dropout(self.o(merged))
126
+
127
+
128
class T5FeedForward(nn.Module):
    """Gated feed-forward block: ``fc2(dropout(fc1(x) * gate(x)))`` with dropout."""

    def __init__(self, dim, dim_ffn, dropout=0.1):
        """
        Args:
            dim: Input and output feature width.
            dim_ffn: Hidden width.
            dropout: Dropout probability used after the gating and after ``fc2``.
        """
        super().__init__()
        self.dim = dim
        self.dim_ffn = dim_ffn

        # layers
        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the gated projection and map back to ``dim``."""
        hidden = self.fc1(x) * self.gate(x)
        hidden = self.dropout(hidden)
        return self.dropout(self.fc2(hidden))
147
+
148
+
149
class T5SelfAttention(nn.Module):
    """Pre-norm encoder layer: self-attention followed by a feed-forward block."""

    def __init__(self,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        """
        Args:
            dim, dim_attn, dim_ffn, num_heads: Layer sizes.
            num_buckets: Number of relative-position buckets.
            shared_pos: When true the layer uses the ``pos_bias`` passed to
                ``forward``; otherwise it owns a bidirectional
                ``T5RelativeEmbedding``.
            dropout: Dropout probability for the sub-blocks.
        """
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.norm1 = T5LayerNorm(dim)
        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        if shared_pos:
            self.pos_embedding = None
        else:
            self.pos_embedding = T5RelativeEmbedding(
                num_buckets, num_heads, bidirectional=True)

    def forward(self, x, mask=None, pos_bias=None):
        """Return ``x`` after the attention and feed-forward residual updates."""
        if self.shared_pos:
            bias = pos_bias
        else:
            length = x.size(1)
            bias = self.pos_embedding(length, length)
        attended = self.attn(self.norm1(x), mask=mask, pos_bias=bias)
        x = fp16_clamp(x + attended)
        return fp16_clamp(x + self.ffn(self.norm2(x)))
181
+
182
+
183
class T5CrossAttention(nn.Module):
    """Pre-norm decoder layer: self-attention, cross-attention, feed-forward."""

    def __init__(self,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        """
        Args:
            dim, dim_attn, dim_ffn, num_heads: Layer sizes.
            num_buckets: Number of relative-position buckets.
            shared_pos: When true the layer uses the ``pos_bias`` passed to
                ``forward``; otherwise it owns a unidirectional
                ``T5RelativeEmbedding``.
            dropout: Dropout probability for the sub-blocks.
        """
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.norm1 = T5LayerNorm(dim)
        self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm3 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        if shared_pos:
            self.pos_embedding = None
        else:
            self.pos_embedding = T5RelativeEmbedding(
                num_buckets, num_heads, bidirectional=False)

    def forward(self,
                x,
                mask=None,
                encoder_states=None,
                encoder_mask=None,
                pos_bias=None):
        """Return ``x`` after the three residual updates.

        The position bias is applied to the self-attention only; the
        cross-attention attends to ``encoder_states`` under ``encoder_mask``.
        """
        if self.shared_pos:
            bias = pos_bias
        else:
            length = x.size(1)
            bias = self.pos_embedding(length, length)
        self_out = self.self_attn(self.norm1(x), mask=mask, pos_bias=bias)
        x = fp16_clamp(x + self_out)
        cross_out = self.cross_attn(
            self.norm2(x), context=encoder_states, mask=encoder_mask)
        x = fp16_clamp(x + cross_out)
        return fp16_clamp(x + self.ffn(self.norm3(x)))
224
+
225
+
226
class T5RelativeEmbedding(nn.Module):
    """Learned per-head attention bias indexed by bucketed relative position."""

    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
        """
        Args:
            num_buckets: Number of rows in the embedding table.
            num_heads: Number of attention heads (embedding width).
            bidirectional: When true, half of the buckets are used for
                positive offsets and half for the rest; when false, positive
                offsets all fall into bucket 0.
            max_dist: Distance at which the logarithmic buckets saturate.
        """
        super().__init__()
        self.num_buckets = num_buckets
        self.num_heads = num_heads
        self.bidirectional = bidirectional
        self.max_dist = max_dist

        # layers
        self.embedding = nn.Embedding(num_buckets, num_heads)

    def forward(self, lq, lk):
        """Return the bias for ``lq`` queries and ``lk`` keys, shape [1, N, Lq, Lk]."""
        device = self.embedding.weight.device
        key_pos = torch.arange(lk, device=device).unsqueeze(0)
        query_pos = torch.arange(lq, device=device).unsqueeze(1)
        buckets = self._relative_position_bucket(key_pos - query_pos)
        bias = self.embedding(buckets)  # [Lq, Lk, N]
        return bias.permute(2, 0, 1).unsqueeze(0).contiguous()

    def _relative_position_bucket(self, rel_pos):
        """Map signed offsets (key minus query) to bucket indices."""
        # preprocess: split by sign (bidirectional) or keep only non-positive offsets
        if self.bidirectional:
            num_buckets = self.num_buckets // 2
            offset = (rel_pos > 0).long() * num_buckets
            distance = torch.abs(rel_pos)
        else:
            num_buckets = self.num_buckets
            offset = 0
            distance = -torch.min(rel_pos, torch.zeros_like(rel_pos))

        # small distances get one bucket each, larger ones are spaced logarithmically
        max_exact = num_buckets // 2
        log_ratio = torch.log(distance.float() / max_exact) / math.log(
            self.max_dist / max_exact)
        large_bucket = max_exact + (log_ratio * (num_buckets - max_exact)).long()
        large_bucket = torch.min(
            large_bucket, torch.full_like(large_bucket, num_buckets - 1))
        return offset + torch.where(distance < max_exact, distance, large_bucket)
270
+
271
+
272
class T5Encoder(nn.Module):
    """Stack of ``T5SelfAttention`` layers over token embeddings."""

    def __init__(self,
                 vocab,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_layers,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        """
        Args:
            vocab: Either an existing ``nn.Embedding`` to reuse or the
                vocabulary size for a new one.
            dim, dim_attn, dim_ffn, num_heads, num_layers: Model sizes.
            num_buckets: Number of relative-position buckets.
            shared_pos: When true a single bidirectional position embedding
                is shared by all layers; otherwise each layer owns one.
            dropout: Dropout probability.
        """
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        if isinstance(vocab, nn.Embedding):
            self.token_embedding = vocab
        else:
            self.token_embedding = nn.Embedding(vocab, dim)
        if shared_pos:
            self.pos_embedding = T5RelativeEmbedding(
                num_buckets, num_heads, bidirectional=True)
        else:
            self.pos_embedding = None
        self.dropout = nn.Dropout(dropout)
        layers = [
            T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
                            shared_pos, dropout) for _ in range(num_layers)
        ]
        self.blocks = nn.ModuleList(layers)
        self.norm = T5LayerNorm(dim)

        # initialize weights
        self.apply(init_weights)

    def forward(self, ids, mask=None):
        """Encode token ``ids`` (optionally masked) into hidden states."""
        hidden = self.dropout(self.token_embedding(ids))
        shared_bias = None
        if self.shared_pos:
            length = hidden.size(1)
            shared_bias = self.pos_embedding(length, length)
        for layer in self.blocks:
            hidden = layer(hidden, mask, pos_bias=shared_bias)
        return self.dropout(self.norm(hidden))
318
+
319
+
320
class T5Decoder(nn.Module):
    """Stack of ``T5CrossAttention`` layers with a causal self-attention mask."""

    def __init__(self,
                 vocab,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_layers,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        """
        Args:
            vocab: Either an existing ``nn.Embedding`` to reuse or the
                vocabulary size for a new one.
            dim, dim_attn, dim_ffn, num_heads, num_layers: Model sizes.
            num_buckets: Number of relative-position buckets.
            shared_pos: When true a single unidirectional position embedding
                is shared by all layers; otherwise each layer owns one.
            dropout: Dropout probability.
        """
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        if isinstance(vocab, nn.Embedding):
            self.token_embedding = vocab
        else:
            self.token_embedding = nn.Embedding(vocab, dim)
        if shared_pos:
            self.pos_embedding = T5RelativeEmbedding(
                num_buckets, num_heads, bidirectional=False)
        else:
            self.pos_embedding = None
        self.dropout = nn.Dropout(dropout)
        layers = [
            T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
                             shared_pos, dropout) for _ in range(num_layers)
        ]
        self.blocks = nn.ModuleList(layers)
        self.norm = T5LayerNorm(dim)

        # initialize weights
        self.apply(init_weights)

    def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
        """Decode token ``ids`` while attending to ``encoder_states``.

        A missing ``mask`` becomes a lower-triangular mask; a 2-D ``mask`` is
        expanded per query position and made lower-triangular.
        """
        _, seq = ids.size()

        # causal mask
        if mask is None:
            mask = torch.tril(torch.ones(1, seq, seq).to(ids.device))
        elif mask.ndim == 2:
            mask = torch.tril(mask.unsqueeze(1).expand(-1, seq, -1))

        # layers
        hidden = self.dropout(self.token_embedding(ids))
        shared_bias = None
        if self.shared_pos:
            length = hidden.size(1)
            shared_bias = self.pos_embedding(length, length)
        for layer in self.blocks:
            hidden = layer(hidden, mask, encoder_states, encoder_mask,
                           pos_bias=shared_bias)
        return self.dropout(self.norm(hidden))
375
+
376
+
377
class T5Model(nn.Module):
    """Encoder-decoder T5 with a token embedding shared by both halves."""

    def __init__(self,
                 vocab_size,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 encoder_layers,
                 decoder_layers,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        """
        Args:
            vocab_size: Vocabulary size of the embedding and the output head.
            dim, dim_attn, dim_ffn, num_heads: Model sizes.
            encoder_layers / decoder_layers: Depth of each half.
            num_buckets: Number of relative-position buckets.
            shared_pos: Forwarded to the encoder and the decoder.
            dropout: Dropout probability.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.num_buckets = num_buckets

        # layers
        self.token_embedding = nn.Embedding(vocab_size, dim)
        common = (dim, dim_attn, dim_ffn, num_heads)
        self.encoder = T5Encoder(self.token_embedding, *common, encoder_layers,
                                 num_buckets, shared_pos, dropout)
        self.decoder = T5Decoder(self.token_embedding, *common, decoder_layers,
                                 num_buckets, shared_pos, dropout)
        self.head = nn.Linear(dim, vocab_size, bias=False)

        # initialize weights
        self.apply(init_weights)

    def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
        """Return vocabulary logits for ``decoder_ids`` given ``encoder_ids``."""
        memory = self.encoder(encoder_ids, encoder_mask)
        decoded = self.decoder(decoder_ids, decoder_mask, memory, encoder_mask)
        return self.head(decoded)
418
+
419
+
420
def _t5(name,
        encoder_only=False,
        decoder_only=False,
        return_tokenizer=False,
        tokenizer_kwargs=None,
        dtype=torch.float32,
        device='cpu',
        **kwargs):
    """Build a T5 model (full, encoder-only or decoder-only) and optionally its tokenizer.

    Args:
        name: Model name; only used to load the tokenizer ``google/{name}``.
        encoder_only: Build a ``T5Encoder``; ``vocab_size``, ``encoder_layers``
            and ``decoder_layers`` must then be present in ``kwargs``.
        decoder_only: Build a ``T5Decoder``; same ``kwargs`` requirement.
        return_tokenizer: Also create and return a ``HuggingfaceTokenizer``.
        tokenizer_kwargs: Extra keyword arguments for the tokenizer. ``None``
            (the default) means no extra arguments; a shared mutable ``{}``
            default is deliberately avoided.
        dtype: Parameter dtype the model is converted to.
        device: Device the model is created on and moved to.
        **kwargs: Constructor arguments of the selected model class.

    Returns:
        The model, or ``(model, tokenizer)`` when ``return_tokenizer`` is true.
    """
    # sanity check
    assert not (encoder_only and decoder_only)
    if tokenizer_kwargs is None:
        tokenizer_kwargs = {}

    # params: the single-stack classes take `vocab`/`num_layers` instead of
    # the full model's `vocab_size`/`encoder_layers`/`decoder_layers`
    if encoder_only:
        model_cls = T5Encoder
        kwargs['vocab'] = kwargs.pop('vocab_size')
        kwargs['num_layers'] = kwargs.pop('encoder_layers')
        kwargs.pop('decoder_layers')
    elif decoder_only:
        model_cls = T5Decoder
        kwargs['vocab'] = kwargs.pop('vocab_size')
        kwargs['num_layers'] = kwargs.pop('decoder_layers')
        kwargs.pop('encoder_layers')
    else:
        model_cls = T5Model

    # init model
    with torch.device(device):
        model = model_cls(**kwargs)

    # set device
    model = model.to(dtype=dtype, device=device)

    if not return_tokenizer:
        return model

    # init tokenizer
    from .tokenizers import HuggingfaceTokenizer
    tokenizer = HuggingfaceTokenizer(f'google/{name}', **tokenizer_kwargs)
    return model, tokenizer
459
+
460
+
461
def umt5_xxl(**kwargs):
    """Build a ``umt5-xxl`` model via ``_t5``; ``kwargs`` override the defaults below."""
    cfg = {
        'vocab_size': 256384,
        'dim': 4096,
        'dim_attn': 4096,
        'dim_ffn': 10240,
        'num_heads': 64,
        'encoder_layers': 24,
        'decoder_layers': 24,
        'num_buckets': 32,
        'shared_pos': False,
        'dropout': 0.1,
    }
    cfg.update(**kwargs)
    return _t5('umt5-xxl', **cfg)
475
+
476
+
477
class T5EncoderModel:
    """Inference wrapper around the umT5-XXL encoder and its tokenizer."""

    def __init__(
        self,
        text_len,
        dtype=torch.bfloat16,
        device=None,
        checkpoint_path=None,
        tokenizer_path=None,
        shard_fn=None,
        quant=None,
        quant_dir=None
    ):
        """Load the encoder weights and the tokenizer.

        Args:
            text_len: Sequence length the tokenizer pads/truncates to.
            dtype: Parameter dtype of the encoder.
            device: Target device. ``None`` (the default) resolves at call
                time to the current CUDA device, or to ``'cpu'`` when CUDA is
                unavailable. The old default called
                ``torch.cuda.current_device()`` while the class was being
                defined, i.e. at import time.
            checkpoint_path: State-dict file used when ``quant`` is ``None``.
            tokenizer_path: Name or path given to ``HuggingfaceTokenizer``.
            shard_fn: Optional callable ``shard_fn(model, sync_module_states=False)``
                that replaces the plain move to ``device``.
            quant: ``None``, ``"int8"`` or ``"fp8"``.
            quant_dir: Directory holding ``t5_{quant}.safetensors`` and
                ``t5_map_{quant}.json``; required when ``quant`` is set.

        Raises:
            ValueError: On an unknown ``quant`` value, or when ``quant`` is set
                without ``quant_dir``.
        """
        if quant not in (None, "int8", "fp8"):
            raise ValueError(f'quant must be None, "int8" or "fp8", got {quant!r}')
        if quant is not None and quant_dir is None:
            raise ValueError('quant_dir is required when quant is set')
        if device is None:
            device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

        self.text_len = text_len
        self.dtype = dtype
        self.device = device
        self.checkpoint_path = checkpoint_path
        self.tokenizer_path = tokenizer_path

        # init model
        logging.info(f'loading {checkpoint_path}')
        if quant is not None:
            # build the module structure on the meta device, then let
            # requantize materialise it from the quantized weights
            with torch.device('meta'):
                model = umt5_xxl(
                    encoder_only=True,
                    return_tokenizer=False,
                    dtype=dtype,
                    device=torch.device('meta'))
            logging.info(f'Loading quantized T5 from {os.path.join(quant_dir, f"t5_{quant}.safetensors")}')
            model_state_dict = load_file(os.path.join(quant_dir, f"t5_{quant}.safetensors"))
            with open(os.path.join(quant_dir, f"t5_map_{quant}.json"), "r") as f:
                quantization_map = json.load(f)
            requantize(model, model_state_dict, quantization_map, device='cpu')
        else:
            model = umt5_xxl(
                encoder_only=True,
                return_tokenizer=False,
                dtype=dtype,
                device=device).eval().requires_grad_(False)
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
        self.model = model
        self.model.eval().requires_grad_(False)
        if shard_fn is not None:
            self.model = shard_fn(self.model, sync_module_states=False)
        else:
            self.model.to(self.device)

        # init tokenizer
        self.tokenizer = HuggingfaceTokenizer(
            name=tokenizer_path, seq_len=text_len, clean='whitespace')

    def __call__(self, texts, device):
        """Encode ``texts`` on ``device``.

        Returns:
            A list with one tensor per text, cut to the number of positions
            whose attention mask is positive.
        """
        ids, mask = self.tokenizer(
            texts, return_mask=True, add_special_tokens=True)
        ids = ids.to(device)
        mask = mask.to(device)
        seq_lens = mask.gt(0).sum(dim=1).long()
        context = self.model(ids, mask)
        return [u[:v] for u, v in zip(context, seq_lens)]
wan/modules/tokenizers.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import html
3
+ import string
4
+
5
+ import ftfy
6
+ import regex as re
7
+ from transformers import AutoTokenizer
8
+
9
+ __all__ = ['HuggingfaceTokenizer']
10
+
11
+
12
def basic_clean(text):
    """Repair broken text, decode HTML entities and trim the ends.

    Args:
        text: Raw input string.

    Returns:
        The string after ``ftfy.fix_text``, two rounds of
        ``html.unescape`` and a final ``strip()``.
    """
    repaired = ftfy.fix_text(text)
    # Unescape twice so doubly-escaped entities (e.g. ``&amp;lt;``) are
    # fully decoded.
    once = html.unescape(repaired)
    twice = html.unescape(once)
    return twice.strip()
16
+
17
+
18
def whitespace_clean(text):
    """Collapse every run of whitespace to one space and trim the ends.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    return re.sub(r'\s+', ' ', text).strip()
22
+
23
+
24
def canonicalize(text, keep_punctuation_exact_string=None):
    """Lower-case ``text`` and remove ASCII punctuation.

    Underscores are turned into spaces first, then every character in
    ``string.punctuation`` is removed, the result is lower-cased and
    whitespace runs are collapsed to a single space.

    Args:
        text: Input string.
        keep_punctuation_exact_string: Optional marker substring that is
            preserved verbatim; punctuation is only removed from the
            pieces of text between its occurrences.

    Returns:
        The canonicalized, stripped string.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    text = text.replace('_', ' ')

    if not keep_punctuation_exact_string:
        text = text.translate(strip_punct)
    else:
        # Clean each piece around the marker, then stitch the pieces back
        # together with the marker untouched.
        pieces = text.split(keep_punctuation_exact_string)
        cleaned = [piece.translate(strip_punct) for piece in pieces]
        text = keep_punctuation_exact_string.join(cleaned)

    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered).strip()
35
+
36
+
37
class HuggingfaceTokenizer:
    """Wrapper around ``AutoTokenizer`` with optional text cleaning.

    Incoming text can be normalized before tokenization, and when a
    ``seq_len`` is configured every call pads/truncates to that length.
    """

    def __init__(self, name, seq_len=None, clean=None, **kwargs):
        """Load the tokenizer and remember the call-time configuration.

        Args:
            name: Identifier or path passed to
                ``AutoTokenizer.from_pretrained``.
            seq_len: If not ``None``, each call pads and truncates to this
                many tokens.
            clean: Cleaning mode: ``None``, ``'whitespace'``, ``'lower'``
                or ``'canonicalize'``.
            **kwargs: Extra arguments for
                ``AutoTokenizer.from_pretrained``.
        """
        assert clean in (None, 'whitespace', 'lower', 'canonicalize')
        self.name = name
        self.seq_len = seq_len
        self.clean = clean

        # underlying Hugging Face tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
        self.vocab_size = self.tokenizer.vocab_size

    def __call__(self, sequence, **kwargs):
        """Tokenize one string or a collection of strings.

        Args:
            sequence: A single string or an iterable of strings.
            **kwargs: Options forwarded to the underlying tokenizer; they
                override the defaults built here. The extra key
                ``return_mask`` (default ``False``) is consumed by this
                method and not forwarded.

        Returns:
            ``input_ids`` alone, or ``(input_ids, attention_mask)`` when
            ``return_mask`` is true.
        """
        want_mask = kwargs.pop('return_mask', False)

        # Defaults first, caller-supplied options last so they win.
        options = {'return_tensors': 'pt'}
        if self.seq_len is not None:
            options['padding'] = 'max_length'
            options['truncation'] = True
            options['max_length'] = self.seq_len
        options.update(kwargs)

        # A bare string is treated as a batch of one.
        batch = [sequence] if isinstance(sequence, str) else sequence
        if self.clean:
            batch = [self._clean(item) for item in batch]
        encoded = self.tokenizer(batch, **options)

        if not want_mask:
            return encoded.input_ids
        return encoded.input_ids, encoded.attention_mask

    def _clean(self, text):
        """Apply the cleaning mode selected at construction time."""
        mode = self.clean
        if mode == 'canonicalize':
            return canonicalize(basic_clean(text))
        if mode in ('whitespace', 'lower'):
            cleaned = whitespace_clean(basic_clean(text))
            return cleaned.lower() if mode == 'lower' else cleaned
        # Unknown or disabled mode: leave the text untouched.
        return text