Spaces:

JoranF
/

Wan2GP

Runtime error

App Files Files Community

JoranF committed on Jun 14

Commit

ccfe94d

verified ·

1 Parent(s): 86d959c

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +7 -0
.gitignore +42 -0
LICENSE.txt +17 -0
README.md +127 -12
assets/comp_effic.png +3 -0
assets/data_for_diff_stage.jpg +3 -0
assets/i2v_res.png +3 -0
assets/logo.png +0 -0
assets/t2v_res.jpg +3 -0
assets/vben_vs_sota.png +3 -0
assets/video_dit_arch.jpg +3 -0
assets/video_vae_res.jpg +3 -0
docs/CHANGELOG.md +157 -0
docs/CLI.md +226 -0
docs/GETTING_STARTED.md +194 -0
docs/INSTALLATION.md +170 -0
docs/LORAS.md +224 -0
docs/MODELS.md +268 -0
docs/TROUBLESHOOTING.md +338 -0
docs/VACE.md +190 -0
fantasytalking/infer.py +36 -0
fantasytalking/model.py +162 -0
fantasytalking/utils.py +52 -0
hyvideo/__init__.py +0 -0
hyvideo/config.py +534 -0
hyvideo/constants.py +164 -0
hyvideo/data_kits/audio_dataset.py +170 -0
hyvideo/data_kits/audio_preprocessor.py +76 -0
hyvideo/data_kits/data_tools.py +41 -0
hyvideo/data_kits/face_align/__init__.py +1 -0
hyvideo/data_kits/face_align/align.py +34 -0
hyvideo/data_kits/face_align/detface.py +283 -0
hyvideo/diffusion/__init__.py +2 -0
hyvideo/diffusion/pipelines/__init__.py +2 -0
hyvideo/diffusion/pipelines/pipeline_hunyuan_video.py +1438 -0
hyvideo/diffusion/pipelines/pipeline_hunyuan_video_audio.py +1362 -0
hyvideo/diffusion/schedulers/__init__.py +1 -0
hyvideo/diffusion/schedulers/scheduling_flow_match_discrete.py +255 -0
hyvideo/hunyuan.py +1062 -0
hyvideo/modules/__init__.py +26 -0
hyvideo/modules/activation_layers.py +23 -0
hyvideo/modules/attenion.py +362 -0
hyvideo/modules/audio_adapters.py +220 -0
hyvideo/modules/embed_layers.py +158 -0
hyvideo/modules/mlp_layers.py +131 -0
hyvideo/modules/models.py +1159 -0
hyvideo/modules/modulate_layers.py +136 -0
hyvideo/modules/norm_layers.py +88 -0
hyvideo/modules/original models.py +760 -0
hyvideo/modules/placement.py +389 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/comp_effic.png filter=lfs diff=lfs merge=lfs -text
+assets/data_for_diff_stage.jpg filter=lfs diff=lfs merge=lfs -text
+assets/i2v_res.png filter=lfs diff=lfs merge=lfs -text
+assets/t2v_res.jpg filter=lfs diff=lfs merge=lfs -text
+assets/vben_vs_sota.png filter=lfs diff=lfs merge=lfs -text
+assets/video_dit_arch.jpg filter=lfs diff=lfs merge=lfs -text
+assets/video_vae_res.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,42 @@

+.*
+*.py[cod]
+# *.jpg
+*.jpeg
+# *.png
+*.gif
+*.bmp
+*.mp4
+*.mov
+*.mkv
+*.log
+*.zip
+*.pt
+*.pth
+*.ckpt
+*.safetensors
+*.json
+# *.txt
+*.backup
+*.pkl
+*.html
+*.pdf
+*.whl
+*.exe
+cache
+__pycache__/
+storage/
+samples/
+!.gitignore
+!requirements.txt
+.DS_Store
+*DS_Store
+google/
+Wan2.1-T2V-14B/
+Wan2.1-T2V-1.3B/
+Wan2.1-I2V-14B-480P/
+Wan2.1-I2V-14B-720P/
+outputs/
+gradio_outputs/
+ckpts/
+loras/
+loras_i2v/

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+FREE for Non Commercial USE
+You are free to:
+- Share — copy and redistribute the material in any medium or format
+- Adapt — remix, transform, and build upon the material
+The licensor cannot revoke these freedoms as long as you follow the license terms.
+Under the following terms:
+- Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
+- NonCommercial — You may not use the material for commercial purposes.
+- No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
+Notices:
+- You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
+No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.

README.md CHANGED Viewed

@@ -1,12 +1,127 @@
----
-title: Wan2GP
-emoji: ⚡
-colorFrom: green
-colorTo: pink
-sdk: gradio
-sdk_version: 5.34.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Wan2GP
+app_file: wgp.py
+sdk: gradio
+sdk_version: 5.23.0
+---
+# WanGP
+-----
+<p align="center">
+<b>WanGP by DeepBeepMeep : The best Open Source Video Generative Models Accessible to the GPU Poor</b>
+</p>
+WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models with:
+- Low VRAM requirements (as low as 6 GB of VRAM is sufficient for certain models)
+- Support for old GPUs (RTX 10XX, 20xx, ...)
+- Very Fast on the latest GPUs
+- Easy to use Full Web based interface
+- Auto download of the required model adapted to your specific architecture
+- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation
+- Loras Support to customize each model
+- Queuing system : make your shopping list of videos to generate and come back later
+**Discord Server to get Help from Other Users and show your Best Videos:** https://discord.gg/g7efUW9jGV
+**Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
+## 🔥 Latest Updates
+### June 11 2025: WanGP v5.5
+👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar except there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\
+*Hunyuan Video Custom Edit*: Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping his poses. Similar to Vace but less restricted than the Wan models in terms of content...
+### June 6 2025: WanGP v5.41
+👋 Bonus release: Support for **AccVideo** Lora to speed up x2 Video generations in Wan models. Check the Loras documentation to get the usage instructions of AccVideo.\
+You will need to do a *pip install -r requirements.txt*
+### June 6 2025: WanGP v5.4
+👋 World Exclusive : **Hunyuan Video Avatar** Support ! You won't need 80 GB of VRAM nor 32 GB of VRAM, just 10 GB of VRAM will be sufficient to generate up to 15s of high quality speech / song driven Video at a high speed with no quality degradation. Support for TeaCache included.\
+Here is a link to the original repo where you will find some very interesting documentation and examples. https://github.com/Tencent-Hunyuan/HunyuanVideo-Avatar. Kudos to the Hunyuan Video Avatar team for the best model of its kind.\
+Also many thanks to Reevoy24 for his repackaging / completing the documentation
+### May 28 2025: WanGP v5.31
+👋 Added **Phantom 14B**, a model that you can use to transfer objects / people in the video. My preference goes to Vace that remains the king of controlnets.
+VACE improvements: Better sliding window transitions, image mask support in Matanyone, new Extend Video feature, and enhanced background removal options.
+### May 26, 2025: WanGP v5.3
+👋 Settings management revolution! Now you can:
+- Select any generated video and click *Use Selected Video Settings* to instantly reuse its configuration
+- Drag & drop videos to automatically extract their settings metadata
+- Export/import settings as JSON files for easy sharing and backup
+### May 20, 2025: WanGP v5.2
+👋 **CausVid support** - Generate videos in just 4-12 steps with the new distilled Wan model! Also added experimental MoviiGen for 1080p generation (20GB+ VRAM required). Check the Loras documentation to get the usage instructions of CausVid.
+### May 18, 2025: WanGP v5.1
+👋 **LTX Video 13B Distilled** - Generate high-quality videos in less than one minute!
+### May 17, 2025: WanGP v5.0
+👋 **One App to Rule Them All!** Added Hunyuan Video and LTX Video support, plus Vace 14B and integrated prompt enhancer.
+See full changelog: **[Changelog](docs/CHANGELOG.md)**
+## 📋 Table of Contents
+- [🚀 Quick Start](#-quick-start)
+- [📦 Installation](#-installation)
+- [🎯 Usage](#-usage)
+- [📚 Documentation](#-documentation)
+- [🔗 Related Projects](#-related-projects)
+## 🚀 Quick Start
+**One-click installation:** Get started instantly with [Pinokio App](https://pinokio.computer/)
+**Manual installation:**
+```bash
+git clone https://github.com/deepbeepmeep/Wan2GP.git
+cd Wan2GP
+conda create -n wan2gp python=3.10.9
+conda activate wan2gp
+pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
+pip install -r requirements.txt
+```
+**Run the application:**
+```bash
+python wgp.py  # Text-to-video (default)
+python wgp.py --i2v  # Image-to-video
+```
+## 📦 Installation
+For detailed installation instructions for different GPU generations:
+- **[Installation Guide](docs/INSTALLATION.md)** - Complete setup instructions for RTX 10XX to RTX 50XX
+## 🎯 Usage
+### Basic Usage
+- **[Getting Started Guide](docs/GETTING_STARTED.md)** - First steps and basic usage
+- **[Models Overview](docs/MODELS.md)** - Available models and their capabilities
+### Advanced Features
+- **[Loras Guide](docs/LORAS.md)** - Using and managing Loras for customization
+- **[VACE ControlNet](docs/VACE.md)** - Advanced video control and manipulation
+- **[Command Line Reference](docs/CLI.md)** - All available command line options
+## 📚 Documentation
+- **[Changelog](docs/CHANGELOG.md)** - Latest updates and version history
+- **[Troubleshooting](docs/TROUBLESHOOTING.md)** - Common issues and solutions
+## 🔗 Related Projects
+### Other Models for the GPU Poor
+- **[HunyuanVideoGP](https://github.com/deepbeepmeep/HunyuanVideoGP)** - One of the best open source Text to Video generators
+- **[Hunyuan3D-2GP](https://github.com/deepbeepmeep/Hunyuan3D-2GP)** - Image to 3D and text to 3D tool
+- **[FluxFillGP](https://github.com/deepbeepmeep/FluxFillGP)** - Inpainting/outpainting tools based on Flux
+- **[Cosmos1GP](https://github.com/deepbeepmeep/Cosmos1GP)** - Text to world generator and image/video to world
+- **[OminiControlGP](https://github.com/deepbeepmeep/OminiControlGP)** - Flux-derived application for object transfer
+- **[YuE GP](https://github.com/deepbeepmeep/YuEGP)** - Song generator with instruments and singer's voice
+---
+<p align="center">
+Made with ❤️ by DeepBeepMeep
+</p>

assets/comp_effic.png ADDED Viewed

Git LFS Details

SHA256: b0e225caffb4b31295ad150f95ee852e4c3dde4a00ac8f79a2ff500f2ce26b8d
Pointer size: 132 Bytes
Size of remote file: 1.79 MB

assets/data_for_diff_stage.jpg ADDED Viewed

Git LFS Details

SHA256: 59aec08409f2d46b0e640e4e120dc7cca52c08c3de56d026602dbcff1ebf241a
Pointer size: 131 Bytes
Size of remote file: 528 kB

assets/i2v_res.png ADDED Viewed

Git LFS Details

SHA256: 6823b3206d8d0cb18d3b5b949dec1217f1178109ba11f14e977b67e1f7b8a248
Pointer size: 131 Bytes
Size of remote file: 892 kB

assets/logo.png ADDED Viewed

assets/t2v_res.jpg ADDED Viewed

Git LFS Details

SHA256: 91db579092446be2a834bc67721a8e4346936f38c4edb912f459ca3e10f8f439
Pointer size: 131 Bytes
Size of remote file: 301 kB

assets/vben_vs_sota.png ADDED Viewed

Git LFS Details

SHA256: 9a0e86ca85046d2675f97984b88b6e74df07bba8a62a31ab8a1aef50d4eda44e
Pointer size: 132 Bytes
Size of remote file: 1.55 MB

assets/video_dit_arch.jpg ADDED Viewed

Git LFS Details

SHA256: 195dceec6570289d8b01cc51d2e28a7786216f19de55b23978a52610d1646a66
Pointer size: 131 Bytes
Size of remote file: 643 kB

assets/video_vae_res.jpg ADDED Viewed

Git LFS Details

SHA256: d8f9e7f7353848056a615c8ef35ab86ec22976bb46cb27405008b4089701945c
Pointer size: 131 Bytes
Size of remote file: 213 kB

docs/CHANGELOG.md ADDED Viewed

	@@ -0,0 +1,157 @@

+# Changelog
+## 🔥 Latest News
+### June 11 2025: WanGP v5.5
+👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar except there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\
+*Hunyuan Video Custom Edit*: Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping his poses. Similar to Vace but less restricted than the Wan models in terms of content...
+### June 6 2025: WanGP v5.41
+👋 Bonus release: Support for **AccVideo** Lora to speed up x2 Video generations in Wan models. Check the Loras documentation to get the usage instructions of AccVideo.
+### June 6 2025: WanGP v5.4
+👋 World Exclusive : Hunyuan Video Avatar Support ! You won't need 80 GB of VRAM nor 32 GB of VRAM, just 10 GB of VRAM will be sufficient to generate up to 15s of high quality speech / song driven Video at a high speed with no quality degradation. Support for TeaCache included.
+### May 26, 2025: WanGP v5.3
+👋 Happy with a Video generation and want to do more generations using the same settings but you can't remember what you did or you find it too hard to copy/paste each setting one by one from the file metadata? Rejoice! There are now multiple ways to turn this tedious process into a one click task:
+- Select one Video recently generated in the Video Gallery and click *Use Selected Video Settings*
+- Click *Drop File Here* and select a Video you saved somewhere, if the settings metadata have been saved with the Video you will be able to extract them automatically
+- Click *Export Settings to File* to save the current settings on your hard drive. You will be able to use them again later by clicking *Drop File Here* and selecting a Settings json file this time
+### May 23, 2025: WanGP v5.21
+👋 Improvements for Vace: better transitions between Sliding Windows, Support for Image masks in Matanyone, new Extend Video for Vace, different types of automated background removal
+### May 20, 2025: WanGP v5.2
+👋 Added support for Wan CausVid which is a distilled Wan model that can generate nice looking videos in only 4 to 12 steps. The great thing is that Kijai (Kudos to him!) has created a CausVid Lora that can be combined with any existing Wan t2v model 14B like Wan Vace 14B. See [LORAS.md](LORAS.md) for instructions on how to use CausVid.
+Also as an experiment I have added support for the MoviiGen, the first model that claims to be capable of generating 1080p videos (if you have enough VRAM (20GB...) and be ready to wait for a long time...). Don't hesitate to share your impressions on the Discord server.
+### May 18, 2025: WanGP v5.1
+👋 Bonus Day, added LTX Video 13B Distilled: generate in less than one minute, very high quality Videos!
+### May 17, 2025: WanGP v5.0
+👋 One App to Rule Them All! Added support for the other great open source architectures:
+- **Hunyuan Video**: text 2 video (one of the best, if not the best t2v), image 2 video and the recently released Hunyuan Custom (very good identity preservation when injecting a person into a video)
+- **LTX Video 13B** (released last week): very long video support and fast 720p generation. Wan GP version has been greatly optimized and reduced LTX Video VRAM requirements by 4!
+Also:
+- Added support for the best Control Video Model, released 2 days ago: Vace 14B
+- New Integrated prompt enhancer to increase the quality of the generated videos
+*You will need one more `pip install -r requirements.txt`*
+### May 5, 2025: WanGP v4.5
+👋 FantasySpeaking model, you can animate a talking head using a voice track. This works not only on people but also on objects. Also better seamless transitions between Vace sliding windows for very long videos. New high quality processing features (mixed 16/32 bits calculation and 32 bits VAE)
+### April 27, 2025: WanGP v4.4
+👋 Phantom model support, very good model to transfer people or objects into video, works quite well at 720p and with the number of steps > 30
+### April 25, 2025: WanGP v4.3
+👋 Added preview mode and support for Sky Reels v2 Diffusion Forcing for high quality "infinite length videos". Note that Skyreel uses causal attention that is only supported by Sdpa attention so even if you choose another type of attention, some of the processes will use Sdpa attention.
+### April 18, 2025: WanGP v4.2
+👋 FLF2V model support, official support from Wan for image2video start and end frames specialized for 720p.
+### April 17, 2025: WanGP v4.1
+👋 Recam Master model support, view a video from a different angle. The video to process must be at least 81 frames long and you should set at least 15 steps denoising to get good results.
+### April 13, 2025: WanGP v4.0
+👋 Lots of goodies for you!
+- A new UI, tabs were replaced by a Dropdown box to easily switch models
+- A new queuing system that lets you stack in a queue as many text2video, image2video tasks, ... as you want. Each task can rely on complete different generation parameters (different number of frames, steps, loras, ...). Many thanks to **Tophness** for being a big contributor on this new feature
+- Temporal upsampling (Rife) and spatial upsampling (Lanczos) for a smoother video (32 fps or 64 fps) and to enlarge your video by x2 or x4. Check these new advanced options.
+- Wan Vace Control Net support: with Vace you can inject in the scene people or objects, animate a person, perform inpainting or outpainting, continue a video, ... See [VACE.md](VACE.md) for introduction guide.
+- Integrated *Matanyone* tool directly inside WanGP so that you can create easily inpainting masks used in Vace
+- Sliding Window generation for Vace, create windows that can last dozens of seconds
+- New optimizations for old generation GPUs: Generate 5s (81 frames, 15 steps) of Vace 1.3B with only 5GB and in only 6 minutes on a RTX 2080Ti and 5s of t2v 14B in less than 10 minutes.
+### March 27, 2025
+👋 Added support for the new Wan Fun InP models (image2video). The 14B Fun InP has probably better end image support but unfortunately existing loras do not work so well with it. The great novelty is the Fun InP image2 1.3B model: Image 2 Video is now accessible to even lower hardware configuration. It is not as good as the 14B models but very impressive for its size. Many thanks to the VideoX-Fun team (https://github.com/aigc-apps/VideoX-Fun)
+### March 26, 2025
+👋 Good news! Official support for RTX 50xx please check the [installation instructions](INSTALLATION.md).
+### March 24, 2025: Wan2.1GP v3.2
+👋
+- Added Classifier-Free Guidance Zero Star. The video should match better the text prompt (especially with text2video) at no performance cost: many thanks to the **CFG Zero * Team**. Don't hesitate to give them a star if you appreciate the results: https://github.com/WeichenFan/CFG-Zero-star
+- Added back support for PyTorch compilation with Loras. It seems it had been broken for some time
+- Added possibility to keep a number of pregenerated videos in the Video Gallery (useful to compare outputs of different settings)
+*You will need one more `pip install -r requirements.txt`*
+### March 19, 2025: Wan2.1GP v3.1
+👋 Faster launch and RAM optimizations (should require less RAM to run)
+*You will need one more `pip install -r requirements.txt`*
+### March 18, 2025: Wan2.1GP v3.0
+👋
+- New Tab based interface, you can switch from i2v to t2v and conversely without restarting the app
+- Experimental Dual Frames mode for i2v, you can also specify an End frame. It doesn't always work, so you will need a few attempts.
+- You can save default settings in the files *i2v_settings.json* and *t2v_settings.json* that will be used when launching the app (you can also specify the path to different settings files)
+- Slight acceleration with loras
+*You will need one more `pip install -r requirements.txt`*
+Many thanks to *Tophness* who created the framework (and did a big part of the work) of the multitabs and saved settings features
+### March 18, 2025: Wan2.1GP v2.11
+👋 Added more command line parameters to prefill the generation settings + customizable output directory and choice of type of metadata for generated videos. Many thanks to *Tophness* for his contributions.
+*You will need one more `pip install -r requirements.txt` to reflect new dependencies*
+### March 18, 2025: Wan2.1GP v2.1
+👋 More Loras!: added support for 'Safetensors' and 'Replicate' Lora formats.
+*You will need to refresh the requirements with a `pip install -r requirements.txt`*
+### March 17, 2025: Wan2.1GP v2.0
+👋 The Lora festival continues:
+- Clearer user interface
+- Download 30 Loras in one click to try them all (expand the info section)
+- Very easy to use Loras as now Lora presets can input the subject (or other needed terms) of the Lora so that you don't have to modify manually a prompt
+- Added basic macro prompt language to prefill prompts with different values. With one prompt template, you can generate multiple prompts.
+- New Multiple images prompts: you can now combine any number of images with any number of text prompts (need to launch the app with --multiple-images)
+- New command lines options to launch directly the 1.3B t2v model or the 14B t2v model
+### March 14, 2025: Wan2.1GP v1.7
+👋
+- Lora Fest special edition: very fast loading/unload of loras for those Loras collectors around. You can also now add/remove loras in the Lora folder without restarting the app.
+- Added experimental Skip Layer Guidance (advanced settings), that should improve the image quality at no extra cost. Many thanks to the *AmericanPresidentJimmyCarter* for the original implementation
+*You will need to refresh the requirements `pip install -r requirements.txt`*
+### March 13, 2025: Wan2.1GP v1.6
+👋 Better Loras support, accelerated loading Loras.
+*You will need to refresh the requirements `pip install -r requirements.txt`*
+### March 10, 2025: Wan2.1GP v1.5
+👋 Official Teacache support + Smart Teacache (find automatically best parameters for a requested speed multiplier), 10% speed boost with no quality loss, improved lora presets (they can now include prompts and comments to guide the user)
+### March 7, 2025: Wan2.1GP v1.4
+👋 Fix PyTorch compilation, now it is really 20% faster when activated
+### March 4, 2025: Wan2.1GP v1.3
+👋 Support for Image to Video with multiples images for different images/prompts combinations (requires *--multiple-images* switch), and added command line *--preload x* to preload in VRAM x MB of the main diffusion model if you find there is too much unused VRAM and you want to (slightly) accelerate the generation process.
+*If you upgrade you will need to do a `pip install -r requirements.txt` again.*
+### March 4, 2025: Wan2.1GP v1.2
+👋 Implemented tiling on VAE encoding and decoding. No more VRAM peaks at the beginning and at the end
+### March 3, 2025: Wan2.1GP v1.1
+👋 Added Tea Cache support for faster generations: optimization of kijai's implementation (https://github.com/kijai/ComfyUI-WanVideoWrapper/) of teacache (https://github.com/ali-vilab/TeaCache)
+### March 2, 2025: Wan2.1GP by DeepBeepMeep v1
+👋 Brings:
+- Support for all Wan including the Image to Video model
+- Reduced memory consumption by 2, with possibility to generate more than 10s of video at 720p with a RTX 4090 and 10s of video at 480p with less than 12GB of VRAM. Many thanks to RIFLEx (https://github.com/thu-ml/RIFLEx) for their algorithm that allows generating nice looking video longer than 5s.
+- The usual perks: web interface, multiple generations, loras support, sage attention, auto download of models, ...
+## Original Wan Releases
+### February 25, 2025
+👋 We've released the inference code and weights of Wan2.1.
+### February 27, 2025
+👋 Wan2.1 has been integrated into [ComfyUI](https://comfyanonymous.github.io/ComfyUI_examples/wan/). Enjoy!

docs/CLI.md ADDED Viewed

	@@ -0,0 +1,226 @@

+# Command Line Reference
+This document covers all available command line options for WanGP.
+## Basic Usage
+```bash
+# Default launch
+python wgp.py
+# Specific model modes
+python wgp.py --i2v           # Image-to-video
+python wgp.py --t2v           # Text-to-video (default)
+python wgp.py --t2v-14B       # 14B text-to-video model
+python wgp.py --t2v-1-3B      # 1.3B text-to-video model
+python wgp.py --i2v-14B       # 14B image-to-video model
+python wgp.py --i2v-1-3B      # Fun InP 1.3B image-to-video model
+python wgp.py --vace-1-3B     # VACE ControlNet 1.3B model
+```
+## Model and Performance Options
+### Model Configuration
+```bash
+--quantize-transformer BOOL   # Enable/disable transformer quantization (default: True)
+--compile                     # Enable PyTorch compilation (requires Triton)
+--attention MODE              # Force attention mode: sdpa, flash, sage, sage2
+--profile NUMBER              # Performance profile 1-5 (default: 4)
+--preload NUMBER              # Preload N MB of diffusion model in VRAM
+--fp16                        # Force fp16 instead of bf16 models
+--gpu DEVICE                  # Run on specific GPU device (e.g., "cuda:1")
+```
+### Performance Profiles
+- **Profile 1**: Load entire current model in VRAM and keep all unused models in reserved RAM for fast VRAM transfers
+- **Profile 2**: Load model parts as needed, keep all unused models in reserved RAM for fast VRAM transfers
+- **Profile 3**: Load entire current model in VRAM (requires 24GB for 14B model)
+- **Profile 4**: Default and recommended, load model parts as needed, most flexible option
+- **Profile 5**: Minimum RAM usage
+### Memory Management
+```bash
+--perc-reserved-mem-max FLOAT # Max percentage of RAM for reserved memory (< 0.5)
+```
+## Lora Configuration
+```bash
+--lora-dir PATH              # Path to Wan t2v loras directory
+--lora-dir-i2v PATH          # Path to Wan i2v loras directory
+--lora-dir-hunyuan PATH      # Path to Hunyuan t2v loras directory
+--lora-dir-hunyuan-i2v PATH  # Path to Hunyuan i2v loras directory
+--lora-dir-ltxv PATH         # Path to LTX Video loras directory
+--lora-preset PRESET         # Load lora preset file (.lset) on startup
+--check-loras                # Filter incompatible loras (slower startup)
+```
+## Generation Settings
+### Basic Generation
+```bash
+--seed NUMBER                # Set default seed value
+--frames NUMBER              # Set default number of frames to generate
+--steps NUMBER               # Set default number of denoising steps
+--advanced                   # Launch with advanced mode enabled
+```
+### Advanced Generation
+```bash
+--teacache MULTIPLIER        # TeaCache speed multiplier: 0, 1.5, 1.75, 2.0, 2.25, 2.5
+```
+## Interface and Server Options
+### Server Configuration
+```bash
+--server-port PORT           # Gradio server port (default: 7860)
+--server-name NAME           # Gradio server name (default: localhost)
+--listen                     # Make server accessible on network
+--share                      # Create shareable HuggingFace URL for remote access
+--open-browser               # Open browser automatically when launching
+```
+### Interface Options
+```bash
+--lock-config                # Prevent modifying video engine configuration from interface
+--theme THEME_NAME           # UI theme: "default" or "gradio"
+```
+## File and Directory Options
+```bash
+--settings PATH              # Path to folder containing default settings for all models
+--verbose LEVEL              # Information level 0-2 (default: 1)
+```
+## Examples
+### Basic Usage Examples
+```bash
+# Launch with specific model and loras
+python wgp.py --t2v-14B --lora-preset mystyle.lset
+# High-performance setup with compilation
+python wgp.py --compile --attention sage2 --profile 3
+# Low VRAM setup
+python wgp.py --t2v-1-3B --profile 4 --attention sdpa
+# Multiple images with custom lora directory
+python wgp.py --i2v --multiple-images --lora-dir /path/to/shared/loras
+```
+### Server Configuration Examples
+```bash
+# Network accessible server
+python wgp.py --listen --server-port 8080
+# Shareable server with custom theme
+python wgp.py --share --theme gradio --open-browser
+# Locked configuration for public use
+python wgp.py --lock-config --share
+```
+### Advanced Performance Examples
+```bash
+# Maximum performance (requires high-end GPU)
+python wgp.py --compile --attention sage2 --profile 3 --preload 2000
+# Optimized for RTX 2080Ti
+python wgp.py --profile 4 --attention sdpa --teacache 2.0
+# Memory-efficient setup
+python wgp.py --fp16 --profile 4 --perc-reserved-mem-max 0.3
+```
+### TeaCache Configuration
+```bash
+# Different speed multipliers
+python wgp.py --teacache 1.5   # 1.5x speed, minimal quality loss
+python wgp.py --teacache 2.0   # 2x speed, some quality loss
+python wgp.py --teacache 2.5   # 2.5x speed, noticeable quality loss
+python wgp.py --teacache 0     # Disable TeaCache
+```
+## Attention Modes
+### SDPA (Default)
+```bash
+python wgp.py --attention sdpa
+```
+- Available by default with PyTorch
+- Good compatibility with all GPUs
+- Moderate performance
+### Sage Attention
+```bash
+python wgp.py --attention sage
+```
+- Requires Triton installation
+- 30% faster than SDPA
+- Small quality cost
+### Sage2 Attention
+```bash
+python wgp.py --attention sage2
+```
+- Requires Triton and SageAttention 2.x
+- 40% faster than SDPA
+- Best performance option
+### Flash Attention
+```bash
+python wgp.py --attention flash
+```
+- May require CUDA kernel compilation
+- Good performance
+- Can be complex to install on Windows
+## Troubleshooting Command Lines
+### Fallback to Basic Setup
+```bash
+# If advanced features don't work
+python wgp.py --attention sdpa --profile 4 --fp16
+```
+### Debug Mode
+```bash
+# Maximum verbosity for troubleshooting
+python wgp.py --verbose 2 --check-loras
+```
+### Memory Issue Debugging
+```bash
+# Minimal memory usage
+python wgp.py --profile 4 --attention sdpa --perc-reserved-mem-max 0.2
+```
+## Configuration Files
+### Settings Files
+Load custom settings:
+```bash
+python wgp.py --settings /path/to/settings/folder
+```
+### Lora Presets
+Create and share lora configurations:
+```bash
+# Load specific preset
+python wgp.py --lora-preset anime_style.lset
+# With custom lora directory
+python wgp.py --lora-preset mystyle.lset --lora-dir /shared/loras
+```
+## Environment Variables
+While not command line options, these environment variables can affect behavior:
+- `CUDA_VISIBLE_DEVICES` - Limit visible GPUs
+- `PYTORCH_CUDA_ALLOC_CONF` - CUDA memory allocation settings
+- `TRITON_CACHE_DIR` - Triton cache directory (for Sage attention)

docs/GETTING_STARTED.md ADDED Viewed

	@@ -0,0 +1,194 @@

+# Getting Started with WanGP
+This guide will help you get started with WanGP video generation quickly and easily.
+## Prerequisites
+Before starting, ensure you have:
+- A compatible GPU (RTX 10XX or newer recommended)
+- Python 3.10.9 installed
+- At least 6GB of VRAM for basic models
+- Internet connection for model downloads
+## Quick Setup
+### Option 1: One-Click Installation (Recommended)
+Use [Pinokio App](https://pinokio.computer/) for the easiest installation experience.
+### Option 2: Manual Installation
+```bash
+git clone https://github.com/deepbeepmeep/Wan2GP.git
+cd Wan2GP
+conda create -n wan2gp python=3.10.9
+conda activate wan2gp
+pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
+pip install -r requirements.txt
+```
+For detailed installation instructions, see [INSTALLATION.md](INSTALLATION.md).
+## First Launch
+### Basic Launch
+```bash
+python wgp.py
+```
+This launches the WanGP generator with default settings. You will be able to pick from a Drop Down menu which model you want to use.
+### Alternative Modes
+```bash
+python wgp.py --i2v        # Wan Image-to-video mode
+python wgp.py --t2v-1-3B   # Wan Smaller, faster model
+```
+## Understanding the Interface
+When you launch WanGP, you'll see a web interface with several sections:
+### Main Generation Panel
+- **Model Selection**: Dropdown to choose between different models
+- **Prompt**: Text description of what you want to generate
+- **Generate Button**: Start the video generation process
+### Advanced Settings (click checkbox to enable)
+- **Generation Settings**: Steps, guidance, seeds
+- **Loras**: Additional style customizations
+- **Sliding Window**: For longer videos
+## Your First Video
+Let's generate a simple text-to-video:
+1. **Launch WanGP**: `python wgp.py`
+2. **Open Browser**: Navigate to `http://localhost:7860`
+3. **Enter Prompt**: "A cat walking in a garden"
+4. **Click Generate**: Wait for the video to be created
+5. **View Result**: The video will appear in the output section
+### Recommended First Settings
+- **Model**: Wan 2.1 text2video 1.3B (faster, lower VRAM)
+- **Frames**: 49 (about 2 seconds)
+- **Steps**: 20 (good balance of speed/quality)
+## Model Selection
+### Text-to-Video Models
+- **Wan 2.1 T2V 1.3B**: Fastest, lowest VRAM (6GB), good quality
+- **Wan 2.1 T2V 14B**: Best quality, requires more VRAM (12GB+)
+- **Hunyuan Video**: Excellent quality, slower generation
+- **LTX Video**: Good for longer videos
+### Image-to-Video Models
+- **Wan Fun InP 1.3B**: Fast image animation
+- **Wan Fun InP 14B**: Higher quality image animation
+- **VACE**: Advanced control over video generation
+### Choosing the Right Model
+- **Low VRAM (6-8GB)**: Use 1.3B models
+- **Medium VRAM (10-12GB)**: Use 14B models or Hunyuan
+- **High VRAM (16GB+)**: Any model, longer videos
+## Basic Settings Explained
+### Generation Settings
+- **Frames**: Number of frames (more = longer video)
+  - 25 frames ≈ 1 second
+  - 49 frames ≈ 2 seconds
+  - 73 frames ≈ 3 seconds
+- **Steps**: Quality vs Speed tradeoff
+  - 15 steps: Fast, lower quality
+  - 20 steps: Good balance
+  - 30+ steps: High quality, slower
+- **Guidance Scale**: How closely to follow the prompt
+  - 3-5: More creative interpretation
+  - 7-10: Closer to prompt description
+  - 12+: Very literal interpretation
+### Seeds
+- **Random Seed**: Different result each time
+- **Fixed Seed**: Reproducible results
+- **Same seed + modified prompt**: Generate controlled variations
+## Common Beginner Issues
+### "Out of Memory" Errors
+1. Use smaller models (1.3B instead of 14B)
+2. Reduce frame count
+3. Lower resolution in advanced settings
+4. Enable quantization (usually on by default)
+### Slow Generation
+1. Use 1.3B models for speed
+2. Reduce number of steps
+3. Install Sage attention (see [INSTALLATION.md](INSTALLATION.md))
+4. Enable TeaCache: `python wgp.py --teacache 2.0`
+### Poor Quality Results
+1. Increase number of steps (25-30)
+2. Improve prompt description
+3. Use 14B models if you have enough VRAM
+4. Enable Skip Layer Guidance in advanced settings
+## Writing Good Prompts
+### Basic Structure
+```
+[Subject] [Action] [Setting] [Style/Quality modifiers]
+```
+### Examples
+```
+A red sports car driving through a mountain road at sunset, cinematic, high quality
+A woman with long hair walking on a beach, waves in the background, realistic, detailed
+A cat sitting on a windowsill watching rain, cozy atmosphere, soft lighting
+```
+### Tips
+- Be specific about what you want
+- Include style descriptions (cinematic, realistic, etc.)
+- Mention lighting and atmosphere
+- Describe the setting in detail
+- Use quality modifiers (high quality, detailed, etc.)
+## Next Steps
+Once you're comfortable with basic generation:
+1. **Explore Advanced Features**:
+   - [Loras Guide](LORAS.md) - Customize styles and characters
+   - [VACE ControlNet](VACE.md) - Advanced video control
+   - [Command Line Options](CLI.md) - Optimize performance
+2. **Improve Performance**:
+   - Install better attention mechanisms
+   - Optimize memory settings
+   - Use compilation for speed
+3. **Join the Community**:
+   - [Discord Server](https://discord.gg/g7efUW9jGV) - Get help and share videos
+   - Share your best results
+   - Learn from other users
+## Troubleshooting First Steps
+### Installation Issues
+- Ensure Python 3.10.9 is used
+- Check CUDA version compatibility
+- See [INSTALLATION.md](INSTALLATION.md) for detailed steps
+### Generation Issues
+- Check GPU compatibility
+- Verify sufficient VRAM
+- Try basic settings first
+- See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for specific issues
+### Performance Issues
+- Use appropriate model for your hardware
+- Enable performance optimizations
+- Check [CLI.md](CLI.md) for optimization flags
+Remember: Start simple and gradually explore more advanced features as you become comfortable with the basics!

docs/INSTALLATION.md ADDED Viewed

	@@ -0,0 +1,170 @@

+# Installation Guide
+This guide covers installation for different GPU generations and operating systems.
+## Requirements
+- Python 3.10.9
+- Conda or Python venv
+- Compatible GPU (RTX 10XX or newer recommended)
+## Installation for RTX 10XX to RTX 40XX (Stable)
+This installation uses PyTorch 2.6.0 which is well-tested and stable.
+### Step 1: Download and Setup Environment
+```shell
+# Clone the repository
+git clone https://github.com/deepbeepmeep/Wan2GP.git
+cd Wan2GP
+# Create Python 3.10.9 environment using conda
+conda create -n wan2gp python=3.10.9
+conda activate wan2gp
+```
+### Step 2: Install PyTorch
+```shell
+# Install PyTorch 2.6.0 with CUDA 12.4
+pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
+```
+### Step 3: Install Dependencies
+```shell
+# Install core dependencies
+pip install -r requirements.txt
+```
+### Step 4: Optional Performance Optimizations
+#### Sage Attention (30% faster)
+```shell
+# Windows only: Install Triton
+pip install triton-windows
+# For both Windows and Linux
+pip install sageattention==1.0.6
+```
+#### Sage 2 Attention (40% faster)
+```shell
+# Windows
+pip install triton-windows
+pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu126torch2.6.0-cp310-cp310-win_amd64.whl
+# Linux (manual compilation required)
+git clone https://github.com/thu-ml/SageAttention
+cd SageAttention
+pip install -e .
+```
+#### Flash Attention
+```shell
+# May require CUDA kernel compilation on Windows
+pip install flash-attn==2.7.2.post1
+```
+## Installation for RTX 50XX (Beta)
+RTX 50XX GPUs require PyTorch 2.7.0 (beta). This version may be less stable.
+⚠️ **Important:** Use Python 3.10 for compatibility with pip wheels.
+### Step 1: Setup Environment
+```shell
+# Clone and setup (same as above)
+git clone https://github.com/deepbeepmeep/Wan2GP.git
+cd Wan2GP
+conda create -n wan2gp python=3.10.9
+conda activate wan2gp
+```
+### Step 2: Install PyTorch Beta
+```shell
+# Install PyTorch 2.7.0 with CUDA 12.8
+pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
+```
+### Step 3: Install Dependencies
+```shell
+pip install -r requirements.txt
+```
+### Step 4: Optional Optimizations for RTX 50XX
+#### Sage Attention
+```shell
+# Windows
+pip install triton-windows
+pip install sageattention==1.0.6
+# Linux
+pip install sageattention==1.0.6
+```
+#### Sage 2 Attention
+```shell
+# Windows
+pip install triton-windows
+pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu128torch2.7.0-cp310-cp310-win_amd64.whl
+# Linux (manual compilation)
+git clone https://github.com/thu-ml/SageAttention
+cd SageAttention
+pip install -e .
+```
+## Attention Modes
+WanGP supports several attention implementations:
+- **SDPA** (default): Available by default with PyTorch
+- **Sage**: 30% speed boost with small quality cost
+- **Sage2**: 40% speed boost
+- **Flash**: Good performance, may be complex to install on Windows
+## Performance Profiles
+Choose a profile based on your hardware:
+- **Profile 3 (LowRAM_HighVRAM)**: Loads entire model in VRAM, requires 24GB VRAM for 8-bit quantized 14B model
+- **Profile 4 (LowRAM_LowVRAM)**: Default, loads model parts as needed, slower but lower VRAM requirement
+## Troubleshooting
+### Sage Attention Issues
+If Sage attention doesn't work:
+1. Check if Triton is properly installed
+2. Clear Triton cache
+3. Fall back to SDPA attention:
+   ```bash
+   python wgp.py --attention sdpa
+   ```
+### Memory Issues
+- Use lower resolution or shorter videos
+- Enable quantization (default)
+- Use Profile 4 for lower VRAM usage
+- Consider using 1.3B models instead of 14B models
+### GPU Compatibility
+- RTX 10XX, 20XX: Supported with SDPA attention
+- RTX 30XX, 40XX: Full feature support
+- RTX 50XX: Beta support with PyTorch 2.7.0
+For more troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md)

docs/LORAS.md ADDED Viewed

	@@ -0,0 +1,224 @@

+# Loras Guide
+Loras (Low-Rank Adaptations) allow you to customize video generation models by adding specific styles, characters, or effects to your videos.
+## Directory Structure
+Loras are organized in different folders based on the model they're designed for:
+### Text-to-Video Models
+- `loras/` - General t2v loras
+- `loras/1.3B/` - Loras specifically for 1.3B models
+- `loras/14B/` - Loras specifically for 14B models
+### Image-to-Video Models
+- `loras_i2v/` - Image-to-video loras
+### Other Models
+- `loras_hunyuan/` - Hunyuan Video t2v loras
+- `loras_hunyuan_i2v/` - Hunyuan Video i2v loras
+- `loras_ltxv/` - LTX Video loras
+## Custom Lora Directory
+You can specify custom lora directories when launching the app:
+```bash
+# Use shared lora directory for both t2v and i2v
+python wgp.py --lora-dir /path/to/shared/loras --lora-dir-i2v /path/to/shared/loras
+# Specify different directories for different models
+python wgp.py --lora-dir-hunyuan /path/to/hunyuan/loras --lora-dir-ltxv /path/to/ltx/loras
+```
+## Using Loras
+### Basic Usage
+1. Place your lora files in the appropriate directory
+2. Launch WanGP
+3. In the Advanced Tab, select the "Loras" section
+4. Check the loras you want to activate
+5. Set multipliers for each lora (default is 1.0)
+### Lora Multipliers
+Multipliers control the strength of each lora's effect:
+#### Simple Multipliers
+```
+1.2 0.8
+```
+- First lora: 1.2 strength
+- Second lora: 0.8 strength
+#### Time-based Multipliers
+For dynamic effects over generation steps, use comma-separated values:
+```
+0.9,0.8,0.7
+1.2,1.1,1.0
+```
+- For 30 steps: steps 0-9 use first value, 10-19 use second, 20-29 use third
+- First lora: 0.9 → 0.8 → 0.7
+- Second lora: 1.2 → 1.1 → 1.0
+## Lora Presets
+Presets are combinations of loras with predefined multipliers and prompts.
+### Creating Presets
+1. Configure your loras and multipliers
+2. Write a prompt with comments (lines starting with #)
+3. Save as a preset with `.lset` extension
+### Example Preset
+```
+# Use the keyword "ohnvx" to trigger the lora
+A ohnvx character is driving a car through the city
+```
+### Using Presets
+```bash
+# Load preset on startup
+python wgp.py --lora-preset mypreset.lset
+```
+### Managing Presets
+- Edit, save, or delete presets directly from the web interface
+- Presets include comments with usage instructions
+- Share `.lset` files with other users
+## CausVid Lora (Video Generation Accelerator)
+CausVid is a distilled Wan model that generates videos in 4-12 steps with 2x speed improvement.
+### Setup Instructions
+1. Download the CausVid Lora:
+   ```
+   https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors
+   ```
+2. Place in your `loras/` directory
+### Usage
+1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B)
+2. Enable Advanced Mode
+3. In Advanced Generation Tab:
+   - Set Guidance Scale = 1
+   - Set Shift Scale = 7
+4. In Advanced Lora Tab:
+   - Select CausVid Lora
+   - Set multiplier to 0.3
+5. Set generation steps to 12
+6. Generate!
+### CausVid Step/Multiplier Relationship
+- **12 steps**: 0.3 multiplier (recommended)
+- **8 steps**: 0.5-0.7 multiplier
+- **4 steps**: 0.8-1.0 multiplier
+*Note: Lower steps = lower quality (especially motion)*
+## Supported Formats
+WanGP supports multiple lora formats:
+- **Safetensors** (.safetensors)
+- **Replicate** format
+- **Standard PyTorch** (.pt, .pth)
+## AccVid Lora (Video Generation Accelerator)
+AccVid is a distilled Wan model that generates videos with a 2x speed improvement because classifier-free guidance is no longer needed (i.e. Guidance Scale / cfg = 1).
+### Setup Instructions
+1. Download the AccVid Lora:
+- for t2v models:
+   ```
+   https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors
+   ```
+- for i2v models:
+   ```
+   https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_AccVid_I2V_480P_14B_lora_rank32_fp16.safetensors
+   ```
+2. Place in your `loras/` directory or `loras_i2v/` directory
+### Usage
+1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B) or a Wan i2v model
+2. Enable Advanced Mode
+3. In Advanced Generation Tab:
+   - Set Guidance Scale = 1
+   - Set Shift Scale = 5
+4. The number of steps remains unchanged compared to what you would use with the original model, but generation will be two times faster since classifier-free guidance is not needed
+## Performance Tips
+### Fast Loading/Unloading
+- Loras can be added/removed without restarting the app
+- Use the "Refresh" button to detect new loras
+- Enable `--check-loras` to filter incompatible loras (slower startup)
+### Memory Management
+- Loras are loaded on-demand to save VRAM
+- Multiple loras can be used simultaneously
+- Time-based multipliers don't use extra memory
+## Finding Loras
+### Sources
+- **[Civitai](https://civitai.com/)** - Large community collection
+- **HuggingFace** - Official and community loras
+- **Discord Server** - Community recommendations
+### Creating Loras
+- **Kohya** - Popular training tool
+- **OneTrainer** - Alternative training solution
+- **Custom datasets** - Train on your own content
+## Macro System (Advanced)
+Create multiple prompts from templates using macros:
+```
+! {Subject}="cat","woman","man", {Location}="forest","lake","city", {Possessive}="its","her","his"
+In the video, a {Subject} is presented. The {Subject} is in a {Location} and looks at {Possessive} watch.
+```
+This generates:
+1. "In the video, a cat is presented. The cat is in a forest and looks at its watch."
+2. "In the video, a woman is presented. The woman is in a lake and looks at her watch."
+3. "In the video, a man is presented. The man is in a city and looks at his watch."
+## Troubleshooting
+### Lora Not Working
+1. Check if lora is compatible with your model size (1.3B vs 14B)
+2. Verify lora format is supported
+3. Try different multiplier values
+4. Check the lora was trained for your model type (t2v vs i2v)
+### Performance Issues
+1. Reduce number of active loras
+2. Lower multiplier values
+3. Use `--check-loras` to filter incompatible files
+4. Clear lora cache if issues persist
+### Memory Errors
+1. Use fewer loras simultaneously
+2. Reduce model size (use 1.3B instead of 14B)
+3. Lower video resolution or frame count
+4. Enable quantization if not already active
+## Command Line Options
+```bash
+# Lora-related command line options
+--lora-dir path                    # Path to t2v loras directory
+--lora-dir-i2v path               # Path to i2v loras directory
+--lora-dir-hunyuan path           # Path to Hunyuan t2v loras
+--lora-dir-hunyuan-i2v path       # Path to Hunyuan i2v loras
+--lora-dir-ltxv path              # Path to LTX Video loras
+--lora-preset preset              # Load preset on startup
+--check-loras                     # Filter incompatible loras
+```

docs/MODELS.md ADDED Viewed

	@@ -0,0 +1,268 @@

+# Models Overview
+WanGP supports multiple video generation models, each optimized for different use cases and hardware configurations.
+## Wan 2.1 Text2Video Models
+Please note that the term *Text2Video* refers to the underlying Wan architecture. Because it has been greatly improved over time, many derived Text2Video models can now generate videos using images.
+#### Wan 2.1 Text2Video 1.3B
+- **Size**: 1.3 billion parameters
+- **VRAM**: 6GB minimum
+- **Speed**: Fast generation
+- **Quality**: Good quality for the size
+- **Best for**: Quick iterations, lower-end hardware
+- **Command**: `python wgp.py --t2v-1-3B`
+#### Wan 2.1 Text2Video 14B
+- **Size**: 14 billion parameters
+- **VRAM**: 12GB+ recommended
+- **Speed**: Slower but higher quality
+- **Quality**: Excellent detail and coherence
+- **Best for**: Final production videos
+- **Command**: `python wgp.py --t2v-14B`
+#### Wan Vace 1.3B
+- **Type**: ControlNet for advanced video control
+- **VRAM**: 6GB minimum
+- **Features**: Motion transfer, object injection, inpainting
+- **Best for**: Advanced video manipulation
+- **Command**: `python wgp.py --vace-1.3B`
+#### Wan Vace 14B
+- **Type**: Large ControlNet model
+- **VRAM**: 12GB+ recommended
+- **Features**: All Vace features with higher quality
+- **Best for**: Professional video editing workflows
+#### MoviiGen (Experimental)
+- **Resolution**: Claims 1080p capability
+- **VRAM**: 20GB+ required
+- **Speed**: Very slow generation
+- **Features**: Should generate cinema-like video, specialized for 2.1:1 aspect ratios
+- **Status**: Experimental, feedback welcome
+<BR>
+## Wan 2.1 Image-to-Video Models
+#### Wan 2.1 Image2Video 14B
+- **Size**: 14 billion parameters
+- **VRAM**: 12GB+ recommended
+- **Speed**: Slower but higher quality
+- **Quality**: Excellent detail and coherence
+- **Best for**: Most Loras available work with this model
+- **Command**: `python wgp.py --i2v-14B`
+#### FLF2V
+- **Type**: Start/end frame specialist
+- **Resolution**: Optimized for 720p
+- **Official**: Wan team supported
+- **Use case**: Image-to-video with specific endpoints
+<BR>
+## Wan 2.1 Specialized Models
+#### FantasySpeaking
+- **Type**: Talking head animation
+- **Input**: Voice track + image
+- **Works on**: People and objects
+- **Use case**: Lip-sync and voice-driven animation
+#### Phantom
+- **Type**: Person/object transfer
+- **Resolution**: Works well at 720p
+- **Requirements**: 30+ steps for good results
+- **Best for**: Transferring subjects between videos
+#### Recam Master
+- **Type**: Viewpoint change
+- **Requirements**: 81+ frame input videos, 15+ denoising steps
+- **Use case**: View same scene from different angles
+#### Sky Reels v2
+- **Type**: Diffusion Forcing model
+- **Specialty**: "Infinite length" videos
+- **Features**: High quality continuous generation
+<BR>
+## Wan Fun InP Models
+#### Wan Fun InP 1.3B
+- **Size**: 1.3 billion parameters
+- **VRAM**: 6GB minimum
+- **Quality**: Good for the size, accessible to lower hardware
+- **Best for**: Entry-level image animation
+- **Command**: `python wgp.py --i2v-1-3B`
+#### Wan Fun InP 14B
+- **Size**: 14 billion parameters
+- **VRAM**: 12GB+ recommended
+- **Quality**: Better end image support
+- **Limitation**: Existing loras don't work as well
+<BR>
+## Wan Special Loras
+### CausVid
+- **Type**: Distilled model (Lora implementation)
+- **Speed**: 4-12 steps generation, 2x faster
+- **Compatible**: Works with Wan 14B models
+- **Setup**: Requires CausVid Lora (see [LORAS.md](LORAS.md))
+<BR>
+## Hunyuan Video Models
+#### Hunyuan Video Text2Video
+- **Quality**: Among the best open source t2v models
+- **VRAM**: 12GB+ recommended
+- **Speed**: Slower generation but excellent results
+- **Features**: Superior text adherence and video quality, up to 10s of video
+- **Best for**: High-quality text-to-video generation
+#### Hunyuan Video Custom
+- **Specialty**: Identity preservation
+- **Use case**: Injecting specific people into videos
+- **Quality**: Excellent for character consistency
+- **Best for**: Character-focused video generation
+#### Hunyuan Video Avatar
+- **Specialty**: Generate up to 15s of high quality speech / song driven video.
+- **Use case**: Injecting specific people into videos
+- **Quality**: Excellent for character consistency
+- **Best for**: Character-focused video generation, Video synchronized with voice
+<BR>
+## LTX Video Models
+#### LTX Video 13B
+- **Specialty**: Long video generation
+- **Resolution**: Fast 720p generation
+- **VRAM**: Optimized by WanGP (4x reduction in requirements)
+- **Best for**: Longer duration videos
+#### LTX Video 13B Distilled
+- **Speed**: Generate in less than one minute
+- **Quality**: Very high quality despite speed
+- **Best for**: Rapid prototyping and quick results
+<BR>
+## Model Selection Guide
+### By Hardware (VRAM)
+#### 6-8GB VRAM
+- Wan 2.1 T2V 1.3B
+- Wan Fun InP 1.3B
+- Wan Vace 1.3B
+#### 10-12GB VRAM
+- Wan 2.1 T2V 14B
+- Wan Fun InP 14B
+- Hunyuan Video (with optimizations)
+- LTX Video 13B
+#### 16GB+ VRAM
+- All models supported
+- Longer videos possible
+- Higher resolutions
+- Multiple simultaneous Loras
+#### 20GB+ VRAM
+- MoviiGen (experimental 1080p)
+- Very long videos
+- Maximum quality settings
+### By Use Case
+#### Quick Prototyping
+1. **LTX Video 13B Distilled** - Fastest, high quality
+2. **Wan 2.1 T2V 1.3B** - Fast, good quality
+3. **CausVid Lora** - 4-12 steps, very fast
+#### Best Quality
+1. **Hunyuan Video** - Overall best t2v quality
+2. **Wan 2.1 T2V 14B** - Excellent Wan quality
+3. **Wan Vace 14B** - Best for controlled generation
+#### Advanced Control
+1. **Wan Vace 14B/1.3B** - Motion transfer, object injection
+2. **Phantom** - Person/object transfer
+3. **FantasySpeaking** - Voice-driven animation
+#### Long Videos
+1. **LTX Video 13B** - Specialized for length
+2. **Sky Reels v2** - Infinite length videos
+3. **Wan Vace + Sliding Windows** - Up to 1 minute
+#### Lower Hardware
+1. **Wan Fun InP 1.3B** - Image-to-video
+2. **Wan 2.1 T2V 1.3B** - Text-to-video
+3. **Wan Vace 1.3B** - Advanced control
+<BR>
+## Performance Comparison
+### Speed (Relative)
+1. **CausVid Lora** (4-12 steps) - Fastest
+2. **LTX Video Distilled** - Very fast
+3. **Wan 1.3B models** - Fast
+4. **Wan 14B models** - Medium
+5. **Hunyuan Video** - Slower
+6. **MoviiGen** - Slowest
+### Quality (Subjective)
+1. **Hunyuan Video** - Highest overall
+2. **Wan 14B models** - Excellent
+3. **LTX Video models** - Very good
+4. **Wan 1.3B models** - Good
+5. **CausVid** - Good (varies with steps)
+### VRAM Efficiency
+1. **Wan 1.3B models** - Most efficient
+2. **LTX Video** (with WanGP optimizations)
+3. **Wan 14B models**
+4. **Hunyuan Video**
+5. **MoviiGen** - Least efficient
+<BR>
+## Model Switching
+WanGP allows switching between models without restarting:
+1. Use the dropdown menu in the web interface
+2. Models are loaded on-demand
+3. Previous model is unloaded to save VRAM
+4. Settings are preserved when possible
+<BR>
+## Tips for Model Selection
+### First Time Users
+Start with **Wan 2.1 T2V 1.3B** to learn the interface and test your hardware.
+### Production Work
+Use **Hunyuan Video** or **Wan 14B** models for final output quality.
+### Experimentation
+**CausVid Lora** or **LTX Distilled** for rapid iteration and testing.
+### Specialized Tasks
+- **VACE** for advanced control
+- **FantasySpeaking** for talking heads
+- **LTX Video** for long sequences
+### Hardware Optimization
+Always start with the largest model your VRAM can handle, then optimize settings for speed vs quality based on your needs.

docs/TROUBLESHOOTING.md ADDED Viewed

	@@ -0,0 +1,338 @@

+# Troubleshooting Guide
+This guide covers common issues and their solutions when using WanGP.
+## Installation Issues
+### PyTorch Installation Problems
+#### CUDA Version Mismatch
+**Problem**: PyTorch can't detect GPU or CUDA errors
+**Solution**:
+```bash
+# Check your CUDA version
+nvidia-smi
+# Install matching PyTorch version
+# For CUDA 12.4 (RTX 10XX-40XX)
+pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
+# For CUDA 12.8 (RTX 50XX)
+pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
+```
+#### Python Version Issues
+**Problem**: Package compatibility errors
+**Solution**: Ensure you're using Python 3.10.9
+```bash
+python --version  # Should show 3.10.9
+conda create -n wan2gp python=3.10.9
+```
+### Dependency Installation Failures
+#### Triton Installation (Windows)
+**Problem**: `pip install triton-windows` fails
+**Solution**:
+1. Update pip: `pip install --upgrade pip`
+2. Try pre-compiled wheel
+3. Fall back to SDPA attention: `python wgp.py --attention sdpa`
+#### SageAttention Compilation Issues
+**Problem**: SageAttention installation fails
+**Solution**:
+1. Install Visual Studio Build Tools (Windows)
+2. Use pre-compiled wheels when available
+3. Fall back to basic attention modes
+## Memory Issues
+### CUDA Out of Memory
+#### During Model Loading
+**Problem**: "CUDA out of memory" when loading model
+**Solutions**:
+```bash
+# Use smaller model
+python wgp.py --t2v-1-3B
+# Enable quantization (usually default)
+python wgp.py --quantize-transformer True
+# Use memory-efficient profile
+python wgp.py --profile 4
+# Reduce preloaded model size
+python wgp.py --preload 0
+```
+#### During Video Generation
+**Problem**: Memory error during generation
+**Solutions**:
+1. Reduce frame count (shorter videos)
+2. Lower resolution in advanced settings
+3. Use lower batch size
+4. Clear GPU cache between generations
+### System RAM Issues
+#### High RAM Usage
+**Problem**: System runs out of RAM
+**Solutions**:
+```bash
+# Limit reserved memory
+python wgp.py --perc-reserved-mem-max 0.3
+# Use minimal RAM profile
+python wgp.py --profile 5
+# Enable swap file (OS level)
+```
+## Performance Issues
+### Slow Generation Speed
+#### General Optimization
+```bash
+# Enable compilation (requires Triton)
+python wgp.py --compile
+# Use faster attention
+python wgp.py --attention sage2
+# Enable TeaCache
+python wgp.py --teacache 2.0
+# Use high-performance profile
+python wgp.py --profile 3
+```
+#### GPU-Specific Optimizations
+**RTX 10XX/20XX Series**:
+```bash
+python wgp.py --attention sdpa --profile 4 --teacache 1.5
+```
+**RTX 30XX/40XX Series**:
+```bash
+python wgp.py --compile --attention sage --profile 3 --teacache 2.0
+```
+**RTX 50XX Series**:
+```bash
+python wgp.py --attention sage --profile 4 --fp16
+```
+### Attention Mechanism Issues
+#### Sage Attention Not Working
+**Problem**: Sage attention fails to compile or work
+**Diagnostic Steps**:
+1. Check Triton installation:
+   ```python
+   import triton
+   print(triton.__version__)
+   ```
+2. Clear Triton cache:
+   ```bash
+   # Windows
+   rmdir /s %USERPROFILE%\.triton
+   # Linux
+   rm -rf ~/.triton
+   ```
+3. Fallback solution:
+   ```bash
+   python wgp.py --attention sdpa
+   ```
+#### Flash Attention Issues
+**Problem**: Flash attention compilation fails
+**Solution**:
+- Windows: Often requires manual CUDA kernel compilation
+- Linux: Usually works with `pip install flash-attn`
+- Fallback: Use Sage or SDPA attention
+## Model-Specific Issues
+### Lora Problems
+#### Loras Not Loading
+**Problem**: Loras don't appear in the interface
+**Solutions**:
+1. Check file format (should be .safetensors, .pt, or .pth)
+2. Verify correct directory:
+   ```
+   loras/          # For t2v models
+   loras_i2v/      # For i2v models
+   loras_hunyuan/  # For Hunyuan models
+   ```
+3. Click "Refresh" button in interface
+4. Use `--check-loras` to filter incompatible files
+#### Lora Compatibility Issues
+**Problem**: Lora causes errors or poor results
+**Solutions**:
+1. Check model size compatibility (1.3B vs 14B)
+2. Verify lora was trained for your model type
+3. Try different multiplier values
+4. Use `--check-loras` flag to auto-filter
+### VACE-Specific Issues
+#### Poor VACE Results
+**Problem**: VACE generates poor quality or unexpected results
+**Solutions**:
+1. Enable Skip Layer Guidance
+2. Use detailed prompts describing all elements
+3. Ensure proper mask creation with Matanyone
+4. Check reference image quality
+5. Use at least 15 steps, preferably 30+
+#### Matanyone Tool Issues
+**Problem**: Mask creation difficulties
+**Solutions**:
+1. Use negative point prompts to refine selection
+2. Create multiple sub-masks and combine them
+3. Try different background removal options
+4. Ensure sufficient contrast in source video
+## Network and Server Issues
+### Gradio Interface Problems
+#### Port Already in Use
+**Problem**: "Port 7860 is already in use"
+**Solution**:
+```bash
+# Use different port
+python wgp.py --server-port 7861
+# Or kill existing process
+# Windows
+netstat -ano | findstr :7860
+taskkill /PID <PID> /F
+# Linux
+lsof -i :7860
+kill <PID>
+```
+#### Interface Not Loading
+**Problem**: Browser shows "connection refused"
+**Solutions**:
+1. Check if server started successfully
+2. Try `http://127.0.0.1:7860` instead of `localhost:7860`
+3. Disable firewall temporarily
+4. Use `--listen` flag for network access
+### Remote Access Issues
+#### Sharing Not Working
+**Problem**: `--share` flag doesn't create public URL
+**Solutions**:
+1. Check internet connection
+2. Try different network
+3. Use `--listen` with port forwarding
+4. Check firewall settings
+## Quality Issues
+### Poor Video Quality
+#### General Quality Improvements
+1. Increase number of steps (25-30+)
+2. Use larger models (14B instead of 1.3B)
+3. Enable Skip Layer Guidance
+4. Improve prompt descriptions
+5. Use higher resolution settings
+#### Specific Quality Issues
+**Blurry Videos**:
+- Increase steps
+- Check source image quality (i2v)
+- Reduce TeaCache multiplier
+- Use higher guidance scale
+**Inconsistent Motion**:
+- Use longer overlap in sliding windows
+- Reduce window size
+- Improve prompt consistency
+- Check control video quality (VACE)
+**Color Issues**:
+- Check model compatibility
+- Adjust guidance scale
+- Verify input image color space
+- Try different VAE settings
+## Advanced Debugging
+### Enable Verbose Output
+```bash
+# Maximum verbosity
+python wgp.py --verbose 2
+# Check lora compatibility
+python wgp.py --check-loras --verbose 2
+```
+### Memory Debugging
+```bash
+# Monitor GPU memory
+nvidia-smi -l 1
+# Reduce memory usage
+python wgp.py --profile 4 --perc-reserved-mem-max 0.2
+```
+### Performance Profiling
+```bash
+# Test different configurations
+python wgp.py --attention sdpa --profile 4  # Baseline
+python wgp.py --attention sage --profile 3  # Performance
+python wgp.py --compile --teacache 2.0      # Maximum speed
+```
+## Getting Help
+### Before Asking for Help
+1. Check this troubleshooting guide
+2. Read the relevant documentation:
+   - [Installation Guide](INSTALLATION.md)
+   - [Getting Started](GETTING_STARTED.md)
+   - [Command Line Reference](CLI.md)
+3. Try basic fallback configuration:
+   ```bash
+   python wgp.py --attention sdpa --profile 4
+   ```
+### Community Support
+- **Discord Server**: https://discord.gg/g7efUW9jGV
+- Provide relevant information:
+  - GPU model and VRAM amount
+  - Python and PyTorch versions
+  - Complete error messages
+  - Command used to launch WanGP
+  - Operating system
+### Reporting Bugs
+When reporting issues:
+1. Include system specifications
+2. Provide complete error logs
+3. List the exact steps to reproduce
+4. Mention any modifications to default settings
+5. Include command line arguments used
+## Emergency Fallback
+If nothing works, try this minimal configuration:
+```bash
+# Absolute minimum setup
+python wgp.py --t2v-1-3B --attention sdpa --profile 4 --teacache 0 --fp16
+# If that fails, check basic PyTorch installation
+python -c "import torch; print(torch.cuda.is_available())"
+```

docs/VACE.md ADDED Viewed

	@@ -0,0 +1,190 @@

+# VACE ControlNet Guide
+VACE is a powerful ControlNet that enables Video-to-Video and Reference-to-Video generation. It allows you to inject your own images into output videos, animate characters, perform inpainting/outpainting, and continue videos.
+## Overview
+VACE is probably one of the most powerful Wan models available. With it, you can:
+- Inject people or objects into scenes
+- Animate characters
+- Perform video inpainting and outpainting
+- Continue existing videos
+- Transfer motion from one video to another
+- Change the style of scenes while preserving depth
+## Getting Started
+### Model Selection
+1. Select either "Vace 1.3B" or "Vace 14B" from the dropdown menu
+2. Note: VACE works best with videos up to 7 seconds with the Riflex option enabled
+### Input Types
+VACE accepts three types of visual hints (which can be combined):
+#### 1. Control Video
+- Transfer motion or depth to a new video
+- Use only the first n frames and extrapolate the rest
+- Perform inpainting with grey color (127) as mask areas
+- Grey areas will be filled based on text prompt and reference images
+#### 2. Reference Images
+- Use as background/setting for the video
+- Inject people or objects of your choice
+- Select multiple reference images
+- **Tip**: Replace complex backgrounds with white for better object integration
+- Always describe injected objects/people explicitly in your text prompt
+#### 3. Video Mask
+- Stronger control over which parts to keep (black) or replace (white)
+- Perfect for inpainting/outpainting
+- Example: A mask that is white everywhere except for black frames at the beginning and end keeps the first/last frames and generates the content in between
+## Common Use Cases
+### Motion Transfer
+**Goal**: Animate a character of your choice using motion from another video
+**Setup**:
+- Reference Images: Your character
+- Control Video: Person performing desired motion
+- Text Prompt: Describe your character and the action
+### Object/Person Injection
+**Goal**: Insert people or objects into a scene
+**Setup**:
+- Reference Images: The people/objects to inject
+- Text Prompt: Describe the scene and explicitly mention the injected elements
+### Character Animation
+**Goal**: Animate a character based on text description
+**Setup**:
+- Control Video: Video of person moving
+- Text Prompt: Detailed description of your character
+### Style Transfer with Depth
+**Goal**: Change scene style while preserving spatial relationships
+**Setup**:
+- Control Video: Original video (for depth information)
+- Text Prompt: New style description
+## Integrated Matanyone Tool
+WanGP includes the Matanyone tool, specifically tuned for VACE workflows. This helps create control videos and masks simultaneously.
+### Creating Face Replacement Masks
+1. Load your video in Matanyone
+2. Click on the face in the first frame
+3. Create a mask for the face
+4. Generate both control video and mask video with "Generate Video Matting"
+5. Export to VACE with "Export to current Video Input and Video Mask"
+6. Load replacement face image in Reference Images field
+### Advanced Matanyone Tips
+- **Negative Point Prompts**: Remove parts from current selection
+- **Sub Masks**: Create multiple independent masks, then combine them
+- **Background Masks**: Select everything except the character (useful for background replacement)
+- Enable/disable sub masks in Matanyone settings
+## Recommended Settings
+### Quality Settings
+- **Skip Layer Guidance**: Turn ON with default configuration for better results
+- **Long Prompts**: Use detailed descriptions, especially for background elements not in reference images
+- **Steps**: Use at least 15 steps for good quality, 30+ for best results
+### Sliding Window Settings
+For very long videos, configure sliding windows properly:
+- **Window Size**: Set appropriate duration for your content
+- **Overlap Frames**: Long enough for motion continuity, short enough to avoid blur propagation
+- **Discard Last Frames**: Remove at least 4 frames from each window (VACE 1.3B tends to blur final frames)
+### Background Removal
+VACE includes automatic background removal options:
+- Use for reference images containing people/objects
+- **Don't use** for landscape/setting reference images (first reference image)
+- Multiple background removal types available
+## Window Sliding for Long Videos
+Generate videos up to 1 minute by merging multiple windows:
+### How It Works
+- Each window uses the corresponding time segment of the control video
+- Example: 0-4s control video → first window, 4-8s → second window, etc.
+- Automatic overlap management ensures smooth transitions
+### Settings
+- **Window Size**: Duration of each generation window
+- **Overlap Frames**: Frames shared between windows for continuity
+- **Discard Last Frames**: Remove poor-quality ending frames
+- **Add Overlapped Noise**: Reduce quality degradation over time
+### Formula
+```
+Generated Frames = [Windows - 1] × [Window Size - Overlap - Discard] + Window Size
+```
+### Multi-Line Prompts (Experimental)
+- Each line of the prompt is used for a different window
+- If there are more windows than prompt lines, the last line is repeated
+- Separate lines with a line break (press Enter)
+## Advanced Features
+### Extend Video
+Click "Extend the Video Sample, Please!" during generation to add more windows dynamically.
+### Noise Addition
+Add noise to overlapped frames to hide accumulated errors and quality degradation.
+### Frame Truncation
+Automatically remove lower-quality final frames from each window (recommended: 4 frames for VACE 1.3B).
+## External Resources
+### Official VACE Resources
+- **GitHub**: https://github.com/ali-vilab/VACE/tree/main/vace/gradios
+- **User Guide**: https://github.com/ali-vilab/VACE/blob/main/UserGuide.md
+- **Preprocessors**: Gradio tools for preparing materials
+### Recommended External Tools
+- **Annotation Tools**: For creating precise masks
+- **Video Editors**: For preparing control videos
+- **Background Removal**: For cleaning reference images
+## Troubleshooting
+### Poor Quality Results
+1. Use longer, more detailed prompts
+2. Enable Skip Layer Guidance
+3. Increase number of steps (30+)
+4. Check reference image quality
+5. Ensure proper mask creation
+### Inconsistent Windows
+1. Increase overlap frames
+2. Use consistent prompting across windows
+3. Add noise to overlapped frames
+4. Reduce discard frames if losing too much content
+### Memory Issues
+1. Use VACE 1.3B instead of 14B
+2. Reduce video length or resolution
+3. Decrease window size
+4. Enable quantization
+### Blurry Results
+1. Reduce overlap frames
+2. Increase discard last frames
+3. Use higher resolution reference images
+4. Check control video quality
+## Tips for Best Results
+1. **Detailed Prompts**: Describe everything in the scene, especially elements not in reference images
+2. **Quality Reference Images**: Use high-resolution, well-lit reference images
+3. **Proper Masking**: Take time to create precise masks with Matanyone
+4. **Iterative Approach**: Start with short videos, then extend successful results
+5. **Background Preparation**: Remove complex backgrounds from object/person reference images
+6. **Consistent Lighting**: Match lighting between reference images and intended scene

fantasytalking/infer.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# Copyright Alibaba Inc. All Rights Reserved.
+from transformers import Wav2Vec2Model, Wav2Vec2Processor
+from .model import FantasyTalkingAudioConditionModel
+from .utils import get_audio_features
+import gc, torch
+def parse_audio(audio_path, num_frames, fps = 23, device = "cuda"):
+    fantasytalking = FantasyTalkingAudioConditionModel(None, 768, 2048).to(device)
+    from mmgp import offload
+    from accelerate import init_empty_weights
+    from fantasytalking.model import AudioProjModel
+    torch.set_grad_enabled(False)
+    with init_empty_weights():
+        proj_model = AudioProjModel( 768, 2048)
+    offload.load_model_data(proj_model, "ckpts/fantasy_proj_model.safetensors")
+    proj_model.to("cpu").eval().requires_grad_(False)
+    wav2vec_model_dir = "ckpts/wav2vec"
+    wav2vec_processor = Wav2Vec2Processor.from_pretrained(wav2vec_model_dir)
+    wav2vec = Wav2Vec2Model.from_pretrained(wav2vec_model_dir, device_map="cpu").eval().requires_grad_(False)
+    wav2vec.to(device)
+    proj_model.to(device)
+    audio_wav2vec_fea = get_audio_features( wav2vec, wav2vec_processor, audio_path, fps, num_frames )
+    audio_proj_fea = proj_model(audio_wav2vec_fea)
+    pos_idx_ranges = fantasytalking.split_audio_sequence( audio_proj_fea.size(1), num_frames=num_frames )
+    audio_proj_split, audio_context_lens = fantasytalking.split_tensor_with_padding( audio_proj_fea, pos_idx_ranges, expand_length=4 )  # [b,21,9+8,768]
+    wav2vec, proj_model= None, None
+    gc.collect()
+    torch.cuda.empty_cache()
+    return audio_proj_split, audio_context_lens

fantasytalking/model.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from wan.modules.attention import pay_attention
+class AudioProjModel(nn.Module):
+    def __init__(self, audio_in_dim=1024, cross_attention_dim=1024):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.proj = torch.nn.Linear(audio_in_dim, cross_attention_dim, bias=False)
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+    def forward(self, audio_embeds):
+        context_tokens = self.proj(audio_embeds)
+        context_tokens = self.norm(context_tokens)
+        return context_tokens  # [B,L,C]
+class WanCrossAttentionProcessor(nn.Module):
+    def __init__(self, context_dim, hidden_dim):
+        super().__init__()
+        self.context_dim = context_dim
+        self.hidden_dim = hidden_dim
+        self.k_proj = nn.Linear(context_dim, hidden_dim, bias=False)
+        self.v_proj = nn.Linear(context_dim, hidden_dim, bias=False)
+        nn.init.zeros_(self.k_proj.weight)
+        nn.init.zeros_(self.v_proj.weight)
+    def __call__(
+        self,
+        q: torch.Tensor,
+        audio_proj: torch.Tensor,
+        latents_num_frames: int = 21,
+        audio_context_lens = None
+    ) -> torch.Tensor:
+        """
+        audio_proj:   [B, 21, L3, C]
+        audio_context_lens: [B*21].
+        """
+        b, l, n, d = q.shape
+        if len(audio_proj.shape) == 4:
+            audio_q = q.view(b * latents_num_frames, -1, n, d)  # [b, 21, l1, n, d]
+            ip_key = self.k_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
+            ip_value = self.v_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
+            qkv_list = [audio_q, ip_key, ip_value]
+            del q, audio_q, ip_key, ip_value
+            audio_x = pay_attention(qkv_list, k_lens =audio_context_lens) #audio_context_lens
+            audio_x = audio_x.view(b, l, n, d)
+            audio_x = audio_x.flatten(2)
+        elif len(audio_proj.shape) == 3:
+            ip_key = self.k_proj(audio_proj).view(b, -1, n, d)
+            ip_value = self.v_proj(audio_proj).view(b, -1, n, d)
+            qkv_list = [q, ip_key, ip_value]
+            del q, ip_key, ip_value
+            audio_x = pay_attention(qkv_list, k_lens =audio_context_lens) #audio_context_lens
+            audio_x = audio_x.flatten(2)
+        return audio_x
+class FantasyTalkingAudioConditionModel(nn.Module):
+    def __init__(self, wan_dit, audio_in_dim: int, audio_proj_dim: int):
+        super().__init__()
+        self.audio_in_dim = audio_in_dim
+        self.audio_proj_dim = audio_proj_dim
+    def split_audio_sequence(self, audio_proj_length, num_frames=81):
+        """
+        Map the audio feature sequence to corresponding latent frame slices.
+        Args:
+            audio_proj_length (int): The total length of the audio feature sequence
+                                    (e.g., 173 in audio_proj[1, 173, 768]).
+            num_frames (int): The number of video frames in the training data (default: 81).
+        Returns:
+            list: A list of [start_idx, end_idx] pairs. Each pair represents the index range
+                (within the audio feature sequence) corresponding to a latent frame.
+        """
+        # Average number of tokens per original video frame
+        tokens_per_frame = audio_proj_length / num_frames
+        # Each latent frame covers 4 video frames, and we want the center
+        tokens_per_latent_frame = tokens_per_frame * 4
+        half_tokens = int(tokens_per_latent_frame / 2)
+        pos_indices = []
+        for i in range(int((num_frames - 1) / 4) + 1):
+            if i == 0:
+                pos_indices.append(0)
+            else:
+                start_token = tokens_per_frame * ((i - 1) * 4 + 1)
+                end_token = tokens_per_frame * (i * 4 + 1)
+                center_token = int((start_token + end_token) / 2) - 1
+                pos_indices.append(center_token)
+        # Build index ranges centered around each position
+        pos_idx_ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
+        # Adjust the first range to avoid negative start index
+        pos_idx_ranges[0] = [
+            -(half_tokens * 2 - pos_idx_ranges[1][0]),
+            pos_idx_ranges[1][0],
+        ]
+        return pos_idx_ranges
+    def split_tensor_with_padding(self, input_tensor, pos_idx_ranges, expand_length=0):
+        """
+        Split the input tensor into subsequences based on index ranges, and apply right-side zero-padding
+        if the range exceeds the input boundaries.
+        Args:
+            input_tensor (Tensor): Input audio tensor of shape [1, L, 768].
+            pos_idx_ranges (list): A list of index ranges, e.g. [[-7, 1], [1, 9], ..., [165, 173]].
+            expand_length (int): Number of tokens to expand on both sides of each subsequence.
+        Returns:
+            sub_sequences (Tensor): A tensor of shape [1, F, L, 768], where L is the length after padding.
+                                    Each element is a padded subsequence.
+            k_lens (Tensor): A tensor of shape [F], representing the actual (unpadded) length of each subsequence.
+                            Useful for ignoring padding tokens in attention masks.
+        """
+        pos_idx_ranges = [
+            [idx[0] - expand_length, idx[1] + expand_length] for idx in pos_idx_ranges
+        ]
+        sub_sequences = []
+        seq_len = input_tensor.size(1)  # 173
+        max_valid_idx = seq_len - 1  # 172
+        k_lens_list = []
+        for start, end in pos_idx_ranges:
+            # Calculate the fill amount
+            pad_front = max(-start, 0)
+            pad_back = max(end - max_valid_idx, 0)
+            # Calculate the start and end indices of the valid part
+            valid_start = max(start, 0)
+            valid_end = min(end, max_valid_idx)
+            # Extract the valid part
+            if valid_start <= valid_end:
+                valid_part = input_tensor[:, valid_start : valid_end + 1, :]
+            else:
+                valid_part = input_tensor.new_zeros((1, 0, input_tensor.size(2)))
+            # In the sequence dimension (the 1st dimension) perform padding
+            padded_subseq = F.pad(
+                valid_part,
+                (0, 0, 0, pad_back + pad_front, 0, 0),
+                mode="constant",
+                value=0,
+            )
+            k_lens_list.append(padded_subseq.size(-2) - pad_back - pad_front)
+            sub_sequences.append(padded_subseq)
+        return torch.stack(sub_sequences, dim=1), torch.tensor(
+            k_lens_list, dtype=torch.long
+        )

fantasytalking/utils.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# Copyright Alibaba Inc. All Rights Reserved.
+import imageio
+import librosa
+import numpy as np
+import torch
+from PIL import Image
+from tqdm import tqdm
+def resize_image_by_longest_edge(image_path, target_size):
+    image = Image.open(image_path).convert("RGB")
+    width, height = image.size
+    scale = target_size / max(width, height)
+    new_size = (int(width * scale), int(height * scale))
+    return image.resize(new_size, Image.LANCZOS)
+def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
+    writer = imageio.get_writer(
+        save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
+    )
+    for frame in tqdm(frames, desc="Saving video"):
+        frame = np.array(frame)
+        writer.append_data(frame)
+    writer.close()
+def get_audio_features(wav2vec, audio_processor, audio_path, fps, num_frames):
+    sr = 16000
+    audio_input, sample_rate = librosa.load(audio_path, sr=sr)  # 采样率为 16kHz
+    start_time = 0
+    # end_time = (0 + (num_frames - 1) * 1) / fps
+    end_time = num_frames / fps
+    start_sample = int(start_time * sr)
+    end_sample = int(end_time * sr)
+    try:
+        audio_segment = audio_input[start_sample:end_sample]
+    except:
+        audio_segment = audio_input
+    input_values = audio_processor(
+        audio_segment, sampling_rate=sample_rate, return_tensors="pt"
+    ).input_values.to("cuda")
+    with torch.no_grad():
+        fea = wav2vec(input_values).last_hidden_state
+    return fea

hyvideo/__init__.py ADDED Viewed

File without changes

hyvideo/config.py ADDED Viewed

	@@ -0,0 +1,534 @@

+import argparse
+from .constants import *
+import re
+from .modules.models import HUNYUAN_VIDEO_CONFIG
+def parse_args(namespace=None):
+    parser = argparse.ArgumentParser(description="HunyuanVideo inference script")
+    parser = add_network_args(parser)
+    parser = add_extra_models_args(parser)
+    parser = add_denoise_schedule_args(parser)
+    parser = add_inference_args(parser)
+    parser = add_parallel_args(parser)
+    args = parser.parse_args(namespace=namespace)
+    args = sanity_check_args(args)
+    return args
+def add_network_args(parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(title="HunyuanVideo network args")
+    group.add_argument(
+        "--quantize-transformer",
+        action="store_true",
+        help="On the fly 'transformer' quantization"
+    )
+    group.add_argument(
+        "--lora-dir-i2v",
+        type=str,
+        default="loras_i2v",
+        help="Path to a directory that contains Loras for i2v"
+    )
+    group.add_argument(
+        "--lora-dir",
+        type=str,
+        default="",
+        help="Path to a directory that contains Loras"
+    )
+    group.add_argument(
+        "--lora-preset",
+        type=str,
+        default="",
+        help="Lora preset to preload"
+    )
+    # group.add_argument(
+    #     "--lora-preset-i2v",
+    #     type=str,
+    #     default="",
+    #     help="Lora preset to preload for i2v"
+    # )
+    group.add_argument(
+        "--profile",
+        type=str,
+        default=-1,
+        help="Profile No"
+    )
+    group.add_argument(
+        "--verbose",
+        type=str,
+        default=1,
+        help="Verbose level"
+    )
+    group.add_argument(
+        "--server-port",
+        type=str,
+        default=0,
+        help="Server port"
+    )
+    group.add_argument(
+        "--server-name",
+        type=str,
+        default="",
+        help="Server name"
+    )
+    group.add_argument(
+        "--open-browser",
+        action="store_true",
+        help="open browser"
+    )
+    group.add_argument(
+        "--t2v",
+        action="store_true",
+        help="text to video mode"
+    )
+    group.add_argument(
+        "--i2v",
+        action="store_true",
+        help="image to video mode"
+    )
+    group.add_argument(
+        "--compile",
+        action="store_true",
+        help="Enable pytorch compilation"
+    )
+    group.add_argument(
+        "--fast",
+        action="store_true",
+        help="use Fast HunyuanVideo model"
+    )
+    group.add_argument(
+        "--fastest",
+        action="store_true",
+        help="activate the best config"
+    )
+    group.add_argument(
+    "--attention",
+    type=str,
+    default="",
+    help="attention mode"
+    )
+    group.add_argument(
+    "--vae-config",
+    type=str,
+    default="",
+    help="vae config mode"
+    )
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="Create a shared URL to access webserver remotely"
+    )
+    parser.add_argument(
+    "--lock-config",
+    action="store_true",
+    help="Prevent modifying the configuration from the web interface"
+    )
+    parser.add_argument(
+        "--preload",
+        type=str,
+        default="0",
+        help="Megabytes of the diffusion model to preload in VRAM"
+    )
+    parser.add_argument(
+        "--multiple-images",
+        action="store_true",
+        help="Allow inputting multiple images with image to video"
+    )
+    # Main model
+    group.add_argument(
+        "--model",
+        type=str,
+        choices=list(HUNYUAN_VIDEO_CONFIG.keys()),
+        default="HYVideo-T/2-cfgdistill",
+    )
+    group.add_argument(
+        "--latent-channels",
+        type=str,
+        default=16,
+        help="Number of latent channels of DiT. If None, it will be determined by `vae`. If provided, "
+        "it still needs to match the latent channels of the VAE model.",
+    )
+    group.add_argument(
+        "--precision",
+        type=str,
+        default="bf16",
+        choices=PRECISIONS,
+        help="Precision mode. Options: fp32, fp16, bf16. Applied to the backbone model and optimizer.",
+    )
+    # RoPE
+    group.add_argument(
+        "--rope-theta", type=int, default=256, help="Theta used in RoPE."
+    )
+    return parser
+def add_extra_models_args(parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(
+        title="Extra models args, including vae, text encoders and tokenizers)"
+    )
+    # - VAE
+    group.add_argument(
+        "--vae",
+        type=str,
+        default="884-16c-hy",
+        choices=list(VAE_PATH),
+        help="Name of the VAE model.",
+    )
+    group.add_argument(
+        "--vae-precision",
+        type=str,
+        default="fp16",
+        choices=PRECISIONS,
+        help="Precision mode for the VAE model.",
+    )
+    group.add_argument(
+        "--vae-tiling",
+        action="store_true",
+        help="Enable tiling for the VAE model to save GPU memory.",
+    )
+    group.set_defaults(vae_tiling=True)
+    group.add_argument(
+        "--text-encoder",
+        type=str,
+        default="llm",
+        choices=list(TEXT_ENCODER_PATH),
+        help="Name of the text encoder model.",
+    )
+    group.add_argument(
+        "--text-encoder-precision",
+        type=str,
+        default="fp16",
+        choices=PRECISIONS,
+        help="Precision mode for the text encoder model.",
+    )
+    group.add_argument(
+        "--text-states-dim",
+        type=int,
+        default=4096,
+        help="Dimension of the text encoder hidden states.",
+    )
+    group.add_argument(
+        "--text-len", type=int, default=256, help="Maximum length of the text input."
+    )
+    group.add_argument(
+        "--tokenizer",
+        type=str,
+        default="llm",
+        choices=list(TOKENIZER_PATH),
+        help="Name of the tokenizer model.",
+    )
+    group.add_argument(
+        "--prompt-template",
+        type=str,
+        default="dit-llm-encode",
+        choices=PROMPT_TEMPLATE,
+        help="Image prompt template for the decoder-only text encoder model.",
+    )
+    group.add_argument(
+        "--prompt-template-video",
+        type=str,
+        default="dit-llm-encode-video",
+        choices=PROMPT_TEMPLATE,
+        help="Video prompt template for the decoder-only text encoder model.",
+    )
+    group.add_argument(
+        "--hidden-state-skip-layer",
+        type=int,
+        default=2,
+        help="Skip layer for hidden states.",
+    )
+    group.add_argument(
+        "--apply-final-norm",
+        action="store_true",
+        help="Apply final normalization to the used text encoder hidden states.",
+    )
+    # - CLIP
+    group.add_argument(
+        "--text-encoder-2",
+        type=str,
+        default="clipL",
+        choices=list(TEXT_ENCODER_PATH),
+        help="Name of the second text encoder model.",
+    )
+    group.add_argument(
+        "--text-encoder-precision-2",
+        type=str,
+        default="fp16",
+        choices=PRECISIONS,
+        help="Precision mode for the second text encoder model.",
+    )
+    group.add_argument(
+        "--text-states-dim-2",
+        type=int,
+        default=768,
+        help="Dimension of the second text encoder hidden states.",
+    )
+    group.add_argument(
+        "--tokenizer-2",
+        type=str,
+        default="clipL",
+        choices=list(TOKENIZER_PATH),
+        help="Name of the second tokenizer model.",
+    )
+    group.add_argument(
+        "--text-len-2",
+        type=int,
+        default=77,
+        help="Maximum length of the second text input.",
+    )
+    return parser
+def add_denoise_schedule_args(parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(title="Denoise schedule args")
+    group.add_argument(
+        "--denoise-type",
+        type=str,
+        default="flow",
+        help="Denoise type for noised inputs.",
+    )
+    # Flow Matching
+    group.add_argument(
+        "--flow-shift",
+        type=float,
+        default=7.0,
+        help="Shift factor for flow matching schedulers.",
+    )
+    group.add_argument(
+        "--flow-reverse",
+        action="store_true",
+        help="If reverse, learning/sampling from t=1 -> t=0.",
+    )
+    group.add_argument(
+        "--flow-solver",
+        type=str,
+        default="euler",
+        help="Solver for flow matching.",
+    )
+    group.add_argument(
+        "--use-linear-quadratic-schedule",
+        action="store_true",
+        help="Use linear quadratic schedule for flow matching."
+        "Following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)",
+    )
+    group.add_argument(
+        "--linear-schedule-end",
+        type=int,
+        default=25,
+        help="End step for linear quadratic schedule for flow matching.",
+    )
+    return parser
+def add_inference_args(parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(title="Inference args")
+    # ======================== Model loads ========================
+    group.add_argument(
+        "--model-base",
+        type=str,
+        default="ckpts",
+        help="Root path of all the models, including t2v models and extra models.",
+    )
+    group.add_argument(
+        "--dit-weight",
+        type=str,
+        default="ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
+        help="Path to the HunyuanVideo model. If None, search the model in the args.model_root."
+        "1. If it is a file, load the model directly."
+        "2. If it is a directory, search the model in the directory. Support two types of models: "
+        "1) named `pytorch_model_*.pt`"
+        "2) named `*_model_states.pt`, where * can be `mp_rank_00`.",
+    )
+    group.add_argument(
+        "--model-resolution",
+        type=str,
+        default="540p",
+        choices=["540p", "720p"],
+        help="Root path of all the models, including t2v models and extra models.",
+    )
+    group.add_argument(
+        "--load-key",
+        type=str,
+        default="module",
+        help="Key to load the model states. 'module' for the main model, 'ema' for the EMA model.",
+    )
+    group.add_argument(
+        "--use-cpu-offload",
+        action="store_true",
+        help="Use CPU offload for the model load.",
+    )
+    # ======================== Inference general setting ========================
+    group.add_argument(
+        "--batch-size",
+        type=int,
+        default=1,
+        help="Batch size for inference and evaluation.",
+    )
+    group.add_argument(
+        "--infer-steps",
+        type=int,
+        default=50,
+        help="Number of denoising steps for inference.",
+    )
+    group.add_argument(
+        "--disable-autocast",
+        action="store_true",
+        help="Disable autocast for denoising loop and vae decoding in pipeline sampling.",
+    )
+    group.add_argument(
+        "--save-path",
+        type=str,
+        default="./results",
+        help="Path to save the generated samples.",
+    )
+    group.add_argument(
+        "--save-path-suffix",
+        type=str,
+        default="",
+        help="Suffix for the directory of saved samples.",
+    )
+    group.add_argument(
+        "--name-suffix",
+        type=str,
+        default="",
+        help="Suffix for the names of saved samples.",
+    )
+    group.add_argument(
+        "--num-videos",
+        type=int,
+        default=1,
+        help="Number of videos to generate for each prompt.",
+    )
+    # ---sample size---
+    group.add_argument(
+        "--video-size",
+        type=int,
+        nargs="+",
+        default=(720, 1280),
+        help="Video size for training. If a single value is provided, it will be used for both height "
+        "and width. If two values are provided, they will be used for height and width "
+        "respectively.",
+    )
+    group.add_argument(
+        "--video-length",
+        type=int,
+        default=129,
+        help="How many frames to sample from a video. if using 3d vae, the number should be 4n+1",
+    )
+    # --- prompt ---
+    group.add_argument(
+        "--prompt",
+        type=str,
+        default=None,
+        help="Prompt for sampling during evaluation.",
+    )
+    group.add_argument(
+        "--seed-type",
+        type=str,
+        default="auto",
+        choices=["file", "random", "fixed", "auto"],
+        help="Seed type for evaluation. If file, use the seed from the CSV file. If random, generate a "
+        "random seed. If fixed, use the fixed seed given by `--seed`. If auto, `csv` will use the "
+        "seed column if available, otherwise use the fixed `seed` value. `prompt` will use the "
+        "fixed `seed` value.",
+    )
+    group.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
+    # Classifier-Free Guidance
+    group.add_argument(
+        "--neg-prompt", type=str, default=None, help="Negative prompt for sampling."
+    )
+    group.add_argument(
+        "--cfg-scale", type=float, default=1.0, help="Classifier free guidance scale."
+    )
+    group.add_argument(
+        "--embedded-cfg-scale",
+        type=float,
+        default=6.0,
+        help="Embeded classifier free guidance scale.",
+    )
+    group.add_argument(
+        "--reproduce",
+        action="store_true",
+        help="Enable reproducibility by setting random seeds and deterministic algorithms.",
+    )
+    return parser
+def add_parallel_args(parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(title="Parallel args")
+    # ======================== Model loads ========================
+    group.add_argument(
+        "--ulysses-degree",
+        type=int,
+        default=1,
+        help="Ulysses degree.",
+    )
+    group.add_argument(
+        "--ring-degree",
+        type=int,
+        default=1,
+        help="Ulysses degree.",
+    )
+    return parser
+def sanity_check_args(args):
+    # VAE channels
+    vae_pattern = r"\d{2,3}-\d{1,2}c-\w+"
+    if not re.match(vae_pattern, args.vae):
+        raise ValueError(
+            f"Invalid VAE model: {args.vae}. Must be in the format of '{vae_pattern}'."
+        )
+    vae_channels = int(args.vae.split("-")[1][:-1])
+    if args.latent_channels is None:
+        args.latent_channels = vae_channels
+    if vae_channels != args.latent_channels:
+        raise ValueError(
+            f"Latent channels ({args.latent_channels}) must match the VAE channels ({vae_channels})."
+        )
+    return args

hyvideo/constants.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import os
+import torch
+__all__ = [
+    "C_SCALE",
+    "PROMPT_TEMPLATE",
+    "MODEL_BASE",
+    "PRECISIONS",
+    "NORMALIZATION_TYPE",
+    "ACTIVATION_TYPE",
+    "VAE_PATH",
+    "TEXT_ENCODER_PATH",
+    "TOKENIZER_PATH",
+    "TEXT_PROJECTION",
+    "DATA_TYPE",
+    "NEGATIVE_PROMPT",
+    "NEGATIVE_PROMPT_I2V",
+    "FLOW_PATH_TYPE",
+    "FLOW_PREDICT_TYPE",
+    "FLOW_LOSS_WEIGHT",
+    "FLOW_SNR_TYPE",
+    "FLOW_SOLVER",
+]
+PRECISION_TO_TYPE = {
+    'fp32': torch.float32,
+    'fp16': torch.float16,
+    'bf16': torch.bfloat16,
+}
+# =================== Constant Values =====================
+# Computation scale factor: 1P = 1_000_000_000_000_000 (10**15). Values are divided by this so that
+# TensorBoard shows PetaFLOPS and does not overflow when logging.
+C_SCALE = 1_000_000_000_000_000
+# When using decoder-only models, we must provide a prompt template to instruct the text encoder
+# on how to generate the text.
+# Each template below contains a single "{}" placeholder.
+# NOTE(review): presumably filled with the user prompt via str.format - confirm in the text encoder.
+# The literal text is runtime data; do not reformat it (the crop offsets further down depend on it).
+# --------------------------------------------------------------------
+# Image description template.
+PROMPT_TEMPLATE_ENCODE = (
+    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
+    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+# Video description template.
+PROMPT_TEMPLATE_ENCODE_VIDEO = (
+    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
+    "1. The main content and theme of the video."
+    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+    "4. background environment, light, style and atmosphere."
+    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+# Image-to-video variant of the image template: adds an "<image>" marker and an assistant header.
+PROMPT_TEMPLATE_ENCODE_I2V = (
+    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the image by detailing the color, shape, size, texture, "
+    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+)
+# Image-to-video variant of the video template: adds an "<image>" marker and an assistant header.
+PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
+    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
+    "1. The main content and theme of the video."
+    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+    "4. background environment, light, style and atmosphere."
+    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+)
+# Default negative prompts (text-to-video and image-to-video).
+NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
+NEGATIVE_PROMPT_I2V = "deformation, a poor composition and deformed video, bad teeth, bad eyes, bad limbs"
+# Registry of the templates above, keyed by template name.
+# NOTE(review): "crop_start" looks like the number of leading template tokens to drop from the
+# encoder output, and the "image_emb_*" entries look like token positions of the image embedding
+# (image_emb_end - image_emb_start == image_emb_len == 576) - confirm against the text encoder.
+PROMPT_TEMPLATE = {
+    "dit-llm-encode": {
+        "template": PROMPT_TEMPLATE_ENCODE,
+        "crop_start": 36,
+    },
+    "dit-llm-encode-video": {
+        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
+        "crop_start": 95,
+    },
+    "dit-llm-encode-i2v": {
+        "template": PROMPT_TEMPLATE_ENCODE_I2V,
+        "crop_start": 36,
+        "image_emb_start": 5,
+        "image_emb_end": 581,
+        "image_emb_len": 576,
+        "double_return_token_id": 271
+    },
+    "dit-llm-encode-video-i2v": {
+        "template": PROMPT_TEMPLATE_ENCODE_VIDEO_I2V,
+        "crop_start": 103,
+        "image_emb_start": 5,
+        "image_emb_end": 581,
+        "image_emb_len": 576,
+        "double_return_token_id": 271
+    },
+}
+# ======================= Model ======================
+# Allowed option values (sets of names) for precision, normalization and activation.
+PRECISIONS = {"fp32", "fp16", "bf16"}
+NORMALIZATION_TYPE = {"layer", "rms"}
+ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
+# =================== Model Path =====================
+# Root checkpoint directory; override with the MODEL_BASE environment variable (default "./ckpts").
+MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")
+# =================== Data =======================
+DATA_TYPE = {"image", "video", "image_video"}
+# 3D VAE: VAE name -> checkpoint directory under MODEL_BASE.
+VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
+# Text Encoder: encoder name -> checkpoint directory ("llm" and "llm-i2v" share one directory).
+TEXT_ENCODER_PATH = {
+    "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
+    "llm": f"{MODEL_BASE}/llava-llama-3-8b",
+    "llm-i2v": f"{MODEL_BASE}/llava-llama-3-8b",
+}
+# Tokenizer: same keys and directories as TEXT_ENCODER_PATH.
+TOKENIZER_PATH = {
+    "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
+    "llm": f"{MODEL_BASE}/llava-llama-3-8b",
+    "llm-i2v": f"{MODEL_BASE}/llava-llama-3-8b",
+}
+# Text projection variants
+TEXT_PROJECTION = {
+    "linear",  # Default, an nn.Linear() layer
+    "single_refiner",  # Single TokenRefiner. Refer to LI-DiT
+}
+# Flow Matching path type
+FLOW_PATH_TYPE = {
+    "linear",               # Linear trajectory between noise and data
+    "gvp",                  # Generalized variance-preserving SDE
+    "vp",                   # Variance-preserving SDE
+}
+# Flow Matching predict type
+FLOW_PREDICT_TYPE = {
+    "velocity",             # Predict velocity
+    "score",                # Predict score
+    "noise",                # Predict noise
+}
+# Flow Matching loss weight
+FLOW_LOSS_WEIGHT = {
+    "velocity",             # Weight loss by velocity
+    "likelihood",           # Weight loss by likelihood
+}
+# Flow Matching SNR type
+FLOW_SNR_TYPE = {
+    "lognorm",              # Log-normal SNR
+    "uniform",              # Uniform SNR
+}
+# Flow Matching solvers
+FLOW_SOLVER = {
+    "euler",                # Euler solver
+}

hyvideo/data_kits/audio_dataset.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import os
+import cv2
+import math
+import json
+import torch
+import random
+import librosa
+import traceback
+import torchvision
+import numpy as np
+import pandas as pd
+from PIL import Image
+from einops import rearrange
+from torch.utils.data import Dataset
+from decord import VideoReader, cpu
+from transformers import CLIPImageProcessor
+import torchvision.transforms as transforms
+from torchvision.transforms import ToPILImage
+def get_audio_feature(feature_extractor, audio_path):
+    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
+    assert sampling_rate == 16000
+    audio_features = []
+    window = 750*640
+    for i in range(0, len(audio_input), window):
+        audio_feature = feature_extractor(audio_input[i:i+window],
+                                        sampling_rate=sampling_rate,
+                                        return_tensors="pt",
+                                        ).input_features
+        audio_features.append(audio_feature)
+    audio_features = torch.cat(audio_features, dim=-1)
+    return audio_features, len(audio_input) // 640
+class VideoAudioTextLoaderVal(Dataset):
+    def __init__(
+        self,
+        image_size: int,
+        meta_file: str,
+        **kwargs,
+    ):
+        super().__init__()
+        self.meta_file = meta_file
+        self.image_size = image_size
+        self.text_encoder = kwargs.get("text_encoder", None)            # llava_text_encoder
+        self.text_encoder_2 = kwargs.get("text_encoder_2", None)        # clipL_text_encoder
+        self.feature_extractor = kwargs.get("feature_extractor", None)
+        self.meta_files = []
+        csv_data = pd.read_csv(meta_file)
+        for idx in range(len(csv_data)):
+            self.meta_files.append(
+                {
+                    "videoid": str(csv_data["videoid"][idx]),
+                    "image_path": str(csv_data["image"][idx]),
+                    "audio_path": str(csv_data["audio"][idx]),
+                    "prompt": str(csv_data["prompt"][idx]),
+                    "fps": float(csv_data["fps"][idx])
+                }
+            )
+        self.llava_transform = transforms.Compose(
+            [
+                transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.ToTensor(),
+                transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
+            ]
+        )
+        self.clip_image_processor = CLIPImageProcessor()
+        self.device = torch.device("cuda")
+        self.weight_dtype = torch.float16
+    def __len__(self):
+        return len(self.meta_files)
+    @staticmethod
+    def get_text_tokens(text_encoder, description, dtype_encode="video"):
+        text_inputs = text_encoder.text2tokens(description, data_type=dtype_encode)
+        text_ids = text_inputs["input_ids"].squeeze(0)
+        text_mask = text_inputs["attention_mask"].squeeze(0)
+        return text_ids, text_mask
+    def get_batch_data(self, idx):
+        meta_file = self.meta_files[idx]
+        videoid = meta_file["videoid"]
+        image_path = meta_file["image_path"]
+        audio_path = meta_file["audio_path"]
+        prompt = "Authentic, Realistic, Natural, High-quality, Lens-Fixed, " + meta_file["prompt"]
+        fps = meta_file["fps"]
+        img_size = self.image_size
+        ref_image = Image.open(image_path).convert('RGB')
+        # Resize reference image
+        w, h = ref_image.size
+        scale = img_size / min(w, h)
+        new_w = round(w * scale / 64) * 64
+        new_h = round(h * scale / 64) * 64
+        if img_size == 704:
+            img_size_long = 1216
+        if new_w * new_h > img_size * img_size_long:
+            import math
+            scale = math.sqrt(img_size * img_size_long / w / h)
+            new_w = round(w * scale / 64) * 64
+            new_h = round(h * scale / 64) * 64
+        ref_image = ref_image.resize((new_w, new_h), Image.LANCZOS)
+        ref_image = np.array(ref_image)
+        ref_image = torch.from_numpy(ref_image)
+        audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_path)
+        audio_prompts = audio_input[0]
+        motion_bucket_id_heads = np.array([25] * 4)
+        motion_bucket_id_exps = np.array([30] * 4)
+        motion_bucket_id_heads = torch.from_numpy(motion_bucket_id_heads)
+        motion_bucket_id_exps = torch.from_numpy(motion_bucket_id_exps)
+        fps = torch.from_numpy(np.array(fps))
+        to_pil = ToPILImage()
+        pixel_value_ref = rearrange(ref_image.clone().unsqueeze(0), "b h w c -> b c h w")   # (b c h w)
+        pixel_value_ref_llava = [self.llava_transform(to_pil(image)) for image in pixel_value_ref]
+        pixel_value_ref_llava = torch.stack(pixel_value_ref_llava, dim=0)
+        pixel_value_ref_clip = self.clip_image_processor(
+            images=Image.fromarray((pixel_value_ref[0].permute(1,2,0)).data.cpu().numpy().astype(np.uint8)),
+            return_tensors="pt"
+        ).pixel_values[0]
+        pixel_value_ref_clip = pixel_value_ref_clip.unsqueeze(0)
+        # Encode text prompts
+        text_ids, text_mask = self.get_text_tokens(self.text_encoder, prompt)
+        text_ids_2, text_mask_2 = self.get_text_tokens(self.text_encoder_2, prompt)
+        # Output batch
+        batch = {
+            "text_prompt": prompt,                         #
+            "videoid": videoid,
+            "pixel_value_ref": pixel_value_ref.to(dtype=torch.float16),                 # 参考图，用于vae提特征 (1, 3, h, w), 取值范围(0, 255)
+            "pixel_value_ref_llava": pixel_value_ref_llava.to(dtype=torch.float16),     # 参考图，用于llava提特征 (1, 3, 336, 336), 取值范围 = CLIP取值范围
+            "pixel_value_ref_clip": pixel_value_ref_clip.to(dtype=torch.float16),       # 参考图，用于clip_image_encoder提特征 (1, 3, 244, 244), 取值范围 = CLIP取值范围
+            "audio_prompts": audio_prompts.to(dtype=torch.float16),
+            "motion_bucket_id_heads": motion_bucket_id_heads.to(dtype=text_ids.dtype),
+            "motion_bucket_id_exps": motion_bucket_id_exps.to(dtype=text_ids.dtype),
+            "fps": fps.to(dtype=torch.float16),
+            "text_ids": text_ids.clone(),                                               # 对应llava_text_encoder
+            "text_mask": text_mask.clone(),                                             # 对应llava_text_encoder
+            "text_ids_2": text_ids_2.clone(),                                           # 对应clip_text_encoder
+            "text_mask_2": text_mask_2.clone(),                                         # 对应clip_text_encoder
+            "audio_len": audio_len,
+            "image_path": image_path,
+            "audio_path": audio_path,
+        }
+        return batch
+    def __getitem__(self, idx):
+        return self.get_batch_data(idx)

hyvideo/data_kits/audio_preprocessor.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+import cv2
+import json
+import time
+import decord
+import einops
+import librosa
+import torch
+import random
+import argparse
+import traceback
+import numpy as np
+from tqdm import tqdm
+from PIL import Image
+from einops import rearrange
+def get_facemask(ref_image, align_instance, area=1.25):
+    # ref_image: (b f c h w)
+    bsz, f, c, h, w = ref_image.shape
+    images = rearrange(ref_image, "b f c h w -> (b f) h w c").data.cpu().numpy().astype(np.uint8)
+    face_masks = []
+    for image in images:
+        image_pil = Image.fromarray(image).convert("RGB")
+        _, _, bboxes_list = align_instance(np.array(image_pil)[:,:,[2,1,0]], maxface=True)
+        try:
+            bboxSrc = bboxes_list[0]
+        except:
+            bboxSrc = [0, 0, w, h]
+        x1, y1, ww, hh = bboxSrc
+        x2, y2 = x1 + ww, y1 + hh
+        ww, hh = (x2-x1) * area, (y2-y1) * area
+        center = [(x2+x1)//2, (y2+y1)//2]
+        x1 = max(center[0] - ww//2, 0)
+        y1 = max(center[1] - hh//2, 0)
+        x2 = min(center[0] + ww//2, w)
+        y2 = min(center[1] + hh//2, h)
+        face_mask = np.zeros_like(np.array(image_pil))
+        face_mask[int(y1):int(y2), int(x1):int(x2)] = 1.0
+        face_masks.append(torch.from_numpy(face_mask[...,:1]))
+    face_masks = torch.stack(face_masks, dim=0)     # (b*f, h, w, c)
+    face_masks = rearrange(face_masks, "(b f) h w c -> b c f h w", b=bsz, f=f)
+    face_masks = face_masks.to(device=ref_image.device, dtype=ref_image.dtype)
+    return face_masks
+def encode_audio(wav2vec, audio_feats, fps, num_frames=129):
+    if fps == 25:
+        start_ts = [0]
+        step_ts = [1]
+    elif fps == 12.5:
+        start_ts = [0]
+        step_ts = [2]
+    else:
+        start_ts = [0]
+        step_ts = [1]
+    num_frames = min(num_frames, 400)
+    audio_feats = wav2vec.encoder(audio_feats.unsqueeze(0)[:, :, :3000], output_hidden_states=True).hidden_states
+    audio_feats = torch.stack(audio_feats, dim=2)
+    audio_feats = torch.cat([torch.zeros_like(audio_feats[:,:4]), audio_feats], 1)
+    audio_prompts = []
+    for bb in range(1):
+        audio_feats_list = []
+        for f in range(num_frames):
+            cur_t = (start_ts[bb] + f * step_ts[bb]) * 2
+            audio_clip = audio_feats[bb:bb+1, cur_t: cur_t+10]
+            audio_feats_list.append(audio_clip)
+        audio_feats_list = torch.stack(audio_feats_list, 1)
+        audio_prompts.append(audio_feats_list)
+    audio_prompts = torch.cat(audio_prompts)
+    return audio_prompts

hyvideo/data_kits/data_tools.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import os
+import cv2
+import torch
+import numpy as np
+import imageio
+import torchvision
+from einops import rearrange
+def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, quality=8):
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        x = torch.clamp(x,0,1)
+        x = (x * 255).numpy().astype(np.uint8)
+        outputs.append(x)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    imageio.mimsave(path, outputs, fps=fps, quality=quality)
+def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
+    crop_h, crop_w = crop_img.shape[:2]
+    target_w, target_h = size
+    scale_h, scale_w = target_h / crop_h, target_w / crop_w
+    if scale_w > scale_h:
+        resize_h = int(target_h*resize_ratio)
+        resize_w = int(crop_w / crop_h * resize_h)
+    else:
+        resize_w = int(target_w*resize_ratio)
+        resize_h = int(crop_h / crop_w * resize_w)
+    crop_img = cv2.resize(crop_img, (resize_w, resize_h))
+    pad_left = (target_w - resize_w) // 2
+    pad_top = (target_h - resize_h) // 2
+    pad_right = target_w - resize_w - pad_left
+    pad_bottom = target_h - resize_h - pad_top
+    crop_img = cv2.copyMakeBorder(crop_img, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color)
+    return crop_img

hyvideo/data_kits/face_align/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .align import AlignImage

hyvideo/data_kits/face_align/align.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import os
+import sys
+import torch
+from .detface import DetFace
+class AlignImage(object):
+    def __init__(self, device='cuda', det_path=''):
+        self.facedet = DetFace(pt_path=det_path, confThreshold=0.5, nmsThreshold=0.45, device=device)
+    @torch.no_grad()
+    def __call__(self, im, maxface=False):
+        bboxes, kpss, scores = self.facedet.detect(im)
+        face_num = bboxes.shape[0]
+        five_pts_list = []
+        scores_list = []
+        bboxes_list = []
+        for i in range(face_num):
+            five_pts_list.append(kpss[i].reshape(5,2))
+            scores_list.append(scores[i])
+            bboxes_list.append(bboxes[i])
+        if maxface and face_num>1:
+            max_idx = 0
+            max_area = (bboxes[0, 2])*(bboxes[0, 3])
+            for i in range(1, face_num):
+                area = (bboxes[i,2])*(bboxes[i,3])
+                if area>max_area:
+                    max_idx = i
+            five_pts_list = [five_pts_list[max_idx]]
+            scores_list = [scores_list[max_idx]]
+            bboxes_list = [bboxes_list[max_idx]]
+        return five_pts_list, scores_list, bboxes_list

hyvideo/data_kits/face_align/detface.py ADDED Viewed

	@@ -0,0 +1,283 @@

+# -*- coding: UTF-8 -*-
+import os
+import cv2
+import numpy as np
+import torch
+import torchvision
+def xyxy2xywh(x):
+    # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
+    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
+    y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
+    y[:, 2] = x[:, 2] - x[:, 0]  # width
+    y[:, 3] = x[:, 3] - x[:, 1]  # height
+    return y
+def xywh2xyxy(x):
+    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
+    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
+    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
+    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
+    return y
+def box_iou(box1, box2):
+    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+    """
+    Return intersection-over-union (Jaccard index) of boxes.
+    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Arguments:
+        box1 (Tensor[N, 4])
+        box2 (Tensor[M, 4])
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix containing the pairwise
+            IoU values for every element in boxes1 and boxes2
+    """
+    def box_area(box):
+        # box = 4xn
+        return (box[2] - box[0]) * (box[3] - box[1])
+    area1 = box_area(box1.T)
+    area2 = box_area(box2.T)
+    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
+    inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) -
+             torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
+    # iou = inter / (area1 + area2 - inter)
+    return inter / (area1[:, None] + area2 - inter)
+def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
+    # Rescale coords (xyxy) from img1_shape to img0_shape
+    if ratio_pad is None:  # calculate from img0_shape
+        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain  = old / new
+        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
+    else:
+        gain = ratio_pad[0][0]
+        pad = ratio_pad[1]
+    coords[:, [0, 2]] -= pad[0]  # x padding
+    coords[:, [1, 3]] -= pad[1]  # y padding
+    coords[:, :4] /= gain
+    clip_coords(coords, img0_shape)
+    return coords
+def clip_coords(boxes, img_shape):
+    # Clip bounding xyxy bounding boxes to image shape (height, width)
+    boxes[:, 0].clamp_(0, img_shape[1])  # x1
+    boxes[:, 1].clamp_(0, img_shape[0])  # y1
+    boxes[:, 2].clamp_(0, img_shape[1])  # x2
+    boxes[:, 3].clamp_(0, img_shape[0])  # y2
+def scale_coords_landmarks(img1_shape, coords, img0_shape, ratio_pad=None):
+    # Rescale coords (xyxy) from img1_shape to img0_shape
+    if ratio_pad is None:  # calculate from img0_shape
+        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain  = old / new
+        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
+    else:
+        gain = ratio_pad[0][0]
+        pad = ratio_pad[1]
+    coords[:, [0, 2, 4, 6, 8]] -= pad[0]  # x padding
+    coords[:, [1, 3, 5, 7, 9]] -= pad[1]  # y padding
+    coords[:, :10] /= gain
+    #clip_coords(coords, img0_shape)
+    coords[:, 0].clamp_(0, img0_shape[1])  # x1
+    coords[:, 1].clamp_(0, img0_shape[0])  # y1
+    coords[:, 2].clamp_(0, img0_shape[1])  # x2
+    coords[:, 3].clamp_(0, img0_shape[0])  # y2
+    coords[:, 4].clamp_(0, img0_shape[1])  # x3
+    coords[:, 5].clamp_(0, img0_shape[0])  # y3
+    coords[:, 6].clamp_(0, img0_shape[1])  # x4
+    coords[:, 7].clamp_(0, img0_shape[0])  # y4
+    coords[:, 8].clamp_(0, img0_shape[1])  # x5
+    coords[:, 9].clamp_(0, img0_shape[0])  # y5
+    return coords
+def show_results(img, xywh, conf, landmarks, class_num):
+    h,w,c = img.shape
+    tl = 1 or round(0.002 * (h + w) / 2) + 1  # line/font thickness
+    x1 = int(xywh[0] * w - 0.5 * xywh[2] * w)
+    y1 = int(xywh[1] * h - 0.5 * xywh[3] * h)
+    x2 = int(xywh[0] * w + 0.5 * xywh[2] * w)
+    y2 = int(xywh[1] * h + 0.5 * xywh[3] * h)
+    cv2.rectangle(img, (x1,y1), (x2, y2), (0,255,0), thickness=tl, lineType=cv2.LINE_AA)
+    clors = [(255,0,0),(0,255,0),(0,0,255),(255,255,0),(0,255,255)]
+    for i in range(5):
+        point_x = int(landmarks[2 * i] * w)
+        point_y = int(landmarks[2 * i + 1] * h)
+        cv2.circle(img, (point_x, point_y), tl+1, clors[i], -1)
+    tf = max(tl - 1, 1)  # font thickness
+    label = str(conf)[:5]
+    cv2.putText(img, label, (x1, y1 - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
+    return img
+def make_divisible(x, divisor):
+    # Returns x evenly divisible by divisor
+    return (x // divisor) * divisor
+def non_max_suppression_face(prediction, conf_thres=0.5, iou_thres=0.45, classes=None, agnostic=False, labels=()):
+    """Performs Non-Maximum Suppression (NMS) on face-detection inference results.
+    Each prediction row is read as: box (columns 0-3, centre x/y + width/height),
+    objectness (column 4), ten landmark values (columns 5-14), class scores (column 15 on).
+    Arguments:
+        prediction: tensor indexed as (image, candidate, values).
+        conf_thres: minimum objectness and minimum final confidence to keep a candidate.
+        iou_thres: IoU threshold passed to torchvision NMS.
+        classes: optional class ids to keep (see the review note in the body).
+        agnostic: when true, NMS ignores the class of a box.
+        labels: optional per-image a-priori labels appended as extra candidates.
+    Returns:
+         list with one tensor per image, each of shape nx16:
+         (x1, y1, x2, y2, conf, 10 landmark values, cls)
+    """
+    nc = prediction.shape[2] - 15  # number of classes
+    xc = prediction[..., 4] > conf_thres  # candidates
+    # Settings
+    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height; only max_wh is used below
+    # time_limit = 10.0  # seconds to quit after
+    redundant = True  # require redundant detections
+    multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)
+    merge = False  # use merge-NMS (disabled, so the merge branch below never runs)
+    # t = time.time()
+    # Default result for every image: an empty (0, 16) tensor (the same object is shared until replaced).
+    output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]
+    for xi, x in enumerate(prediction):  # image index, image inference
+        # Apply constraints
+        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
+        x = x[xc[xi]]  # confidence
+        # Cat apriori labels if autolabelling
+        if labels and len(labels[xi]):
+            l = labels[xi]
+            v = torch.zeros((len(l), nc + 15), device=x.device)
+            v[:, :4] = l[:, 1:5]  # box
+            v[:, 4] = 1.0  # conf
+            v[range(len(l)), l[:, 0].long() + 15] = 1.0  # cls
+            x = torch.cat((x, v), 0)
+        # If none remain process next image
+        if not x.shape[0]:
+            continue
+        # Compute conf
+        x[:, 15:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
+        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
+        box = xywh2xyxy(x[:, :4])
+        # Detections matrix nx16 (xyxy, conf, landmarks, cls)
+        if multi_label:
+            i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
+            x = torch.cat((box[i], x[i, j + 15, None], x[i, 5:15] ,j[:, None].float()), 1)
+        else:  # best class only
+            conf, j = x[:, 15:].max(1, keepdim=True)
+            x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]
+        # Filter by class
+        # NOTE(review): after the re-pack above the class id sits in column 15 (it is read from
+        # there for the NMS offset below), so column 5 here looks like a landmark value -
+        # confirm before passing `classes`.
+        if classes is not None:
+            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+        # If none remain process next image
+        n = x.shape[0]  # number of boxes
+        if not n:
+            continue
+        # Batched NMS: offset boxes by class id * max_wh so boxes of different classes never overlap
+        c = x[:, 15:16] * (0 if agnostic else max_wh)  # classes
+        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
+        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
+        #if i.shape[0] > max_det:  # limit detections
+        #    i = i[:max_det]
+        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
+            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
+            weights = iou * scores[None]  # box weights
+            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
+            if redundant:
+                i = i[iou.sum(1) > 1]  # require redundancy
+        output[xi] = x[i]
+        # if (time.time() - t) > time_limit:
+        #     break  # time limit exceeded
+    return output
+class DetFace():
+    def __init__(self, pt_path, confThreshold=0.5, nmsThreshold=0.45, device='cuda'):
+        assert os.path.exists(pt_path)
+        self.inpSize = 416
+        self.conf_thres = confThreshold
+        self.iou_thres = nmsThreshold
+        self.test_device = torch.device(device if torch.cuda.is_available() else "cpu")
+        self.model = torch.jit.load(pt_path).to(self.test_device)
+        self.last_w = 416
+        self.last_h = 416
+        self.grids = None
+    @torch.no_grad()
+    def detect(self, srcimg):
+        # t0=time.time()
+        h0, w0 = srcimg.shape[:2]  # orig hw
+        r = self.inpSize / min(h0, w0)  # resize image to img_size
+        h1 = int(h0*r+31)//32*32
+        w1 = int(w0*r+31)//32*32
+        img = cv2.resize(srcimg, (w1,h1), interpolation=cv2.INTER_LINEAR)
+        # Convert
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # BGR to RGB
+        # Run inference
+        img = torch.from_numpy(img).to(self.test_device).permute(2,0,1)
+        img = img.float()/255  # uint8 to fp16/32  0-1
+        if img.ndimension() == 3:
+            img = img.unsqueeze(0)
+        # Inference
+        if h1 != self.last_h or w1 != self.last_w or self.grids is None:
+            grids = []
+            for scale in [8,16,32]:
+                ny = h1//scale
+                nx = w1//scale
+                yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
+                grid = torch.stack((xv, yv), 2).view((1,1,ny, nx, 2)).float()
+                grids.append(grid.to(self.test_device))
+            self.grids = grids
+            self.last_w = w1
+            self.last_h = h1
+        pred = self.model(img, self.grids).cpu()
+        # Apply NMS
+        det = non_max_suppression_face(pred, self.conf_thres, self.iou_thres)[0]
+        # Process detections
+        # det = pred[0]
+        bboxes = np.zeros((det.shape[0], 4))
+        kpss = np.zeros((det.shape[0], 5, 2))
+        scores = np.zeros((det.shape[0]))
+        # gn = torch.tensor([w0, h0, w0, h0]).to(pred)  # normalization gain whwh
+        # gn_lks = torch.tensor([w0, h0, w0, h0, w0, h0, w0, h0, w0, h0]).to(pred)  # normalization gain landmarks
+        det = det.cpu().numpy()
+        for j in range(det.shape[0]):
+            # xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(4).cpu().numpy()
+            bboxes[j, 0] = det[j, 0] * w0/w1
+            bboxes[j, 1] = det[j, 1] * h0/h1
+            bboxes[j, 2] = det[j, 2] * w0/w1 - bboxes[j, 0]
+            bboxes[j, 3] = det[j, 3] * h0/h1 - bboxes[j, 1]
+            scores[j] = det[j, 4]
+            # landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(5,2).cpu().numpy()
+            kpss[j, :, :] = det[j, 5:15].reshape(5, 2) * np.array([[w0/w1,h0/h1]])
+                # class_num = det[j, 15].cpu().numpy()
+                # orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
+        return bboxes, kpss, scores

hyvideo/diffusion/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .pipelines import HunyuanVideoPipeline
2	+ from .schedulers import FlowMatchDiscreteScheduler

hyvideo/diffusion/pipelines/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .pipeline_hunyuan_video import HunyuanVideoPipeline
2	+ from .pipeline_hunyuan_video_audio import HunyuanVideoAudioPipeline

hyvideo/diffusion/pipelines/pipeline_hunyuan_video.py ADDED Viewed

	@@ -0,0 +1,1438 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+import torch
+import torch.distributed as dist
+import numpy as np
+from dataclasses import dataclass
+from packaging import version
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils import BaseOutput
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from ...constants import PRECISION_TO_TYPE
+from ...vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+from ...text_encoder import TextEncoder
+from ...modules import HYVideoDiffusionTransformer
+from mmgp import offload
+from ...utils.data_utils import black_image
+from einops import rearrange
+EXAMPLE_DOC_STRING = """"""
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(
+        dim=list(range(1, noise_pred_text.ndim)), keepdim=True
+    )
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = (
+        guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    )
+    return noise_cfg
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError(
+            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
+        )
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(
+            inspect.signature(scheduler.set_timesteps).parameters.keys()
+        )
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(
+            inspect.signature(scheduler.set_timesteps).parameters.keys()
+        )
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+@dataclass
+class HunyuanVideoPipelineOutput(BaseOutput):
+    """Output container with a single `videos` field.
+    Presumably returned by `HunyuanVideoPipeline.__call__` (the producing code is outside this
+    view -- confirm there which of the two types is actually used).
+    """
+    # Video data, either a torch tensor or a numpy array.
+    videos: Union[torch.Tensor, np.ndarray]
+class HunyuanVideoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-video generation using HunyuanVideo.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`TextEncoder`]):
+            Frozen text-encoder.
+        text_encoder_2 ([`TextEncoder`]):
+            Frozen text-encoder_2.
+        transformer ([`HYVideoDiffusionTransformer`]):
+            A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = ["text_encoder_2"]
+    _exclude_from_cpu_offload = ["transformer"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: TextEncoder,
+        transformer: HYVideoDiffusionTransformer,
+        scheduler: KarrasDiffusionSchedulers,
+        text_encoder_2: Optional[TextEncoder] = None,
+        progress_bar_config: Dict[str, Any] = None,
+        args=None,
+    ):
+        """Register the pipeline components and normalise the scheduler config.
+        Args:
+            vae: VAE whose `config.block_out_channels` determines `self.vae_scale_factor`.
+            text_encoder: Primary text encoder.
+            transformer: Denoising transformer.
+            scheduler: Scheduler; its config is patched in place when `steps_offset != 1` or
+                `clip_sample is True` (a deprecation warning is emitted in both cases).
+            text_encoder_2: Optional secondary text encoder.
+            progress_bar_config: Optional entries merged into `self._progress_bar_config`.
+            args: Opaque object stored unchanged on `self.args`.
+        """
+        super().__init__()
+        # ==========================================================================================
+        # Merge caller-supplied progress-bar options into the (possibly pre-existing) config dict.
+        if progress_bar_config is None:
+            progress_bar_config = {}
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        self._progress_bar_config.update(progress_bar_config)
+        self.args = args
+        # ==========================================================================================
+        # Outdated scheduler config: warn, then force `steps_offset` to 1.
+        if (
+            hasattr(scheduler.config, "steps_offset")
+            and scheduler.config.steps_offset != 1
+        ):
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate(
+                "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
+            )
+            # The config is frozen, so rebuild it and swap the scheduler's internal dict.
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+        # Same treatment for `clip_sample`: warn, then force it to False.
+        if (
+            hasattr(scheduler.config, "clip_sample")
+            and scheduler.config.clip_sample is True
+        ):
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate(
+                "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
+            )
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder_2=text_encoder_2,
+        )
+        # Scale factor is 2 ** (number of VAE blocks - 1); `prepare_latents` divides height/width by it.
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # NOTE(review): `noise_pertub` is initialised to 0 here; its consumer is outside this view.
+        self.noise_pertub = 0
+    def encode_prompt(
+        self,
+        prompt,
+        name,
+        device,
+        num_videos_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        pixel_value_llava: Optional[torch.Tensor] = None,
+        uncond_pixel_value_llava: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+        text_encoder: Optional[TextEncoder] = None,
+        data_type: Optional[str] = "image",
+        semantic_images=None
+    ):
+        r"""
+        Encodes the prompt (and, for classifier-free guidance, the negative prompt) into text encoder
+        hidden states, repeated `num_videos_per_prompt` times per prompt.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded. May be `None` when `prompt_embeds` is supplied.
+            name:
+                Forwarded unchanged to `text_encoder.text2tokens(..., name=name)`.
+            device: (`torch.device`):
+                torch device
+            num_videos_per_prompt (`int`):
+                number of videos that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava. When given, it is attached to the tokenized inputs and the
+                attention mask is extended by `575 * len(pixel_value_llava)` ones.
+            uncond_pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava used on the negative branch.  Ignored when not using guidance (i.e.,
+                ignored if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Mask returned alongside `prompt_embeds`. Overwritten by the encoder's mask whenever the
+                embeddings are computed here.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Mask returned alongside `negative_prompt_embeds`. Overwritten when the negative embeddings
+                are computed here.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            text_encoder (TextEncoder, *optional*):
+                Encoder to use; defaults to `self.text_encoder`.
+            data_type (`str`, *optional*):
+                Forwarded to `text_encoder.text2tokens` and `text_encoder.encode`.
+            semantic_images (*optional*):
+                Forwarded to `text_encoder.encode` for the positive prompt. For the negative prompt each
+                image is replaced by `black_image(img.size[0], img.size[1])`.
+        Returns:
+            `tuple`: `(prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask)`.
+            The negative entries are returned as passed in when guidance is disabled.
+        """
+        if text_encoder is None:
+            text_encoder = self.text_encoder
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
+            else:
+                scale_lora_layers(text_encoder.model, lora_scale)
+        # Batch size comes from the prompt when given, otherwise from the precomputed embeddings.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
+            text_inputs = text_encoder.text2tokens(prompt, data_type=data_type, name = name)
+            if pixel_value_llava is not None:
+                text_inputs['pixel_value_llava'] = pixel_value_llava
+                # NOTE(review): 575 is presumably the number of extra token positions per llava image, and the
+                # leading dimension of 1 assumes a single-row mask -- confirm against the text encoder.
+                text_inputs['attention_mask'] = torch.cat([text_inputs['attention_mask'], torch.ones((1, 575 * len(pixel_value_llava))).to(text_inputs['attention_mask'])], dim=1)
+            if clip_skip is None:
+                prompt_outputs = text_encoder.encode(
+                    text_inputs, data_type=data_type, semantic_images=semantic_images, device=device
+                )
+                prompt_embeds = prompt_outputs.hidden_state
+            else:
+                prompt_outputs = text_encoder.encode(
+                    text_inputs,
+                    output_hidden_states=True,
+                    data_type=data_type,
+                    semantic_images=semantic_images,
+                    device=device,
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = text_encoder.model.text_model.final_layer_norm(
+                    prompt_embeds
+                )
+            attention_mask = prompt_outputs.attention_mask
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(device)
+                bs_embed, seq_len = attention_mask.shape
+                # Tile along the sequence axis, then fold the copies into the batch axis.
+                attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
+                attention_mask = attention_mask.view(
+                    bs_embed * num_videos_per_prompt, seq_len
+                )
+        # Pick the dtype to cast embeddings to: text encoder first, then transformer, else keep as is.
+        if text_encoder is not None:
+            prompt_embeds_dtype = text_encoder.dtype
+        elif self.transformer is not None:
+            prompt_embeds_dtype = self.transformer.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+        # Embeddings may be 2-D (one vector per prompt) or 3-D (one vector per token).
+        if prompt_embeds.ndim == 2:
+            bs_embed, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
+        else:
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            prompt_embeds = prompt_embeds.view(
+                bs_embed * num_videos_per_prompt, seq_len, -1
+            )
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                # No negative prompt: use one empty string per batch entry.
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(
+                    uncond_tokens, text_encoder.tokenizer
+                )
+            # max_length = prompt_embeds.shape[1]
+            uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type, name = name)
+            # The negative branch sees black placeholder images of the same size as the semantic images.
+            if semantic_images is not None:
+                uncond_image = [black_image(img.size[0], img.size[1]) for img in semantic_images]
+            else:
+                uncond_image = None
+            if uncond_pixel_value_llava is not None:
+                uncond_input['pixel_value_llava'] = uncond_pixel_value_llava
+                # Same mask extension as on the positive branch (575 positions per image).
+                uncond_input['attention_mask'] = torch.cat([uncond_input['attention_mask'], torch.ones((1, 575 * len(uncond_pixel_value_llava))).to(uncond_input['attention_mask'])], dim=1)
+            # NOTE(review): `clip_skip` is not applied on the negative branch; the last hidden state is always used.
+            negative_prompt_outputs = text_encoder.encode(
+                uncond_input, data_type=data_type, semantic_images=uncond_image, device=device
+            )
+            negative_prompt_embeds = negative_prompt_outputs.hidden_state
+            negative_attention_mask = negative_prompt_outputs.attention_mask
+            if negative_attention_mask is not None:
+                negative_attention_mask = negative_attention_mask.to(device)
+                _, seq_len = negative_attention_mask.shape
+                negative_attention_mask = negative_attention_mask.repeat(
+                    1, num_videos_per_prompt
+                )
+                negative_attention_mask = negative_attention_mask.view(
+                    batch_size * num_videos_per_prompt, seq_len
+                )
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=prompt_embeds_dtype, device=device
+            )
+            if negative_prompt_embeds.ndim == 2:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(
+                    1, num_videos_per_prompt
+                )
+                negative_prompt_embeds = negative_prompt_embeds.view(
+                    batch_size * num_videos_per_prompt, -1
+                )
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(
+                    1, num_videos_per_prompt, 1
+                )
+                negative_prompt_embeds = negative_prompt_embeds.view(
+                    batch_size * num_videos_per_prompt, seq_len, -1
+                )
+        if text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(text_encoder.model, lora_scale)
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            attention_mask,
+            negative_attention_mask,
+        )
+    def decode_latents(self, latents, enable_tiling=True):
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+        latents = 1 / self.vae.config.scaling_factor * latents
+        if enable_tiling:
+            self.vae.enable_tiling()
+            image = self.vae.decode(latents, return_dict=False)[0]
+        else:
+            image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        if image.ndim == 4:
+            image = image.cpu().permute(0, 2, 3, 1).float()
+        else:
+            image = image.cpu().float()
+        return image
+    def prepare_extra_func_kwargs(self, func, kwargs):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        extra_step_kwargs = {}
+        for k, v in kwargs.items():
+            accepts = k in set(inspect.signature(func).parameters.keys())
+            if accepts:
+                extra_step_kwargs[k] = v
+        return extra_step_kwargs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        video_length,
+        callback_steps,
+        pixel_value_llava=None,
+        uncond_pixel_value_llava=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        vae_ver="88-4c-sd",
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+        if video_length is not None:
+            if "884" in vae_ver:
+                if video_length != 1 and (video_length - 1) % 4 != 0:
+                    raise ValueError(
+                        f"`video_length` has to be 1 or a multiple of 4 but is {video_length}."
+                    )
+            elif "888" in vae_ver:
+                if video_length != 1 and (video_length - 1) % 8 != 0:
+                    raise ValueError(
+                        f"`video_length` has to be 1 or a multiple of 8 but is {video_length}."
+                    )
+        if callback_steps is not None and (
+            not isinstance(callback_steps, int) or callback_steps <= 0
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs
+            for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (
+            not isinstance(prompt, str) and not isinstance(prompt, list)
+        ):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if pixel_value_llava is not None and uncond_pixel_value_llava is not None:
+            if len(pixel_value_llava) != len(uncond_pixel_value_llava):
+                raise ValueError(
+                    "`pixel_value_llava` and `uncond_pixel_value_llava` must have the same length when passed directly, but"
+                    f" got: `pixel_value_llava` {len(pixel_value_llava)} != `uncond_pixel_value_llava`"
+                    f" {len(uncond_pixel_value_llava)}."
+                )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+        return timesteps.to(device), num_inference_steps - t_start
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        num_inference_steps,
+        height,
+        width,
+        video_length,
+        dtype,
+        device,
+        timesteps,
+        generator,
+        latents=None,
+        denoise_strength=1.0,
+        img_latents=None,
+        i2v_mode=False,
+        i2v_condition_type=None,
+        i2v_stability=True,
+    ):
+        """Build the initial latents (and matching noise / timesteps) for the denoising loop.
+        Args:
+            batch_size: Leading dimension of the latent tensor.
+            num_channels_latents: Latent channel count; reduced to `(n - 1) // 2` when `i2v_mode` is
+                set and `i2v_condition_type == "latent_concat"`.
+            num_inference_steps: Forwarded to `get_timesteps` when `denoise_strength != 0`.
+            height, width: Divided by `self.vae_scale_factor` to get the latent spatial size.
+            video_length: Used directly as the size of dim 2 of the latent shape (presumably already
+                expressed in latent frames -- confirm at the call site).
+            dtype, device: dtype / device of the sampled noise.
+            timesteps: Returned unchanged when `denoise_strength == 0`, otherwise replaced by the
+                slice from `get_timesteps`.
+            generator: A generator or a list of `batch_size` generators for `randn_tensor`.
+            latents: Optional starting latents. Overwritten when `i2v_mode and i2v_stability`.
+            denoise_strength: `0` skips noising entirely; any other value is passed to `get_timesteps`
+                as the strength.
+            img_latents: Image latents, read when `i2v_mode and i2v_stability` (must not be `None` then).
+            i2v_mode, i2v_condition_type, i2v_stability: Image-to-video switches, see above.
+        Returns:
+            `(latents, original_latents, noise, timesteps)`. `original_latents` is a clone of the
+            frame-adjusted input latents taken before noising, or `None` when none were supplied or
+            `denoise_strength == 0`; `noise` is `None` when `denoise_strength == 0`.
+        Raises:
+            ValueError: if `generator` is a list whose length differs from `batch_size`.
+        """
+        if i2v_mode and i2v_condition_type == "latent_concat":
+            num_channels_latents = (num_channels_latents - 1) // 2
+        shape = (
+            batch_size,
+            num_channels_latents,
+            video_length,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if i2v_mode and i2v_stability:
+            # A single-frame image latent is repeated along dim 2 to cover the whole clip.
+            if img_latents.shape[2] == 1:
+                img_latents = img_latents.repeat(1, 1, video_length, 1, 1)
+            x0 = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            x1 = img_latents
+            # Start from almost pure noise: 0.999 * noise + 0.001 * image latents.
+            t = torch.tensor([0.999]).to(device=device)
+            latents = x0 * t + x1 * (1 - t)
+            latents = latents.to(dtype=dtype)
+        if denoise_strength == 0:
+            if latents is None:
+                latents = randn_tensor(
+                    shape, generator=generator, device=device, dtype=dtype
+                )
+            else:
+                latents = latents.to(device)
+            original_latents = None
+            noise = None
+            # NOTE(review): no-op self-assignment; timesteps pass through unchanged on this branch.
+            timesteps = timesteps
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # The updated `num_inference_steps` is only rebound locally; it is not returned.
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
+            if latents is None:
+                latents = noise
+                original_latents = None
+            else:
+                latents = latents.to(device)
+                latent_timestep = timesteps[:1]
+                frames_needed = noise.shape[2]
+                current_frames = latents.shape[2]
+                if frames_needed > current_frames:
+                    # Too few frames: prepend random frames so dim 2 matches the noise tensor.
+                    repeat_factor = frames_needed - current_frames
+                    # NOTE(review): `torch.randn` here does not receive `generator`, so the padding frames
+                    # are not controlled by the caller-supplied generator.
+                    additional_frame = torch.randn((latents.size(0), latents.size(1),repeat_factor, latents.size(3), latents.size(4)), dtype=latents.dtype, device=latents.device)
+                    latents = torch.cat((additional_frame, latents), dim=2)
+                    self.additional_frames = repeat_factor
+                elif frames_needed < current_frames:
+                    # Too many frames: keep only the first `frames_needed`.
+                    latents = latents[:, :, :frames_needed, :, :]
+                original_latents = latents.clone()
+                # Linear blend between the input latents and noise, weighted by first timestep / 1000.
+                latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
+                # NOTE(review): unconditional debug output on stdout.
+                print(f'debug:latent_timestep={latent_timestep}, latents-size={latents.shape}')
+        # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
+        if hasattr(self.scheduler, "init_noise_sigma"):
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+        return latents, original_latents, noise, timesteps
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self,
+        w: torch.Tensor,
+        embedding_dim: int = 512,
+        dtype: torch.dtype = torch.float32,
+    ) -> torch.Tensor:
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+        Args:
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+    @property
+    def guidance_scale(self):
+        """Read-only view of `self._guidance_scale`."""
+        return self._guidance_scale
+    @property
+    def guidance_rescale(self):
+        """Read-only view of `self._guidance_rescale`."""
+        return self._guidance_rescale
+    @property
+    def clip_skip(self):
+        """Read-only view of `self._clip_skip`."""
+        return self._clip_skip
+    # Here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier-free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        """Return whether classifier-free guidance is active, i.e. the stored guidance scale is greater than 1."""
+        # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
+        return self._guidance_scale > 1
+    @property
+    def cross_attention_kwargs(self):
+        """Return the `cross_attention_kwargs` dict stored by `__call__` (may be `None`)."""
+        return self._cross_attention_kwargs
+    @property
+    def num_timesteps(self):
+        """Return the number of timesteps of the current/last run, set by `__call__` to `len(timesteps)`."""
+        return self._num_timesteps
+    @property
+    def interrupt(self):
+        """Return the interruption flag; `__call__` returns `[None]` when it is set."""
+        # NOTE(review): the code that sets `_interrupt` is not in this part of the file - confirm who toggles it.
+        return self._interrupt
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int,
+        width: int,
+        video_length: int,
+        name: Union[str, List[str]] = None,
+        data_type: str = "video",
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        pixel_value_ref=None,
+        # ref_latents: Optional[torch.Tensor] = None,
+        # uncond_ref_latents: Optional[torch.Tensor] = None,
+        pixel_value_llava: Optional[torch.Tensor] = None,
+        uncond_pixel_value_llava: Optional[torch.Tensor] = None,
+        bg_latents: Optional[torch.Tensor] = None,
+        audio_prompts: Optional[torch.Tensor] = None,
+        ip_cfg_scale: float = 0.0,
+        audio_strength: float = 1.0,
+        use_deepcache: int = 1,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[
+                Callable[[int, int, Dict], None],
+                PipelineCallback,
+                MultiPipelineCallbacks,
+            ]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+        vae_ver: str = "88-4c-sd",
+        enable_tiling: bool = False,
+        n_tokens: Optional[int] = None,
+        video_val_flag: bool=False,
+        denoise_strength: float = 1.0,
+        mask = None,
+        embedded_guidance_scale: Optional[float] = None,
+        i2v_mode: bool = False,
+        i2v_condition_type: str = None,
+        i2v_stability: bool = True,
+        img_latents: Optional[torch.Tensor] = None,
+        semantic_images=None,
+        joint_pass = False,
+        cfg_star_rescale = False,
+        callback = None,
+        **kwargs,
+    ):
+        r"""
+        Run generation: encode the prompt(s), prepare latents, run the denoising loop and decode with the VAE.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`):
+                The height in pixels of the generated video.
+            width (`int`):
+                The width in pixels of the generated video.
+            video_length (`int`):
+                The number of frames in the generated video. When `vae_ver` contains `"884"` or `"888"` it is
+                converted to the latent temporal length `(video_length - 1) // 4 + 1` or
+                `(video_length - 1) // 8 + 1` respectively.
+            name (`str` or `List[str]`, *optional*):
+                Forwarded to `encode_prompt`.
+            data_type (`str`, *optional*, defaults to `"video"`):
+                Forwarded to `encode_prompt`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to higher quality at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Classifier-free guidance weight. Guidance is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead.
+            pixel_value_ref (`torch.Tensor`, *optional*):
+                Reference image batch, rearranged here as `b c h w`. It is rescaled with `x * 2 - 1`
+                (presumably from the [0, 1] range - confirm with the caller), encoded by the VAE and passed to
+                the transformer as `ref_latents`. An all-ones image is encoded as the unconditional reference.
+                Required when `ip_cfg_scale > 0`.
+            pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava, forwarded to `encode_prompt`.
+            uncond_pixel_value_llava (`torch.Tensor`, *optional*):
+                The unconditional image tensor for llava, forwarded to `encode_prompt`.
+            bg_latents (`torch.Tensor`, *optional*):
+                Passed to the transformer; duplicated along the batch dimension under classifier-free guidance.
+            audio_prompts (`torch.Tensor`, *optional*):
+                Passed to the transformer; under classifier-free guidance a zeroed copy is prepended for the
+                unconditional branch.
+            ip_cfg_scale (`float`, *optional*, defaults to 0.0):
+                When greater than 0, a third (reference-image) branch is evaluated and mixed in with a weight
+                that starts at `ip_cfg_scale` and decreases linearly towards 1.0 over the timesteps.
+            audio_strength (`float`, *optional*, defaults to 1.0):
+                Forwarded to the transformer.
+            use_deepcache (`int`, *optional*, defaults to 1):
+                Not used by this method.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Offered to `scheduler.step` through `prepare_extra_func_kwargs`.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated latents forwarded to `prepare_latents`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. If not provided, they are generated from `prompt`.
+            attention_mask (`torch.Tensor`, *optional*):
+                Attention mask matching `prompt_embeds`, forwarded to `encode_prompt`.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. If not provided, they are generated from `negative_prompt`.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask matching `negative_prompt_embeds`, forwarded to `encode_prompt`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                When equal to `"latent"` the denoised latents are returned without VAE decoding; any other value
+                triggers decoding.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`HunyuanVideoPipelineOutput`] instead of the bare tensor.
+            cross_attention_kwargs (`dict`, *optional*):
+                Only its `"scale"` entry is read here; it is passed to `encode_prompt` as `lora_scale`.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Applied when greater than 0 and guidance is on.
+            clip_skip (`int`, *optional*):
+                Forwarded to `encode_prompt`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                Only used to derive `callback_on_step_end_tensor_inputs`; it is not invoked by this method.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                Validated by `check_inputs`.
+            freqs_cis (`Tuple[torch.Tensor, torch.Tensor]`):
+                Pair passed to the transformer as `freqs_cos` and `freqs_sin`.
+            vae_ver (`str`, *optional*, defaults to `"88-4c-sd"`):
+                VAE version tag; see `video_length` for how it affects the temporal length.
+            enable_tiling (`bool`, *optional*, defaults to `False`):
+                When true, `self.vae.enable_tiling()` is called before decoding.
+            n_tokens (`int`, *optional*):
+                Offered to `scheduler.set_timesteps` through `prepare_extra_func_kwargs`.
+            video_val_flag (`bool`, *optional*, defaults to `False`):
+                Not used by this method.
+            denoise_strength (`float`, *optional*, defaults to 1.0):
+                Forwarded to `prepare_latents`.
+            mask (`torch.Tensor`, *optional*):
+                Indexed here as `(frames, height, width, channels)`; only the first channel is kept. It is
+                resized to `((frames - 1) // 4 + 1, height // 8, width // 8)` and used to blend the generated
+                latents with the (re-noised) original latents.
+            embedded_guidance_scale (`float`, *optional*):
+                When set, a tensor filled with this value times 1000 is passed to the transformer as `guidance`.
+            i2v_mode (`bool`, *optional*, defaults to `False`):
+                Enables image-to-video conditioning with `img_latents`.
+            i2v_condition_type (`str`, *optional*):
+                `"token_replace"` re-inserts `img_latents` as the first latent frame at every step;
+                `"latent_concat"` concatenates the image latents and a first-frame mask on the channel dimension.
+            i2v_stability (`bool`, *optional*, defaults to `True`):
+                Forwarded to `prepare_latents`.
+            img_latents (`torch.Tensor`, *optional*):
+                Image latents used by the image-to-video modes. They are not modified by this method.
+            semantic_images (*optional*):
+                Forwarded to `encode_prompt`.
+            joint_pass (`bool`, *optional*, defaults to `False`):
+                When true, all guidance branches are evaluated in one batched transformer call; otherwise the
+                transformer is called once per branch.
+            cfg_star_rescale (`bool`, *optional*, defaults to `False`):
+                When true, the unconditional prediction is scaled by its projection coefficient onto the
+                conditional prediction before guidance is applied.
+            callback (`Callable`, *optional*):
+                Called as `callback(-1, None, True)` before the loop and `callback(i, latents.squeeze(0), False)`
+                after every step; it is also passed to the transformer.
+        Examples:
+        Returns:
+            [`~HunyuanVideoPipelineOutput`], `torch.Tensor` or `list`:
+                `[None]` if the pipeline was interrupted. Otherwise the result tensor (moved to CPU and cast to
+                float32) wrapped in [`HunyuanVideoPipelineOutput`] when `return_dict` is `True`, or the bare
+                tensor when it is `False`.
+        Raises:
+            ValueError: If `ip_cfg_scale > 0` without `pixel_value_ref`, or if the final latents are neither
+                4-D nor 5-D.
+        """
+        callback_steps = kwargs.pop("callback_steps", None)
+        if self._interrupt:
+            return [None]
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+        # Encode the optional reference image (and an all-ones "unconditional" image) into scaled VAE latents.
+        if pixel_value_ref is not None:
+            pixel_value_ref = pixel_value_ref * 2 - 1.
+            pixel_value_ref_for_vae = rearrange(pixel_value_ref,"b c h w -> b c 1 h w")
+            ref_latents = self.vae.encode(pixel_value_ref_for_vae.clone()).latent_dist.sample()
+            uncond_ref_latents = self.vae.encode(torch.ones_like(pixel_value_ref_for_vae)).latent_dist.sample()
+            ref_latents.mul_(self.vae.config.scaling_factor)
+            uncond_ref_latents.mul_(self.vae.config.scaling_factor)
+        else:
+            ref_latents = None
+            uncond_ref_latents = None
+        # Fail early with a clear message instead of a TypeError on `None[1:]` further down.
+        if ip_cfg_scale > 0 and ref_latents is None:
+            raise ValueError("`ip_cfg_scale > 0` requires `pixel_value_ref` to build the reference latents.")
+        # Reset the TeaCache accumulators for this run.
+        trans = self.transformer
+        if trans.enable_teacache:
+            teacache_multiplier = trans.teacache_multiplier
+            trans.accumulated_rel_l1_distance = 0
+            trans.rel_l1_thresh = 0.1 if teacache_multiplier < 2 else 0.15
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            video_length,
+            callback_steps,
+            negative_prompt,
+            pixel_value_llava,
+            uncond_pixel_value_llava,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            vae_ver=vae_ver,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = torch.device(f"cuda:{dist.get_rank()}") if dist.is_initialized() else self._execution_device
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None)
+            if self.cross_attention_kwargs is not None
+            else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_mask,
+            negative_prompt_mask,
+        ) = self.encode_prompt(
+            prompt,
+            name,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            pixel_value_llava=pixel_value_llava,
+            uncond_pixel_value_llava=uncond_pixel_value_llava,
+            prompt_embeds=prompt_embeds,
+            attention_mask=attention_mask,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_attention_mask=negative_attention_mask,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+            data_type=data_type,
+            semantic_images=semantic_images
+        )
+        if self.text_encoder_2 is not None:
+            (
+                prompt_embeds_2,
+                negative_prompt_embeds_2,
+                prompt_mask_2,
+                negative_prompt_mask_2,
+            ) = self.encode_prompt(
+                prompt,
+                name,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=None,
+                attention_mask=None,
+                negative_prompt_embeds=None,
+                negative_attention_mask=None,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+                text_encoder=self.text_encoder_2,
+                data_type=data_type,
+            )
+        else:
+            prompt_embeds_2 = None
+            negative_prompt_embeds_2 = None
+            prompt_mask_2 = None
+            negative_prompt_mask_2 = None
+        # For classifier free guidance the unconditional and text embeddings are stacked into a single batch
+        # ordered [unconditional, conditional].
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            if prompt_mask is not None:
+                prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
+            if prompt_embeds_2 is not None:
+                prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            if prompt_mask_2 is not None:
+                prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
+            if ref_latents is not None:
+                ref_latents = torch.cat([ref_latents, ref_latents], dim=0)
+                # When the unconditional mask has more than 575 active tokens, rewrite it so that only
+                # `sum - 575` leading tokens stay active.
+                if prompt_mask is not None and prompt_mask[0].sum() > 575:
+                    prompt_mask[0] = torch.cat([torch.ones((1, prompt_mask[0].sum() - 575)).to(prompt_mask),
+                                                torch.zeros((1, prompt_mask.shape[1] - prompt_mask[0].sum() + 575)).to(prompt_mask)], dim=1)
+            if bg_latents is not None:
+                bg_latents = torch.cat([bg_latents, bg_latents], dim=0)
+            if audio_prompts is not None:
+                audio_prompts = torch.cat([torch.zeros_like(audio_prompts), audio_prompts], dim=0)
+        # Append a third branch that repeats the conditional text inputs but uses the real reference latents,
+        # while the first two branches fall back to the unconditional reference latents.
+        if ip_cfg_scale>0:
+            prompt_embeds = torch.cat([prompt_embeds, prompt_embeds[1:]])
+            if prompt_embeds_2 is not None:
+                prompt_embeds_2 = torch.cat([prompt_embeds_2, prompt_embeds_2[1:]])
+            if prompt_mask is not None:
+                prompt_mask = torch.cat([prompt_mask, prompt_mask[1:]], dim=0)
+            ref_latents = torch.cat([uncond_ref_latents, uncond_ref_latents, ref_latents[1:]], dim=0)
+        # 4. Prepare timesteps
+        extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
+            self.scheduler.set_timesteps, {"n_tokens": n_tokens}
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            **extra_set_timesteps_kwargs,
+        )
+        # Convert the frame count to the latent temporal length for temporally compressing VAEs.
+        if "884" in vae_ver:
+            video_length = (video_length - 1) // 4 + 1
+        elif "888" in vae_ver:
+            video_length = (video_length - 1) // 8 + 1
+        if self.transformer.mixed_precision:
+            latent_dtype = torch.float32
+        else:
+            latent_dtype = torch.bfloat16
+        if prompt_embeds is not None:
+            prompt_embeds = prompt_embeds.to(torch.bfloat16)
+        if prompt_embeds_2 is not None:
+            prompt_embeds_2 = prompt_embeds_2.to(torch.bfloat16)
+        # 5. Prepare latent variables
+        num_channels_latents  = self.transformer.config.in_channels
+        latents, original_latents, noise, timesteps = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_inference_steps,
+            height,
+            width,
+            video_length,
+            latent_dtype, #prompt_embeds.dtype,
+            device,
+            timesteps,
+            generator,
+            latents,
+            denoise_strength,
+            img_latents=img_latents,
+            i2v_mode=i2v_mode,
+            i2v_condition_type=i2v_condition_type,
+            i2v_stability=i2v_stability
+        )
+        if i2v_mode and i2v_condition_type == "latent_concat":
+            if img_latents.shape[2] == 1:
+                img_latents_concat = img_latents.repeat(1, 1, video_length, 1, 1)
+            else:
+                # Clone so that zeroing the trailing frames does not overwrite the caller's tensor.
+                img_latents_concat = img_latents.clone()
+            img_latents_concat[:, :, 1:, ...] = 0
+            # Single-channel mask that is 1 on the first latent frame and 0 elsewhere.
+            mask_concat = torch.ones(img_latents_concat.shape[0], 1, img_latents_concat.shape[2], img_latents_concat.shape[3],
+                                     img_latents_concat.shape[4]).to(device=img_latents.device)
+            mask_concat[:, :, 1:, ...] = 0
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_func_kwargs(
+            self.scheduler.step,
+            {"generator": generator, "eta": eta},
+        )
+        # Autocast is hard-disabled here; the contexts below are kept so it can be re-enabled in one place.
+        precision = "bf16" # torch.bfloat16
+        disable_autocast =  True
+        target_dtype = PRECISION_TO_TYPE[precision]
+        autocast_enabled = target_dtype != torch.float32 and not disable_autocast
+        vae_dtype = self.vae._model_dtype # PRECISION_TO_TYPE[vae_precision]
+        vae_autocast_enabled = vae_dtype != torch.float32 and not disable_autocast
+        # 7. Denoising loop
+        self._num_timesteps = len(timesteps)
+        # The reference-image guidance weight decays linearly from `ip_cfg_scale` towards 1.0.
+        start_scale = ip_cfg_scale  #  3.0
+        end_scale = 1.0
+        step_scale = (start_scale - end_scale) / (self._num_timesteps - 1 + 1e-3)
+        # Number of guidance branches evaluated per step: unconditional/conditional (+ reference image).
+        latent_items = 2 if self.do_classifier_free_guidance else 1
+        if ip_cfg_scale>0:
+            latent_items += 1
+        # 8. Mask latents
+        mask_latents = None
+        if mask is not None:
+            target_video_length = mask.shape[0]
+            target_height = mask.shape[1]
+            target_width = mask.shape[2]
+            mask_length = (target_video_length - 1) // 4 + 1
+            mask_height = target_height // 8
+            mask_width = target_width // 8
+            mask = mask[...,0:1]
+            mask = mask.unsqueeze(0)
+            mask = rearrange(mask, "b t h w c -> b c t h w")
+            mask_latents = torch.nn.functional.interpolate(mask, size=(mask_length, mask_height, mask_width))
+            mask_latents = mask_latents.to(device)
+        if mask_latents is not None:
+            # Repeat the mask once per guidance branch so it matches the batched model input.
+            mask_latents_model_input = (
+                torch.cat([mask_latents] * latent_items)
+                if latent_items > 1
+                else mask_latents
+            )
+            print(f'maskinfo, mask={mask.shape}, mask_latents_model_input={mask_latents_model_input.shape} ')
+        if callback is not None:
+            callback(-1, None, True)
+        multi_passes_free_guidance = not joint_pass
+        if self.transformer.enable_teacache:
+            self.transformer.previous_residual = [None] * latent_items
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                offload.set_step_no_for_lora(self.transformer, i)
+                if self.interrupt:
+                    continue
+                if i2v_mode and i2v_condition_type == "token_replace":
+                    latents = torch.concat([img_latents, latents[:, :, 1:, :, :]], dim=2)
+                if i2v_mode and i2v_condition_type == "latent_concat":
+                    latent_model_input = torch.concat([latents, img_latents_concat, mask_concat], dim=1)
+                else:
+                    latent_model_input = latents
+                # expand the latents so that there is one copy per guidance branch
+                latent_model_input =  torch.cat([latent_model_input] * latent_items) if latent_items > 1 else latent_model_input
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                if mask_latents is not None:
+                    # Re-noise the original latents to the current timestep and keep them outside the mask.
+                    original_latents_noise = original_latents * (1 - t / 1000.0) + t / 1000.0 * noise
+                    original_latent_noise_model_input = (
+                        torch.cat([original_latents_noise] * latent_items)
+                        if latent_items > 1
+                        else original_latents_noise
+                    )
+                    original_latent_noise_model_input = self.scheduler.scale_model_input(original_latent_noise_model_input, t)
+                    latent_model_input = mask_latents_model_input * latent_model_input + (1 - mask_latents_model_input) * original_latent_noise_model_input
+                t_expand = t.repeat(latent_model_input.shape[0])
+                guidance_expand = (
+                    torch.tensor(
+                        [embedded_guidance_scale] * latent_model_input.shape[0],
+                        dtype=torch.float32,
+                        device=device,
+                    ).to(latent_dtype)
+                    * 1000.0
+                    if embedded_guidance_scale is not None
+                    else None
+                )
+                # predict the noise residual
+                with torch.autocast(
+                    device_type="cuda", dtype=target_dtype, enabled=autocast_enabled
+                ):
+                    if self.do_classifier_free_guidance and multi_passes_free_guidance:
+                        # One transformer call per branch. Optional inputs are forwarded as `None` when absent,
+                        # matching what the joint-pass branch below already does.
+                        for j in range(len(latent_model_input)):
+                            ret = self.transformer(
+                                latent_model_input[j].unsqueeze(0),
+                                t_expand[j].unsqueeze(0),
+                                text_states=prompt_embeds[j].unsqueeze(0),
+                                text_mask=prompt_mask[j].unsqueeze(0) if prompt_mask is not None else None,
+                                text_states_2=prompt_embeds_2[j].unsqueeze(0) if prompt_embeds_2 is not None else None,
+                                ref_latents=ref_latents[j].unsqueeze(0) if ref_latents is not None else None,
+                                freqs_cos=freqs_cis[0],  # [seqlen, head_dim]
+                                freqs_sin=freqs_cis[1],  # [seqlen, head_dim]
+                                guidance=guidance_expand,
+                                pipeline=self,
+                                x_id=j,
+                                step_no=i,
+                                bg_latents=bg_latents[j].unsqueeze(0) if bg_latents is not None else None,
+                                audio_prompts=audio_prompts[j].unsqueeze(0) if audio_prompts is not None else None,
+                                audio_strength=audio_strength,
+                                callback = callback,
+                            )
+                            if self._interrupt:
+                                return [None]
+                            if j==0:
+                                noise_pred_uncond= ret[0]
+                            elif j==1:
+                                noise_pred_text= ret[0]
+                            else:
+                                noise_pred_ip = ret[0]
+                            ret = None
+                    else:
+                        # Single batched call covering every branch.
+                        ret = self.transformer(
+                            latent_model_input,
+                            t_expand,
+                            text_states=prompt_embeds,
+                            text_mask=prompt_mask,
+                            text_states_2=prompt_embeds_2,
+                            ref_latents=ref_latents,
+                            freqs_cos=freqs_cis[0],  # [seqlen, head_dim]
+                            freqs_sin=freqs_cis[1],  # [seqlen, head_dim]
+                            guidance=guidance_expand,
+                            pipeline=self,
+                            step_no=i,
+                            bg_latents=bg_latents,
+                            audio_prompts=audio_prompts,
+                            audio_strength=audio_strength,
+                            callback = callback,
+                        )
+                        if self._interrupt:
+                            return [None]
+                        if self.do_classifier_free_guidance :
+                            if ip_cfg_scale > 0:
+                                noise_pred_uncond, noise_pred_text, noise_pred_ip = ret
+                            else:
+                                noise_pred_uncond, noise_pred_text = noise_pred = ret
+                        else:
+                            noise_pred = ret[0]
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    if cfg_star_rescale:
+                        # Scale the unconditional prediction by <cond, uncond> / ||uncond||^2.
+                        positive_flat = noise_pred_text.view(1, -1)
+                        negative_flat = noise_pred_uncond.view(1, -1)
+                        dot_product = torch.sum(
+                            positive_flat * negative_flat, dim=1, keepdim=True
+                        )
+                        squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
+                        positive_flat, negative_flat = None, None
+                        alpha = dot_product / squared_norm
+                        noise_pred_uncond *= alpha
+                    if ip_cfg_scale > 0:
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + start_scale * (noise_pred_ip-noise_pred_text)
+                        start_scale -= step_scale
+                        if i==0:
+                            print(f'i={i}, noise_pred shape={noise_pred.shape}')
+                    else:
+                        noise_pred = noise_pred_uncond + self.guidance_scale * ( noise_pred_text - noise_pred_uncond)
+                    if self.guidance_rescale > 0.0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg( noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale, )
+                # compute the previous noisy sample x_t -> x_t-1
+                if i2v_mode and i2v_condition_type == "token_replace":
+                    # Only the frames after the first are stepped; the image latents stay fixed as frame 0.
+                    noise_pred = noise_pred.unsqueeze(0)
+                    latents = self.scheduler.step(
+                        noise_pred[:, :, 1:, :, :], t, latents[:, :, 1:, :, :], **extra_step_kwargs, return_dict=False
+                    )[0]
+                    latents = torch.concat(
+                        [img_latents, latents], dim=2
+                    )
+                else:
+                    latents = self.scheduler.step(
+                        noise_pred, t, latents, **extra_step_kwargs, return_dict=False
+                    )[0]
+                # Drop references to the per-step predictions so their memory can be reclaimed.
+                noise_pred_uncond, noise_pred_text, noise_pred, noise_pred_ip, ret = None, None, None, None, None
+                if callback is not None:
+                    callback(i, latents.squeeze(0), False)
+        if self.interrupt:
+            return [None]
+        if mask_latents is not None:
+            latents = mask_latents * latents + (1 - mask_latents) * original_latents
+        if output_type != "latent":
+            expand_temporal_dim = False
+            if len(latents.shape) == 4:
+                if isinstance(self.vae, AutoencoderKLCausal3D):
+                    latents = latents.unsqueeze(2)
+                    expand_temporal_dim = True
+            elif len(latents.shape) == 5:
+                pass
+            else:
+                raise ValueError(
+                    f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}."
+                )
+            # Undo the latent scaling (and shift, when the VAE config defines one) before decoding.
+            if (
+                hasattr(self.vae.config, "shift_factor")
+                and self.vae.config.shift_factor
+            ):
+                latents = (
+                    latents / self.vae.config.scaling_factor
+                    + self.vae.config.shift_factor
+                )
+            else:
+                latents = latents / self.vae.config.scaling_factor
+            with torch.autocast(
+                device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled
+            ):
+                if enable_tiling:
+                    self.vae.enable_tiling()
+                image = self.vae.decode(
+                    latents, return_dict=False, generator=generator
+                )[0]
+            if expand_temporal_dim or image.shape[2] == 1:
+                image = image.squeeze(2)
+        else:
+            image = latents
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().float()
+        if i2v_mode and i2v_condition_type == "latent_concat":
+            # Drop the first four entries along dim 2 in this mode.
+            image = image[:, :, 4:, :, :]
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return image
+        return HunyuanVideoPipelineOutput(videos=image)

hyvideo/diffusion/pipelines/pipeline_hunyuan_video_audio.py ADDED Viewed

	@@ -0,0 +1,1362 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+import numpy as np
+import torch
+from packaging import version
+from diffusers.utils import BaseOutput
+from dataclasses import dataclass
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ImageProjection
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from hyvideo.constants import PRECISION_TO_TYPE
+from hyvideo.vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+from hyvideo.text_encoder import TextEncoder
+from einops import rearrange
+from ...modules import HYVideoDiffusionTransformer
+# Module-level logger created through the `logging` helper imported from `diffusers.utils`.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Example docstring text; it is an empty string in this file.
+EXAMPLE_DOC_STRING = """"""
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+# Result container with a single `videos` field, built on the `BaseOutput` class imported from diffusers.
+@dataclass
+class HunyuanVideoPipelineOutput(BaseOutput):
+    # the generated videos, held either as a torch tensor or as a numpy array
+    videos: Union[torch.Tensor, np.ndarray]
+class HunyuanVideoAudioPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-video generation using HunyuanVideo.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`TextEncoder`]):
+            Frozen text-encoder.
+        text_encoder_2 ([`TextEncoder`]):
+            Frozen text-encoder_2.
+        transformer ([`HYVideoDiffusionTransformer`]):
+            A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+    # NOTE(review): the class-level settings below are presumably consumed by the `DiffusionPipeline`
+    # base class; their exact semantics are defined there and are not visible in this file.
+    # Component names joined by "->"; looks like the order used for model CPU offload - confirm.
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    # `text_encoder_2` defaults to None in `__init__`, matching its listing here.
+    _optional_components = ["text_encoder_2"]
+    # Names the transformer as excluded from CPU offload (per the attribute name - confirm).
+    _exclude_from_cpu_offload = ["transformer"]
+    # Tensor names listed as callback inputs (per the attribute name - confirm).
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: TextEncoder,
+        transformer: HYVideoDiffusionTransformer,
+        scheduler: KarrasDiffusionSchedulers,
+        text_encoder_2: Optional[TextEncoder] = None,
+        progress_bar_config: Dict[str, Any] = None,
+        args=None,
+    ):
+        super().__init__()
+        # ==========================================================================================
+        if progress_bar_config is None:
+            progress_bar_config = {}
+        if not hasattr(self, '_progress_bar_config'):
+            self._progress_bar_config = {}
+        self._progress_bar_config.update(progress_bar_config)
+        self.args = args
+        # ==========================================================================================
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder_2=text_encoder_2
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+    def encode_prompt(
+        self,
+        prompt,
+        name,
+        device,
+        num_videos_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        pixel_value_llava: Optional[torch.Tensor] = None,
+        uncond_pixel_value_llava: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+        text_encoder: Optional[TextEncoder] = None,
+        data_type: Optional[str] = "image",
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_videos_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava.
+            uncond_pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava.  Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            attention_mask (`torch.Tensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            text_encoder (TextEncoder, *optional*):
+        """
+        if text_encoder is None:
+            text_encoder = self.text_encoder
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
+            else:
+                scale_lora_layers(text_encoder.model, lora_scale)
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
+            text_inputs = text_encoder.text2tokens(prompt, data_type=data_type, name=name)
+            if pixel_value_llava is not None:
+                text_inputs['pixel_value_llava'] = pixel_value_llava
+                text_inputs['attention_mask'] = torch.cat([text_inputs['attention_mask'], torch.ones((1, 575 * len(pixel_value_llava))).to(text_inputs['attention_mask'])], dim=1)
+            if clip_skip is None:
+                prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
+                prompt_embeds = prompt_outputs.hidden_state
+            else:
+                prompt_outputs = text_encoder.encode(text_inputs, output_hidden_states=True, data_type=data_type)
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = text_encoder.model.text_model.final_layer_norm(prompt_embeds)
+            attention_mask = prompt_outputs.attention_mask
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(device)
+                bs_embed, seq_len = attention_mask.shape
+                attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
+                attention_mask = attention_mask.view(bs_embed * num_videos_per_prompt, seq_len)
+        if text_encoder is not None:
+            prompt_embeds_dtype = text_encoder.dtype
+        elif self.transformer is not None:
+            prompt_embeds_dtype = self.transformer.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+        if prompt_embeds.ndim == 2:
+            bs_embed, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
+        else:
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, text_encoder.tokenizer)
+            uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type)
+            if uncond_pixel_value_llava is not None:
+                uncond_input['pixel_value_llava'] = uncond_pixel_value_llava
+                uncond_input['attention_mask'] = torch.cat([uncond_input['attention_mask'], torch.ones((1, 575 * len(uncond_pixel_value_llava))).to(uncond_input['attention_mask'])], dim=1)
+            negative_prompt_outputs = text_encoder.encode(uncond_input, data_type=data_type)
+            negative_prompt_embeds = negative_prompt_outputs.hidden_state
+            negative_attention_mask = negative_prompt_outputs.attention_mask
+            if negative_attention_mask is not None:
+                negative_attention_mask = negative_attention_mask.to(device)
+                _, seq_len = negative_attention_mask.shape
+                negative_attention_mask = negative_attention_mask.repeat(1, num_videos_per_prompt)
+                negative_attention_mask = negative_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+            if negative_prompt_embeds.ndim == 2:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt)
+                negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, -1)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+                negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+        if text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(text_encoder.model, lora_scale)
+        return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask
+    def encode_prompt_audio_text_base(
+        self,
+        prompt,
+        uncond_prompt,
+        pixel_value_llava,
+        uncond_pixel_value_llava,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+        text_encoder: Optional[TextEncoder] = None,
+        data_type: Optional[str] = "image",
+        name = "person"
+    ):
+        if text_encoder is None:
+            text_encoder = self.text_encoder
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
+            else:
+                scale_lora_layers(text_encoder.model, lora_scale)
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        prompt_embeds = None
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
+            text_inputs = text_encoder.text2tokens(prompt, data_type=data_type, name=name) # data_type: video, text_inputs: {'input_ids', 'attention_mask'}
+            text_keys = ['input_ids', 'attention_mask']
+            if pixel_value_llava is not None:
+                text_inputs['pixel_value_llava'] = pixel_value_llava
+                text_inputs['attention_mask'] = torch.cat([text_inputs['attention_mask'], torch.ones((1, 575)).to(text_inputs['attention_mask'])], dim=1)
+            if clip_skip is None:
+                prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
+                prompt_embeds = prompt_outputs.hidden_state
+            else:
+                prompt_outputs = text_encoder.encode(text_inputs, output_hidden_states=True, data_type=data_type)
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = text_encoder.model.text_model.final_layer_norm(prompt_embeds)
+            attention_mask = prompt_outputs.attention_mask
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(device)
+                bs_embed, seq_len = attention_mask.shape
+                attention_mask = attention_mask.repeat(1, num_images_per_prompt)
+                attention_mask = attention_mask.view(bs_embed * num_images_per_prompt, seq_len)
+        if text_encoder is not None:
+            prompt_embeds_dtype = text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+        if prompt_embeds.ndim == 2:
+            bs_embed, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, -1)
+        else:
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, text_encoder.tokenizer)
+            # max_length = prompt_embeds.shape[1]
+            uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type, name=name)
+            # if hasattr(text_encoder.model.config, "use_attention_mask") and text_encoder.model.config.use_attention_mask:
+            #     attention_mask = uncond_input.attention_mask.to(device)
+            # else:
+            #     attention_mask = None
+            if uncond_pixel_value_llava is not None:
+                uncond_input['pixel_value_llava'] = uncond_pixel_value_llava
+                uncond_input['attention_mask'] = torch.cat([uncond_input['attention_mask'], torch.ones((1, 575)).to(uncond_input['attention_mask'])], dim=1)
+            negative_prompt_outputs = text_encoder.encode(uncond_input, data_type=data_type)
+            negative_prompt_embeds = negative_prompt_outputs.hidden_state
+            negative_attention_mask = negative_prompt_outputs.attention_mask
+            if negative_attention_mask is not None:
+                negative_attention_mask = negative_attention_mask.to(device)
+                _, seq_len = negative_attention_mask.shape
+                negative_attention_mask = negative_attention_mask.repeat(1, num_images_per_prompt)
+                negative_attention_mask = negative_attention_mask.view(batch_size * num_images_per_prompt, seq_len)
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+            if negative_prompt_embeds.ndim == 2:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+                negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+                negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+        if text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(text_encoder.model, lora_scale)
+        return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask
+    def decode_latents(self, latents, enable_tiling=True):
+        """Decode latents with the VAE and map the result into the [0, 1] range.
+
+        Deprecated: every call emits the deprecation warning built below.
+
+        Args:
+            latents: Latent tensor; it is divided by ``self.vae.config.scaling_factor``
+                before being passed to ``self.vae.decode``.
+            enable_tiling: When True, VAE tiling is switched on for the decode and
+                switched off again afterwards (also when decoding fails).
+
+        Returns:
+            A float32 CPU tensor clamped to [0, 1]. 4-D results are permuted with
+            ``(0, 2, 3, 1)``; results of any other rank keep the layout returned by the VAE.
+        """
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+        latents = 1 / self.vae.config.scaling_factor * latents
+        if enable_tiling:
+            self.vae.enable_tiling()
+            try:
+                image = self.vae.decode(latents, return_dict=False)[0]
+            finally:
+                # Switch tiling back off even if decoding raises, so a failed call
+                # does not leave the VAE in tiled mode for later callers.
+                self.vae.disable_tiling()
+        else:
+            image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu()
+        if image.ndim == 4:
+            image = image.permute(0, 2, 3, 1)
+        return image.float()
+    def prepare_extra_func_kwargs(self, func, kwargs):
+        """Return the subset of ``kwargs`` whose keys are parameter names of ``func``.
+
+        Used to pass optional arguments to callables whose signatures differ from one
+        implementation to another: entries that ``func`` does not declare by name are
+        dropped instead of causing a ``TypeError`` at call time.
+
+        Args:
+            func: Callable whose signature is inspected.
+            kwargs: Mapping of candidate keyword arguments.
+
+        Returns:
+            A new dict with the accepted entries, in the iteration order of ``kwargs``.
+        """
+        if not kwargs:
+            # Nothing to filter; skip the signature inspection entirely.
+            return {}
+        # Inspect the signature once instead of once per candidate key.
+        accepted = inspect.signature(func).parameters
+        return {k: v for k, v in kwargs.items() if k in accepted}
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        frame,
+        callback_steps,
+        pixel_value_llava=None,
+        uncond_pixel_value_llava=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        vae_ver='88-4c-sd'
+    ):
+        """Validate the arguments of a generation call; raise ``ValueError`` on the first problem.
+
+        Checks, in order:
+          * ``height`` and ``width`` are divisible by 8;
+          * ``frame`` (when not None) satisfies ``(frame - 1) % 4 == 0`` if ``vae_ver``
+            contains ``'884'``, or ``(frame - 1) % 8 == 0`` if it contains ``'888'``;
+            other ``vae_ver`` values skip the frame check;
+          * ``callback_steps`` (when not None) is a positive ``int``;
+          * every name in ``callback_on_step_end_tensor_inputs`` is listed in
+            ``self._callback_tensor_inputs``;
+          * exactly one of ``prompt`` / ``prompt_embeds`` is given, and ``prompt`` is a
+            ``str`` or ``list``;
+          * ``negative_prompt`` and ``negative_prompt_embeds`` are not both given;
+          * ``pixel_value_llava`` and ``uncond_pixel_value_llava`` have equal length when
+            both are given;
+          * ``prompt_embeds`` and ``negative_prompt_embeds`` have equal shape when both
+            are given.
+
+        Returns:
+            None.
+
+        Raises:
+            ValueError: If any check above fails.
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if frame is not None:
+            # Step between valid frame counts, selected by the tag found in `vae_ver`.
+            if '884' in vae_ver:
+                frame_step = 4
+            elif '888' in vae_ver:
+                frame_step = 8
+            else:
+                frame_step = None
+            # The condition accepts 1, step + 1, 2 * step + 1, ...; the message now says
+            # so (it used to claim "a multiple of <step>", which this check rejects).
+            if frame_step is not None and (frame - 1) % frame_step != 0:
+                raise ValueError(
+                    f'`frame` has to be of the form {frame_step}n+1'
+                    f' (1, {frame_step + 1}, {2 * frame_step + 1}, ...) but is {frame}.'
+                )
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if pixel_value_llava is not None and uncond_pixel_value_llava is not None:
+            if len(pixel_value_llava) != len(uncond_pixel_value_llava):
+                raise ValueError(
+                    "`pixel_value_llava` and `uncond_pixel_value_llava` must have the same length when passed directly, but"
+                    f" got: `pixel_value_llava` {len(pixel_value_llava)} != `uncond_pixel_value_llava`"
+                    f" {len(uncond_pixel_value_llava)}."
+                )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+    def get_timesteps(self, num_inference_steps, strength, device):
+        """Select the trailing part of the scheduler's timesteps for a given strength.
+
+        Args:
+            num_inference_steps: Total number of steps the scheduler was configured with.
+            strength: Fraction of the schedule to run; ``int(num_inference_steps * strength)``
+                steps are kept, capped at ``num_inference_steps``.
+            device: Device the returned timesteps are moved to.
+
+        Returns:
+            A tuple ``(timesteps, steps_kept)``: the sliced timesteps on ``device`` and the
+            number of inference steps they correspond to.
+        """
+        scheduler = self.scheduler
+        steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
+        skipped_steps = max(num_inference_steps - steps_to_run, 0)
+        # Schedulers of order > 1 hold `order` entries per inference step.
+        begin_index = skipped_steps * scheduler.order
+        kept_timesteps = scheduler.timesteps[begin_index:]
+        if hasattr(scheduler, "set_begin_index"):
+            scheduler.set_begin_index(begin_index)
+        steps_kept = num_inference_steps - skipped_steps
+        return kept_timesteps.to(device), steps_kept
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, frame, dtype, device, generator, latents=None, ref_latents=None, timestep=None):
+        """Create the starting latents for denoising, or adopt the ones supplied by the caller.
+
+        Args:
+            batch_size: Effective batch size of the latent tensor.
+            num_channels_latents: Number of latent channels.
+            height: Output height in pixels; divided by ``self.vae_scale_factor``.
+            width: Output width in pixels; divided by ``self.vae_scale_factor``.
+            frame: Size of the third (frame) dimension of the latent tensor.
+            dtype: dtype used when new latents are sampled.
+            device: Device for the latents.
+            generator: A generator or a list of generators; a list must have
+                ``batch_size`` entries.
+            latents: Optional pre-made latents. They are moved to ``device`` and otherwise
+                used as given (their dtype and shape are not changed).
+            ref_latents: Accepted for call compatibility; not used.
+            timestep: Accepted for call compatibility; not used.
+
+        Returns:
+            The latents, multiplied by ``self.scheduler.init_noise_sigma`` when the
+            scheduler defines that attribute.
+
+        Raises:
+            ValueError: If ``generator`` is a list whose length differs from ``batch_size``.
+        """
+        shape = (
+            batch_size,
+            num_channels_latents,
+            frame,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+        # NOTE: an earlier revision built a frame-repeated copy of `ref_latents` here whenever
+        # `timestep` was given and then discarded it. That wasted memory and raised
+        # AttributeError when `ref_latents` was None, so the dead computation was removed.
+        # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
+        if hasattr(self.scheduler, "init_noise_sigma"):
+            latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
+        """
+        Build sinusoidal embeddings of guidance-scale values.
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+        Args:
+            w (`torch.Tensor`):
+                1-D tensor of guidance scales, one per embedding row to generate.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Width of each embedding row.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type used for the frequencies and the resulting embeddings.
+        Returns:
+            `torch.Tensor`: Tensor of shape `(len(w), embedding_dim)`; sine terms fill the first half,
+            cosine terms the second, and an odd `embedding_dim` gets one trailing zero column.
+        """
+        assert w.ndim == 1
+        num_freqs = embedding_dim // 2
+        # Geometric frequency ladder running from 1 down to 1/10000.
+        log_step = torch.log(torch.tensor(10000.0)) / (num_freqs - 1)
+        freqs = torch.exp(torch.arange(num_freqs, dtype=dtype) * -log_step)
+        scaled_w = w * 1000.0
+        angles = scaled_w.to(dtype)[:, None] * freqs[None, :]
+        embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
+        if embedding_dim % 2 != 0:
+            # odd width: append a single zero column
+            embedding = torch.nn.functional.pad(embedding, (0, 1))
+        assert embedding.shape == (w.shape[0], embedding_dim)
+        return embedding
+    @property
+    def guidance_scale(self):
+        """Current guidance scale (`self._guidance_scale`).
+
+        Assigned from the `guidance_scale` argument in `__call__`, and re-assigned on each
+        denoising step there while classifier-free guidance is active.
+        """
+        return self._guidance_scale
+    @property
+    def guidance_rescale(self):
+        """Guidance rescale factor stored by `__call__` (`self._guidance_rescale`)."""
+        return self._guidance_rescale
+    @property
+    def clip_skip(self):
+        """`clip_skip` value stored by `__call__` (`self._clip_skip`); may be None."""
+        return self._clip_skip
+    # here `guidance_scale` is defined analogous to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        """True when the stored guidance scale is strictly greater than 1."""
+        # Earlier variant, kept for reference; it additionally required the transformer
+        # config to have no `time_cond_proj_dim`:
+        # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
+        return self._guidance_scale > 1
+    @property
+    def cross_attention_kwargs(self):
+        """`cross_attention_kwargs` dict stored by `__call__` (`self._cross_attention_kwargs`); may be None.
+
+        `__call__` reads its optional `"scale"` entry as the LoRA scale.
+        """
+        return self._cross_attention_kwargs
+    @property
+    def num_timesteps(self):
+        """Number of scheduler timesteps of the current run (`self._num_timesteps`), set in `__call__`."""
+        return self._num_timesteps
+    @property
+    def interrupt(self):
+        """Interrupt flag (`self._interrupt`).
+
+        `__call__` returns `[None]` early when it is truthy. The flag is not assigned in
+        this part of the file -- presumably set by external code; TODO confirm.
+        """
+        return self._interrupt
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        ref_latents: Union[torch.Tensor],                            # [1, 16, 1, h//8, w//8]
+        # uncond_ref_latents: Union[torch.Tensor],
+        pixel_value_llava: Union[torch.Tensor],                # [1, 3, 336, 336]
+        uncond_pixel_value_llava: Union[torch.Tensor],
+        pixel_value_ref: Union[torch.Tensor],
+        face_masks: Union[torch.Tensor],                              # [b f h w]
+        audio_prompts: Union[torch.Tensor],
+        uncond_audio_prompts: Union[torch.Tensor],
+        motion_exp: Union[torch.Tensor],
+        motion_pose: Union[torch.Tensor],
+        fps: Union[torch.Tensor],
+        height: int,
+        width: int,
+        video_length: int,
+        data_type: str = "video",
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[
+                Callable[[int, int, Dict], None],
+                PipelineCallback,
+                MultiPipelineCallbacks,
+            ]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+        vae_ver: str = "88-4c-sd",
+        enable_tiling: bool = False,
+        n_tokens: Optional[int] = None,
+        embedded_guidance_scale: Optional[float] = None,
+        joint_pass = False,
+        cfg_star_rescale = False,
+        name = None,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            video_length (`int`):
+                The number of frames in the generated video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+        Returns:
+            [`~HunyuanVideoPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        if self._interrupt:
+            return [None]
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+        # num_inference_steps =  50
+        # 0. Default height and width to transformer
+        # height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        # width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+        transformer = self.transformer
+        if transformer.enable_teacache:
+            teacache_multiplier = transformer.teacache_multiplier
+            transformer.accumulated_rel_l1_distance = 0
+            transformer.rel_l1_thresh = 0.1 if teacache_multiplier < 2 else 0.15
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            video_length,
+            callback_steps,
+            pixel_value_llava,
+            uncond_pixel_value_llava,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            vae_ver=vae_ver
+        )
+        self._guidance_scale = guidance_scale
+        self.start_cfg_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+        # ========== Encode text prompt (image prompt) ==========
+        prompt_embeds, negative_prompt_embeds, prompt_mask, negative_prompt_mask = \
+            self.encode_prompt_audio_text_base(
+                prompt=prompt,
+                uncond_prompt=negative_prompt,
+                pixel_value_llava=pixel_value_llava,
+                uncond_pixel_value_llava=uncond_pixel_value_llava,
+                device=device,
+                num_images_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+                text_encoder=self.text_encoder,
+                data_type=data_type,
+                name= name,
+                # **kwargs
+            )
+        if self.text_encoder_2 is not None:
+            prompt_embeds_2, negative_prompt_embeds_2, prompt_mask_2, negative_prompt_mask_2 = \
+                self.encode_prompt_audio_text_base(
+                    prompt=prompt,
+                    uncond_prompt=negative_prompt,
+                    pixel_value_llava=None,
+                    uncond_pixel_value_llava=None,
+                    device=device,
+                    num_images_per_prompt=num_videos_per_prompt,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    negative_prompt=negative_prompt,
+                    prompt_embeds=None,
+                    negative_prompt_embeds=None,
+                    lora_scale=lora_scale,
+                    clip_skip=self.clip_skip,
+                    text_encoder=self.text_encoder_2,
+                    # **kwargs
+                )
+        else:
+            prompt_embeds_2 = None
+            negative_prompt_embeds_2 = None
+            prompt_mask_2 = None
+            negative_prompt_mask_2 = None
+        if self.transformer.mixed_precision:
+            latent_dtype = torch.float32
+        else:
+            latent_dtype = torch.bfloat16
+        if prompt_embeds != None:
+            prompt_embeds = prompt_embeds.to(torch.bfloat16)
+        if negative_prompt_embeds != None:
+            negative_prompt_embeds = negative_prompt_embeds.to(torch.bfloat16)
+        if prompt_embeds_2 != None:
+            prompt_embeds_2 = prompt_embeds_2.to(torch.bfloat16)
+        if negative_prompt_embeds_2 != None:
+            negative_prompt_embeds_2 = negative_prompt_embeds_2.to(torch.bfloat16)
+        if audio_prompts != None:
+            audio_prompts = audio_prompts.to(torch.bfloat16)
+        if face_masks!= None:
+            face_masks = face_masks.to(torch.bfloat16)
+        if ref_latents != None:
+            ref_latents = ref_latents.to(torch.bfloat16)
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds_input = torch.cat([negative_prompt_embeds, prompt_embeds])
+            if prompt_mask is not None:
+                prompt_mask_input = torch.cat([negative_prompt_mask, prompt_mask])
+            if prompt_embeds_2 is not None:
+                prompt_embeds_2_input = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            if prompt_mask_2 is not None:
+                prompt_mask_2_input = torch.cat([negative_prompt_mask_2, prompt_mask_2])
+        if self.do_classifier_free_guidance and ref_latents != None:
+            ref_latents = torch.cat([ref_latents, ref_latents], dim=0)
+        # 4. Prepare timesteps
+        extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
+            self.scheduler.set_timesteps, {"n_tokens": n_tokens}
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas, **extra_set_timesteps_kwargs,
+        )
+        video_length = audio_prompts.shape[1] // 4 * 4 + 1
+        if "884" in vae_ver:
+            video_length = (video_length - 1) // 4 + 1
+        elif "888" in vae_ver:
+            video_length = (video_length - 1) // 8 + 1
+        else:
+            video_length = video_length
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        infer_length = (audio_prompts.shape[1] // 128 + 1) * 32 + 1
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            infer_length,
+            latent_dtype, #prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            ref_latents[-1:] if ref_latents != None else None,
+            timesteps[:1]
+        )
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_func_kwargs(
+            self.scheduler.step, {"generator": generator, "eta": eta},
+        )
+        vae_precision = "fp16" # torch.float16
+        precision = "bf16" # torch.bfloat16
+        disable_autocast =  True
+        target_dtype = PRECISION_TO_TYPE[precision]
+        autocast_enabled = (target_dtype != torch.float32) and not disable_autocast
+        vae_dtype = self.vae._model_dtype #PRECISION_TO_TYPE[vae_precision]
+        vae_autocast_enabled = (vae_dtype != torch.float32) and not disable_autocast
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        latents_all = latents.clone()
+        pad_audio_length = (audio_prompts.shape[1] // 128 + 1) * 128 + 4 - audio_prompts.shape[1]
+        audio_prompts_all = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :pad_audio_length])], dim=1)
+        shift = 0
+        shift_offset = 10
+        frames_per_batch = 33
+        self.cache_tensor = None
+        """ If the total length is shorter than 129, shift is not required """
+        if video_length == 33 or infer_length == 33:
+            infer_length = 33
+            shift_offset = 0
+            latents_all = latents_all[:, :, :33]
+            audio_prompts_all = audio_prompts_all[:, :132]
+        joint_pass = joint_pass or not self.do_classifier_free_guidance
+        if callback != None:
+            callback(-1, None, True, override_num_inference_steps = num_inference_steps)
+        latent_items = 2 if self.do_classifier_free_guidance else 1
+        fps = torch.from_numpy(np.array(fps)).unsqueeze(0).to(dtype=torch.float16)
+        if self._interrupt:
+            return [None]
+        if transformer.enable_teacache:
+            cache_size = round( infer_length / frames_per_batch )
+            transformer.previous_residual = [None] * latent_items
+            cache_all_previous_residual =  [None] * latent_items
+            cache_all_previous_modulated_input = None
+            cache_should_calc = [True] * cache_size
+            cache_accumulated_rel_l1_distance = [0.] * cache_size
+            cache_teacache_skipped_steps = [0] * cache_size
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # init
+                pred_latents = torch.zeros_like(
+                    latents_all,
+                    dtype=latents_all.dtype,
+                )
+                counter = torch.zeros(
+                    (latents_all.shape[0], latents_all.shape[1], infer_length, 1, 1),
+                    dtype=latents_all.dtype,
+                ).to(device=latents_all.device)
+                cache_slot_no = 0
+                for index_start in range(0, infer_length, frames_per_batch):
+                    self.scheduler._step_index = None
+                    index_start = index_start - shift
+                    idx_list = [ii % latents_all.shape[2] for ii in range(index_start, index_start + frames_per_batch)]
+                    latents = latents_all[:, :, idx_list].clone()
+                    idx_list_audio = [ii % audio_prompts_all.shape[1] for ii in range(index_start * 4, (index_start + frames_per_batch) * 4 - 3)]
+                    audio_prompts = audio_prompts_all[:, idx_list_audio].clone()
+                    # expand the latents if we are doing classifier free guidance
+                    if self.do_classifier_free_guidance:
+                        latent_model_input = torch.cat([latents] * 2)
+                    else:
+                        latent_model_input = latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                    embedded_hw = (latent_model_input.shape[-1] // 2) * (latent_model_input.shape[-2] // 2) * 3072
+                    img_ref_len = (latent_model_input.shape[-1] // 2)  * (latent_model_input.shape[-2] // 2) * ( 1)
+                    img_all_len = (latents_all.shape[-1] // 2)  * (latents_all.shape[-2] // 2) * latents_all.shape[-3]
+                    if transformer.enable_teacache and cache_size > 1:
+                        for l in range(latent_items):
+                            if cache_all_previous_residual[l] != None:
+                                bsz = cache_all_previous_residual[l].shape[0]
+                                transformer.previous_residual[l][:, img_ref_len:] = cache_all_previous_residual[l].reshape(1, -1, embedded_hw) [:, idx_list].reshape(1, -1, 3072)
+                        if cache_all_previous_modulated_input != None:
+                            transformer.previous_modulated_input[:, img_ref_len:] = cache_all_previous_modulated_input.reshape(1, -1, embedded_hw) [:, idx_list].reshape(1, -1, 3072)
+                        transformer.should_calc = cache_should_calc[cache_slot_no]
+                        transformer.accumulated_rel_l1_distance = cache_accumulated_rel_l1_distance[cache_slot_no]
+                        transformer.teacache_skipped_steps = cache_teacache_skipped_steps[cache_slot_no]
+                    if self.do_classifier_free_guidance:
+                        if i < num_inference_steps * 0.2 :
+                            self._guidance_scale = (1 - i / len(timesteps)) * (self.start_cfg_scale - 2) + 2
+                            audio_prompts_input = torch.cat([uncond_audio_prompts, audio_prompts], dim=0)
+                            face_masks_input = torch.cat([face_masks * 0.6] * 2, dim=0)
+                        else:
+                            # define 10-50 step cfg
+                            self._guidance_scale = (1 - i / len(timesteps)) * (6.5 - 3.5) + 3.5  # 5-2 +2
+                            prompt_embeds_input = torch.cat([prompt_embeds, prompt_embeds])
+                            if prompt_mask is not None:
+                                prompt_mask_input = torch.cat([prompt_mask, prompt_mask])
+                            if prompt_embeds_2 is not None:
+                                prompt_embeds_2_input = torch.cat([prompt_embeds_2, prompt_embeds_2])
+                            if prompt_mask_2 is not None:
+                                prompt_mask_2_input = torch.cat([prompt_mask_2, prompt_mask_2])
+                            audio_prompts_input = torch.cat([uncond_audio_prompts, audio_prompts], dim=0)
+                            face_masks_input = torch.cat([face_masks] * 2, dim=0)
+                        motion_exp_input = torch.cat([motion_exp] * 2, dim=0)
+                        motion_pose_input = torch.cat([motion_pose] * 2, dim=0)
+                        fps_input = torch.cat([fps] * 2, dim=0)
+                    else:
+                        audio_prompts_input = audio_prompts
+                        face_masks_input = face_masks
+                        motion_exp_input = motion_exp
+                        motion_pose_input = motion_pose
+                        fps_input = fps
+                    t_expand = t.repeat(latent_model_input.shape[0])
+                    guidance_expand = None
+                    with torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled):
+                        additional_kwargs = {
+                            "pipeline": self,
+                            "step_no": i,
+                        }
+                        if joint_pass:
+                            additional_kwargs.update({
+                                "motion_exp": motion_exp_input,
+                                "motion_pose": motion_pose_input,
+                                "fps": fps_input,
+                                "audio_prompts": audio_prompts_input,
+                                "face_mask": face_masks_input
+                            })
+                            noise_pred = self.transformer(latent_model_input, t_expand, ref_latents=ref_latents, text_states=prompt_embeds_input, text_mask=prompt_mask_input, text_states_2=prompt_embeds_2_input, freqs_cos=freqs_cis[0], freqs_sin=freqs_cis[1], guidance=guidance_expand, **additional_kwargs,)
+                            if self._interrupt:
+                                return [None]
+                            if self.do_classifier_free_guidance:
+                                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        else:
+                            additional_kwargs.update({
+                                "motion_exp": motion_exp_input[:1],
+                                "motion_pose": motion_pose_input[:1],
+                                "fps": fps_input[:1],
+                                "audio_prompts": audio_prompts_input[:1],
+                                "face_mask": face_masks_input[:1]
+                            })
+                            noise_pred_uncond = self.transformer(latent_model_input[:1], t_expand[:1], ref_latents=ref_latents[:1], text_states=prompt_embeds_input[:1], text_mask=prompt_mask_input[:1], text_states_2=prompt_embeds_2_input[:1], freqs_cos=freqs_cis[0], freqs_sin=freqs_cis[1], guidance=guidance_expand, x_id = 0, **additional_kwargs,)
+                            if self._interrupt:
+                                return [None]
+                            noise_pred_uncond = noise_pred_uncond[0]
+                            additional_kwargs.update({
+                                "motion_exp": motion_exp_input[1:],
+                                "motion_pose": motion_pose_input[1:],
+                                "fps": fps_input[1:],
+                                "audio_prompts": audio_prompts_input[1:],
+                                "face_mask": face_masks_input[1:]
+                            })
+                            noise_pred_text = self.transformer(latent_model_input[1:], t_expand[1:], ref_latents=ref_latents[1:], text_states=prompt_embeds_input[1:], text_mask=prompt_mask_input[1:], text_states_2=prompt_embeds_2_input[1:], freqs_cos=freqs_cis[0], freqs_sin=freqs_cis[1], guidance=guidance_expand, x_id = 1, **additional_kwargs,)
+                            if self._interrupt:
+                                return [None]
+                            noise_pred_text = noise_pred_text[0]
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        if cfg_star_rescale:
+                            batch_size = 1
+                            positive_flat = noise_pred_text.view(batch_size, -1)
+                            negative_flat = noise_pred_uncond.view(batch_size, -1)
+                            dot_product = torch.sum(
+                                positive_flat * negative_flat, dim=1, keepdim=True
+                            )
+                            squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
+                            positive_flat, negative_flat = None, None
+                            alpha = dot_product / squared_norm
+                            noise_pred_uncond *= alpha
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+                    noise_pred_text, noise_pred_uncond = None, None
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                    noise_pred = None
+                    latents = latents.to(torch.bfloat16)
+                    for iii in range(frames_per_batch):
+                        p = (index_start + iii) % pred_latents.shape[2]
+                        pred_latents[:, :, p] += latents[:, :, iii]
+                        counter[:, :, p] += 1
+                    if transformer.enable_teacache and cache_size > 1:
+                        for l in range(latent_items):
+                            if transformer.previous_residual[l] != None:
+                                bsz = transformer.previous_residual[l].shape[0]
+                                if cache_all_previous_residual[l] == None:
+                                    cache_all_previous_residual[l] = torch.zeros((bsz, img_all_len, 3072 ), device=transformer.previous_residual[l].device, dtype=transformer.previous_residual[l].dtype)
+                                cache_all_previous_residual[l].reshape(bsz, -1, embedded_hw)[:, idx_list] = transformer.previous_residual[l][:, img_ref_len:].reshape(bsz, -1, embedded_hw)
+                        if transformer.previous_modulated_input  != None:
+                            if cache_all_previous_modulated_input == None:
+                                cache_all_previous_modulated_input = torch.zeros((1, img_all_len, 3072 ), device=transformer.previous_modulated_input.device, dtype=transformer.previous_modulated_input.dtype)
+                            cache_all_previous_modulated_input.reshape(1, -1, embedded_hw)[:, idx_list] = transformer.previous_modulated_input[:, img_ref_len:].reshape(1, -1, embedded_hw)
+                        cache_should_calc[cache_slot_no]  = transformer.should_calc
+                        cache_accumulated_rel_l1_distance[cache_slot_no]  = transformer.accumulated_rel_l1_distance
+                        cache_teacache_skipped_steps[cache_slot_no]  = transformer.teacache_skipped_steps
+                    cache_slot_no += 1
+                shift += shift_offset
+                shift = shift % frames_per_batch
+                pred_latents  = pred_latents / counter
+                latents_all = pred_latents
+                if callback is not None:
+                    callback(i, latents_all.squeeze(0), False)
+        latents = latents_all.float()[:, :, :video_length]
+        if not output_type == "latent":
+            expand_temporal_dim = False
+            if len(latents.shape) == 4:
+                if isinstance(self.vae, AutoencoderKLCausal3D):
+                    latents = latents.unsqueeze(2)
+                    expand_temporal_dim = True
+            elif len(latents.shape) == 5:
+                pass
+            else:
+                raise ValueError(
+                    f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}.")
+            if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
+                latents = latents / self.vae.config.scaling_factor + self.vae.config.shift_factor
+            else:
+                latents = latents / self.vae.config.scaling_factor
+            with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled):
+                image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
+            if image is None:
+                return (None, )
+            if expand_temporal_dim or image.shape[2] == 1:
+                image = image.squeeze(2)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().float()
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return image
+        return HunyuanVideoPipelineOutput(videos=image)

hyvideo/diffusion/schedulers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .scheduling_flow_match_discrete import FlowMatchDiscreteScheduler

hyvideo/diffusion/schedulers/scheduling_flow_match_discrete.py ADDED Viewed

	@@ -0,0 +1,255 @@

+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import BaseOutput, logging
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+@dataclass
+class FlowMatchDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+    prev_sample: torch.FloatTensor
+class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+        reverse (`bool`, defaults to `True`):
+            Whether to reverse the timestep schedule.
+    """
+    _compatibles = []
+    order = 1
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+        reverse: bool = True,
+        solver: str = "euler",
+        n_tokens: Optional[int] = None,
+    ):
+        sigmas = torch.linspace(1, 0, num_train_timesteps + 1)
+        if not reverse:
+            sigmas = sigmas.flip(0)
+        self.sigmas = sigmas
+        # the value fed to model
+        self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
+        self._step_index = None
+        self._begin_index = None
+        self.supported_solver = ["euler"]
+        if solver not in self.supported_solver:
+            raise ValueError(
+                f"Solver {solver} not supported. Supported solvers: {self.supported_solver}"
+            )
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        n_tokens: int = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            n_tokens (`int`, *optional*):
+                Number of tokens in the input sequence.
+        """
+        self.num_inference_steps = num_inference_steps
+        sigmas = torch.linspace(1, 0, num_inference_steps + 1)
+        sigmas = self.sd3_time_shift(sigmas)
+        if not self.config.reverse:
+            sigmas = 1 - sigmas
+        self.sigmas = sigmas
+        self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(
+            dtype=torch.float32, device=device
+        )
+        # Reset step index
+        self._step_index = None
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+        indices = (schedule_timesteps == timestep).nonzero()
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+        return indices[pos].item()
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+    def scale_model_input(
+        self, sample: torch.Tensor, timestep: Optional[int] = None
+    ) -> torch.Tensor:
+        return sample
+    def sd3_time_shift(self, t: torch.Tensor):
+        return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            n_tokens (`int`, *optional*):
+                Number of tokens in the input sequence.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+        if self.step_index is None:
+            self._init_step_index(timestep)
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+        dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
+        if self.config.solver == "euler":
+            prev_sample = sample + model_output.to(torch.float32) * dt
+        else:
+            raise ValueError(
+                f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}"
+            )
+        # upon completion increase step index by one
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample,)
+        return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)
+    def __len__(self):
+        return self.config.num_train_timesteps

hyvideo/hunyuan.py ADDED Viewed

	@@ -0,0 +1,1062 @@

+import os
+import time
+import random
+import functools
+from typing import List, Optional, Tuple, Union
+from pathlib import Path
+from einops import rearrange
+import torch
+import torch.distributed as dist
+from hyvideo.constants import PROMPT_TEMPLATE, NEGATIVE_PROMPT, PRECISION_TO_TYPE, NEGATIVE_PROMPT_I2V
+from hyvideo.vae import load_vae
+from hyvideo.modules import load_model
+from hyvideo.text_encoder import TextEncoder
+from hyvideo.utils.data_utils import align_to, get_closest_ratio, generate_crop_size_list
+from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed, get_nd_rotary_pos_embed_new
+from hyvideo.diffusion.schedulers import FlowMatchDiscreteScheduler
+from hyvideo.diffusion.pipelines import HunyuanVideoPipeline
+from hyvideo.diffusion.pipelines import HunyuanVideoAudioPipeline
+from PIL import Image
+import numpy as np
+import torchvision.transforms as transforms
+import cv2
+from wan.utils.utils import resize_lanczos, calculate_new_dimensions
+from hyvideo.data_kits.audio_preprocessor import encode_audio, get_facemask
+from transformers import WhisperModel
+from transformers import AutoFeatureExtractor
+from hyvideo.data_kits.face_align import AlignImage
+import librosa
+def get_audio_feature(feature_extractor, audio_path, duration):
+    audio_input, sampling_rate = librosa.load(audio_path, duration=duration, sr=16000)
+    assert sampling_rate == 16000
+    audio_features = []
+    window = 750*640
+    for i in range(0, len(audio_input), window):
+        audio_feature = feature_extractor(audio_input[i:i+window],
+                                        sampling_rate=sampling_rate,
+                                        return_tensors="pt",
+                                        device="cuda"
+                                        ).input_features
+        audio_features.append(audio_feature)
+    audio_features = torch.cat(audio_features, dim=-1)
+    return audio_features, len(audio_input) // 640
+def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
+    crop_h, crop_w = crop_img.shape[:2]
+    target_w, target_h = size
+    scale_h, scale_w = target_h / crop_h, target_w / crop_w
+    if scale_w > scale_h:
+        resize_h = int(target_h*resize_ratio)
+        resize_w = int(crop_w / crop_h * resize_h)
+    else:
+        resize_w = int(target_w*resize_ratio)
+        resize_h = int(crop_h / crop_w * resize_w)
+    crop_img = cv2.resize(crop_img, (resize_w, resize_h))
+    pad_left = (target_w - resize_w) // 2
+    pad_top = (target_h - resize_h) // 2
+    pad_right = target_w - resize_w - pad_left
+    pad_bottom = target_h - resize_h - pad_top
+    crop_img = cv2.copyMakeBorder(crop_img, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color)
+    return crop_img
+def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
+    """
+    Splice image patch embeddings into the text embedding sequence at the image placeholder tokens.
+    Every token of `input_ids` equal to `self.config.image_token_index` is expanded into `num_image_patches`
+    positions filled from `image_features`; all other tokens keep their embedding from `inputs_embeds` but are
+    moved to their new position in the longer sequence.
+    Args:
+        self: Object providing `pad_token_id`, `config.image_token_index` and `config.ignore_index`
+            (this function is installed as a method on the LLaVA model class).
+        image_features: 3-D tensor unpacked as `(num_images, num_image_patches, embed_dim)`.
+        inputs_embeds: Text token embeddings, indexed as `[batch, position]`.
+        input_ids: 2-D tensor unpacked as `(batch_size, sequence_length)`.
+        attention_mask: Mask aligned with `input_ids`; must not be `None` (its dtype is read).
+        labels: Optional labels aligned with `input_ids`, or `None`.
+    Returns:
+        Tuple `(final_embedding, final_attention_mask, final_labels, position_ids)` for the merged sequence;
+        `final_labels` is `None` when `labels` is `None`.
+    Raises:
+        ValueError: If the number of positions reserved for images differs from the number of image feature
+            vectors supplied.
+    """
+    num_images, num_image_patches, embed_dim = image_features.shape
+    batch_size, sequence_length = input_ids.shape
+    # Left padding is assumed when no sequence ends with the pad token.
+    left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
+    # 1. Create a mask to know where special image tokens are
+    special_image_token_mask = input_ids == self.config.image_token_index
+    num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
+    # Compute the maximum embed dimension
+    # (i.e. the merged sequence length: each image token grows from 1 to `num_image_patches` positions)
+    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+    batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
+    # 2. Compute the positions where text should be written
+    # Calculate new positions for text tokens in merged image-text sequence.
+    # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
+    # `torch.cumsum` computes how each image token shifts subsequent text token positions.
+    # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
+    new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+    # Number of unused trailing slots per row (rows with fewer images than the batch maximum).
+    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+    if left_padding:
+        new_token_positions += nb_image_pad[:, None]  # offset for left padding
+    text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+    # 3. Create the full embedding, already padded to the maximum position
+    final_embedding = torch.zeros(
+        batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+    )
+    final_attention_mask = torch.zeros(
+        batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+    )
+    if labels is not None:
+        final_labels = torch.full(
+            (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
+        )
+    # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+    # set the corresponding tensors into their correct target device.
+    target_device = inputs_embeds.device
+    batch_indices, non_image_indices, text_to_overwrite = (
+        batch_indices.to(target_device),
+        non_image_indices.to(target_device),
+        text_to_overwrite.to(target_device),
+    )
+    attention_mask = attention_mask.to(target_device)
+    # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+    # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
+    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+    if labels is not None:
+        final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
+    # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
+    image_to_overwrite = torch.full(
+        (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
+    )
+    image_to_overwrite[batch_indices, text_to_overwrite] = False
+    # Exclude the first `nb_image_pad` non-text slots of each row: they are padding, not image positions.
+    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+    if image_to_overwrite.sum() != image_features.shape[:-1].numel():
+        raise ValueError(
+            f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
+            f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
+        )
+    final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+    # Image positions are always attended to.
+    final_attention_mask |= image_to_overwrite
+    # Positions count attended tokens only; masked-out positions get the placeholder value 1.
+    position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+    # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
+    batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id)
+    indices_to_mask = new_token_positions[batch_indices, pad_indices]
+    final_embedding[batch_indices, indices_to_mask] = 0
+    if labels is None:
+        final_labels = None
+    return final_embedding, final_attention_mask, final_labels, position_ids
+def patched_llava_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    pixel_values: torch.FloatTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    vision_feature_layer: Optional[int] = None,
+    vision_feature_select_strategy: Optional[str] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    num_logits_to_keep: int = 0,
+):
+    from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+    vision_feature_layer = (
+        vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+    )
+    vision_feature_select_strategy = (
+        vision_feature_select_strategy
+        if vision_feature_select_strategy is not None
+        else self.config.vision_feature_select_strategy
+    )
+    if (input_ids is None) ^ (inputs_embeds is not None):
+        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+    if pixel_values is not None and inputs_embeds is not None:
+        raise ValueError(
+            "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
+        )
+    if inputs_embeds is None:
+        inputs_embeds = self.get_input_embeddings()(input_ids)
+    image_features = None
+    if pixel_values is not None:
+        image_features = self.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+    inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+        image_features, inputs_embeds, input_ids, attention_mask, labels
+    )
+    cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
+    outputs = self.language_model(
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+        num_logits_to_keep=num_logits_to_keep,
+    )
+    logits = outputs[0]
+    loss = None
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+    return LlavaCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        image_hidden_states=image_features if pixel_values is not None else None,
+    )
+def adapt_model(model, audio_block_name):
+    modules_dict= { k: m for k, m in model.named_modules()}
+    for model_layer, avatar_layer in model.double_stream_map.items():
+        module = modules_dict[f"{audio_block_name}.{avatar_layer}"]
+        target = modules_dict[f"double_blocks.{model_layer}"]
+        setattr(target, "audio_adapter", module )
+    delattr(model, audio_block_name)
+class DataPreprocess(object):
+    def __init__(self):
+        self.llava_size = (336, 336)
+        self.llava_transform = transforms.Compose(
+            [
+                transforms.Resize(self.llava_size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.ToTensor(),
+                transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
+            ]
+        )
+    def get_batch(self, image , size, pad = False):
+        image = np.asarray(image)
+        if pad:
+            llava_item_image = pad_image(image.copy(), self.llava_size)
+        else:
+            llava_item_image = image.copy()
+        uncond_llava_item_image = np.ones_like(llava_item_image) * 255
+        if pad:
+            cat_item_image = pad_image(image.copy(), size)
+        else:
+            cat_item_image = image.copy()
+        llava_item_tensor = self.llava_transform(Image.fromarray(llava_item_image.astype(np.uint8)))
+        uncond_llava_item_tensor = self.llava_transform(Image.fromarray(uncond_llava_item_image))
+        cat_item_tensor = torch.from_numpy(cat_item_image.copy()).permute((2, 0, 1)) / 255.0
+        # batch = {
+        #     "pixel_value_llava": llava_item_tensor.unsqueeze(0),
+        #     "uncond_pixel_value_llava": uncond_llava_item_tensor.unsqueeze(0),
+        #     'pixel_value_ref': cat_item_tensor.unsqueeze(0),
+        # }
+        return llava_item_tensor.unsqueeze(0), uncond_llava_item_tensor.unsqueeze(0), cat_item_tensor.unsqueeze(0)
+class Inference(object):
+    def __init__(
+        self,
+        i2v,
+        custom,
+        avatar,
+        enable_cfg,
+        vae,
+        vae_kwargs,
+        text_encoder,
+        model,
+        text_encoder_2=None,
+        pipeline=None,
+        feature_extractor=None,
+        wav2vec=None,
+        align_instance=None,
+        device=None,
+    ):
+        """Keep references to every component needed for inference.
+
+        Args:
+            i2v: Flag stored as ``self.i2v``.
+            custom: Flag stored as ``self.custom``.
+            avatar: Flag stored as ``self.avatar``.
+            enable_cfg: Flag stored as ``self.enable_cfg``.
+            vae: VAE object.
+            vae_kwargs: Extra VAE information (``from_pretrained`` stores the
+                ``s_ratio`` / ``t_ratio`` values here).
+            text_encoder: Primary text encoder.
+            model: Diffusion transformer.
+            text_encoder_2: Optional secondary text encoder.
+            pipeline: Optional pre-built pipeline.
+            feature_extractor: Optional audio feature extractor.
+            wav2vec: Optional audio encoder.
+            align_instance: Optional face-alignment helper.
+            device: Device to run on; ``None`` falls back to ``"cuda"``.
+        """
+        self.i2v = i2v
+        self.custom = custom
+        self.avatar = avatar
+        self.enable_cfg = enable_cfg
+        self.vae = vae
+        self.vae_kwargs = vae_kwargs
+        self.text_encoder = text_encoder
+        self.text_encoder_2 = text_encoder_2
+        self.model = model
+        self.pipeline = pipeline
+        self.feature_extractor = feature_extractor
+        self.wav2vec = wav2vec
+        self.align_instance = align_instance
+        # The argument used to be ignored in favour of a hard-coded "cuda".
+        # "cuda" remains the fallback so callers that pass nothing are unaffected.
+        self.device = "cuda" if device is None else device
+    @classmethod
+    def from_pretrained(cls, model_filepath, text_encoder_filepath, dtype = torch.bfloat16, VAE_dtype = torch.float16, mixed_precision_transformer =torch.bfloat16 , **kwargs):
+        """Build an instance from checkpoint files.
+
+        The model variant is chosen from substrings of ``model_filepath[0]``:
+        ``"i2v"``, ``"custom"`` (optionally with ``"audio"`` or ``"edit"``),
+        ``"avatar"``, and otherwise the cfg-distilled configuration.
+
+        Side effects: replaces ``forward`` and
+        ``_merge_input_ids_with_image_features`` on transformers'
+        ``LlavaForConditionalGeneration`` and disables autograd globally via
+        ``torch.set_grad_enabled(False)``.
+
+        Args:
+            model_filepath: Sequence of checkpoint paths. The first entry picks
+                the variant; the whole value is passed to
+                ``offload.load_model_data``.
+            text_encoder_filepath: Forwarded to ``TextEncoder`` as
+                ``text_encoder_path``.
+            dtype: Accepted but not read here; the transformer is always built
+                with the ``"bf16"`` entry of ``PRECISION_TO_TYPE``.
+            VAE_dtype: ``torch.float32`` selects an fp32 VAE, any other value
+                selects bf16.
+            mixed_precision_transformer: Stored as ``model.mixed_precision``;
+                when truthy the model's locked layers are set to float32.
+            **kwargs: ``pinToMemory`` and ``partialPinning`` are consumed for
+                weight loading; everything else is merged into the model's
+                factory kwargs.
+
+        Returns:
+            A ``cls`` instance holding the loaded components.
+        """
+        device = "cuda"
+        import transformers
+        # Force the legacy LLaVA behaviour so transformers releases newer than 4.47 can be used
+        transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.forward = patched_llava_forward
+        transformers.models.llava.modeling_llava.LlavaForConditionalGeneration._merge_input_ids_with_image_features = _merge_input_ids_with_image_features
+        torch.set_grad_enabled(False)
+        text_len = 512
+        latent_channels = 16
+        precision = "bf16"
+        vae_precision = "fp32" if VAE_dtype == torch.float32 else "bf16"
+        embedded_cfg_scale = 6
+        # ============================= Variant selection =========================
+        filepath = model_filepath[0]
+        i2v_condition_type = None
+        i2v_mode = "i2v" in filepath
+        custom = False
+        custom_audio = False
+        avatar = False
+        if i2v_mode:
+            model_id = "HYVideo-T/2"
+            i2v_condition_type = "token_replace"
+        elif "custom" in filepath:
+            if "audio" in filepath:
+                model_id = "HYVideo-T/2-custom-audio"
+                custom_audio = True
+            elif "edit" in filepath:
+                model_id = "HYVideo-T/2-custom-edit"
+            else:
+                model_id = "HYVideo-T/2-custom"
+            custom = True
+        elif "avatar" in filepath :
+            model_id = "HYVideo-T/2-avatar"
+            text_len = 256
+            avatar = True
+        else:
+            model_id = "HYVideo-T/2-cfgdistill"
+        # "latent_concat" is never selected above; the branch is kept so the
+        # channel layout for that conditioning type stays documented.
+        if i2v_mode and i2v_condition_type == "latent_concat":
+            in_channels = latent_channels * 2 + 1
+            image_embed_interleave = 2
+        elif i2v_mode and i2v_condition_type == "token_replace":
+            in_channels = latent_channels
+            image_embed_interleave = 4
+        else:
+            in_channels = latent_channels
+            image_embed_interleave = 1
+        out_channels = latent_channels
+        # ============================= Transformer ===============================
+        pinToMemory = kwargs.pop("pinToMemory", False)
+        partialPinning = kwargs.pop("partialPinning", False)
+        # The model is created on the "meta" device; weights are attached afterwards by mmgp
+        factor_kwargs = kwargs | {"device": "meta", "dtype": PRECISION_TO_TYPE[precision]}
+        if embedded_cfg_scale and i2v_mode:
+            factor_kwargs["guidance_embed"] = True
+        model = load_model(
+            model = model_id,
+            i2v_condition_type = i2v_condition_type,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            factor_kwargs=factor_kwargs,
+        )
+        from mmgp import offload
+        offload.load_model_data(model, model_filepath, pinToMemory = pinToMemory, partialPinning = partialPinning)
+        # Developer hint - export the loaded weights (optionally quantized):
+        # offload.save_model(model, "hunyuan_video_avatar_edit_720_bf16.safetensors")
+        # offload.save_model(model, "hunyuan_video_avatar_edit_720_quanto_bf16_int8.safetensors", do_quantize= True)
+        model.mixed_precision = mixed_precision_transformer
+        if model.mixed_precision :
+            model._lock_dtype = torch.float32
+            model.lock_layers_dtypes(torch.float32)
+        model.eval()
+        # ============================= Build extra models ========================
+        # VAE (custom and avatar variants share one checkpoint)
+        if custom or avatar:
+            vae_configpath = "ckpts/hunyuan_video_custom_VAE_config.json"
+            vae_filepath = "ckpts/hunyuan_video_custom_VAE_fp32.safetensors"
+        else:
+            vae_configpath = "ckpts/hunyuan_video_VAE_config.json"
+            vae_filepath = "ckpts/hunyuan_video_VAE_fp32.safetensors"
+        vae, _, s_ratio, t_ratio = load_vae( "884-16c-hy", vae_path= vae_filepath, vae_config_path= vae_configpath, vae_precision= vae_precision, device= "cpu", )
+        # An earlier assignment that chose float16 for the avatar variant was
+        # overwritten by this line on every call, so only this one is kept.
+        vae._model_dtype =  torch.float32 if VAE_dtype == torch.float32 else  torch.bfloat16
+        vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
+        enable_cfg = False
+        # Text encoder
+        if i2v_mode:
+            text_encoder_type = "llm-i2v"
+            tokenizer_type = "llm-i2v"
+            prompt_template_name = "dit-llm-encode-i2v"
+            prompt_template_video_name = "dit-llm-encode-video-i2v"
+        elif custom or avatar :
+            text_encoder_type = "llm-i2v"
+            tokenizer_type = "llm-i2v"
+            prompt_template_name = "dit-llm-encode"
+            prompt_template_video_name = "dit-llm-encode-video"
+            enable_cfg = True
+        else:
+            text_encoder_type = "llm"
+            tokenizer_type = "llm"
+            prompt_template_name = "dit-llm-encode"
+            prompt_template_video_name = "dit-llm-encode-video"
+        # Both template names are always set above, so the video template decides crop_start
+        prompt_template = PROMPT_TEMPLATE[prompt_template_name]
+        prompt_template_video = PROMPT_TEMPLATE[prompt_template_video_name]
+        crop_start = prompt_template_video.get("crop_start", 0)
+        max_length = text_len + crop_start
+        text_encoder = TextEncoder(
+            text_encoder_type=text_encoder_type,
+            max_length=max_length,
+            text_encoder_precision="fp16",
+            tokenizer_type=tokenizer_type,
+            i2v_mode=i2v_mode,
+            prompt_template=prompt_template,
+            prompt_template_video=prompt_template_video,
+            hidden_state_skip_layer=2,
+            apply_final_norm=False,
+            reproduce=True,
+            device="cpu",
+            image_embed_interleave=image_embed_interleave,
+            text_encoder_path = text_encoder_filepath
+        )
+        text_encoder_2 = TextEncoder(
+            text_encoder_type="clipL",
+            max_length=77,
+            text_encoder_precision="fp16",
+            tokenizer_type="clipL",
+            reproduce=True,
+            device="cpu",
+        )
+        # ============================= Audio / face helpers ======================
+        feature_extractor = None
+        wav2vec = None
+        align_instance = None
+        if avatar or custom_audio:
+            feature_extractor = AutoFeatureExtractor.from_pretrained("ckpts/whisper-tiny/")
+            wav2vec = WhisperModel.from_pretrained("ckpts/whisper-tiny/").to(device="cpu", dtype=torch.float32)
+            wav2vec._model_dtype = torch.float32
+            wav2vec.requires_grad_(False)
+        if avatar:
+            align_instance = AlignImage("cuda", det_path="ckpts/det_align/detface.pt")
+            # Park the face detector on the CPU; generate() moves it to CUDA only while it is used
+            align_instance.facedet.model.to("cpu")
+            adapt_model(model, "audio_adapter_blocks")
+        elif custom_audio:
+            adapt_model(model, "audio_models")
+        return cls(
+            i2v=i2v_mode,
+            custom=custom,
+            avatar=avatar,
+            enable_cfg = enable_cfg,
+            vae=vae,
+            vae_kwargs=vae_kwargs,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            model=model,
+            feature_extractor=feature_extractor,
+            wav2vec=wav2vec,
+            align_instance=align_instance,
+            device=device,
+        )
+class HunyuanVideoSampler(Inference):
+    def __init__(
+        self,
+        i2v,
+        custom,
+        avatar,
+        enable_cfg,
+        vae,
+        vae_kwargs,
+        text_encoder,
+        model,
+        text_encoder_2=None,
+        pipeline=None,
+        feature_extractor=None,
+        wav2vec=None,
+        align_instance=None,
+        device=0,
+    ):
+        """Set up the sampler: store the components, then build the pipeline.
+
+        All arguments are handed to ``Inference.__init__`` unchanged. The
+        ``pipeline`` argument is stored by the base class and then replaced by
+        the pipeline created here with ``load_diffusion_pipeline``. The default
+        negative prompt depends on whether image-to-video mode is active.
+        """
+        base_components = dict(
+            text_encoder_2=text_encoder_2,
+            pipeline=pipeline,
+            feature_extractor=feature_extractor,
+            wav2vec=wav2vec,
+            align_instance=align_instance,
+            device=device,
+        )
+        super().__init__(i2v, custom, avatar, enable_cfg, vae, vae_kwargs, text_encoder, model, **base_components)
+        self.i2v_mode = i2v
+        self.enable_cfg = enable_cfg
+        # Always build a fresh pipeline from the stored components
+        self.pipeline = self.load_diffusion_pipeline(
+            avatar = self.avatar,
+            vae=self.vae,
+            text_encoder=self.text_encoder,
+            text_encoder_2=self.text_encoder_2,
+            model=self.model,
+            device=self.device,
+        )
+        self.default_negative_prompt = NEGATIVE_PROMPT_I2V if self.i2v_mode else NEGATIVE_PROMPT
+    @property
+    def _interrupt(self):
+        """Interrupt flag; it lives on the wrapped pipeline, not on this object."""
+        pipe = self.pipeline
+        return pipe._interrupt
+    @_interrupt.setter
+    def _interrupt(self, flag):
+        """Write the interrupt flag through to the wrapped pipeline."""
+        pipe = self.pipeline
+        pipe._interrupt = flag
+    def load_diffusion_pipeline(
+        self,
+        avatar,
+        vae,
+        text_encoder,
+        text_encoder_2,
+        model,
+        scheduler=None,
+        device=None,
+        progress_bar_config=None,
+        #data_type="video",
+    ):
+        """Assemble the diffusion pipeline around the given components.
+
+        Args:
+            avatar: When truthy a ``HunyuanVideoAudioPipeline`` is built,
+                otherwise a ``HunyuanVideoPipeline``.
+            vae: VAE passed to the pipeline.
+            text_encoder: Primary text encoder passed to the pipeline.
+            text_encoder_2: Secondary text encoder passed to the pipeline.
+            model: Passed to the pipeline as ``transformer``.
+            scheduler: Scheduler to use; when ``None`` a
+                ``FlowMatchDiscreteScheduler`` (shift 6.0, reverse, euler) is
+                created.
+            device: Accepted but not read by this method.
+            progress_bar_config: Passed to the pipeline unchanged.
+
+        Returns:
+            The constructed pipeline object.
+        """
+        scheduler = scheduler if scheduler is not None else FlowMatchDiscreteScheduler(
+            shift=6.0,
+            reverse=True,
+            solver="euler",
+        )
+        # Both pipeline classes take the same constructor arguments
+        pipeline_cls = HunyuanVideoAudioPipeline if avatar else HunyuanVideoPipeline
+        return pipeline_cls(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            transformer=model,
+            scheduler=scheduler,
+            progress_bar_config=progress_bar_config,
+        )
+    def get_rotary_pos_embed_new(self, video_length, height, width, concat_dict=None, enable_riflex = False):
+        """Compute rotary position tables via ``get_nd_rotary_pos_embed_new``.
+
+        The latent grid is ``[(video_length - 1) // 4 + 1, height // 8,
+        width // 8]``; each entry is divided by the model's patch size to get
+        the per-axis RoPE sizes.
+
+        Args:
+            video_length: Number of video frames.
+            height: Frame height in pixels.
+            width: Frame width in pixels.
+            concat_dict: Options forwarded to ``get_nd_rotary_pos_embed_new``;
+                ``None`` (the default) is replaced by a new empty dict.
+            enable_riflex: Forwarded to ``get_nd_rotary_pos_embed_new``.
+
+        Returns:
+            ``(freqs_cos, freqs_sin)`` as returned by the helper.
+
+        Raises:
+            AssertionError: If a latent dimension is not divisible by the patch
+                size, or the RoPE dimensions do not sum to the head dimension.
+            TypeError: If ``self.model.patch_size`` is not an int, list or tuple.
+        """
+        # A fresh dict per call: the previous `{}` default was a single object
+        # shared by every call that omitted the argument.
+        if concat_dict is None:
+            concat_dict = {}
+        target_ndim = 3
+        ndim = 5 - 2
+        latents_size = [(video_length-1)//4+1 , height//8, width//8]
+        patch_size = self.model.patch_size
+        if isinstance(patch_size, int):
+            assert all(s % patch_size == 0 for s in latents_size), \
+                f"Latent size(last {ndim} dimensions) should be divisible by patch size({patch_size}), " \
+                f"but got {latents_size}."
+            rope_sizes = [s // patch_size for s in latents_size]
+        elif isinstance(patch_size, (list, tuple)):
+            # Tuples are accepted as well; they previously fell through both
+            # branches and failed later with an unrelated UnboundLocalError.
+            assert all(s % patch_size[idx] == 0 for idx, s in enumerate(latents_size)), \
+                f"Latent size(last {ndim} dimensions) should be divisible by patch size({patch_size}), " \
+                f"but got {latents_size}."
+            rope_sizes = [s // patch_size[idx] for idx, s in enumerate(latents_size)]
+        else:
+            raise TypeError(f"patch_size must be an int, a list or a tuple, got {type(patch_size)}")
+        if len(rope_sizes) != target_ndim:
+            rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes  # time axis
+        head_dim = self.model.hidden_size // self.model.heads_num
+        rope_dim_list = self.model.rope_dim_list
+        if rope_dim_list is None:
+            # No explicit split configured: divide the head dimension evenly across the axes
+            rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
+        assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"
+        freqs_cos, freqs_sin = get_nd_rotary_pos_embed_new(rope_dim_list,
+                                                    rope_sizes,
+                                                    theta=256,
+                                                    use_real=True,
+                                                    theta_rescale_factor=1,
+                                                    concat_dict=concat_dict,
+                                                    L_test = (video_length - 1) // 4 + 1,
+                                                    enable_riflex = enable_riflex
+                                                    )
+        return freqs_cos, freqs_sin
+    def get_rotary_pos_embed(self, video_length, height, width, enable_riflex = False):
+        """Compute rotary position tables via ``get_nd_rotary_pos_embed``.
+
+        The latent grid is derived from the frame count and the pixel size
+        (spatial factor 8; temporal factor chosen from the VAE name, fixed to
+        ``"884-16c-hy"`` here) and then divided by the model's patch size.
+
+        Returns:
+            ``(freqs_cos, freqs_sin)`` as returned by the helper.
+        """
+        target_ndim = 3
+        ndim = 5 - 2
+        # 884
+        vae = "884-16c-hy"
+        if "884" in vae:
+            latent_frames = (video_length - 1) // 4 + 1
+        elif "888" in vae:
+            latent_frames = (video_length - 1) // 8 + 1
+        else:
+            latent_frames = video_length
+        latents_size = [latent_frames, height // 8, width // 8]
+        patch = self.model.patch_size
+        divisibility_msg = (
+            f"Latent size(last {ndim} dimensions) should be divisible by patch size({patch}), "
+            f"but got {latents_size}."
+        )
+        if isinstance(patch, int):
+            assert all(dim % patch == 0 for dim in latents_size), divisibility_msg
+            rope_sizes = [dim // patch for dim in latents_size]
+        elif isinstance(patch, list):
+            assert all(dim % patch[axis] == 0 for axis, dim in enumerate(latents_size)), divisibility_msg
+            rope_sizes = [dim // patch[axis] for axis, dim in enumerate(latents_size)]
+        missing_axes = target_ndim - len(rope_sizes)
+        if missing_axes != 0:
+            rope_sizes = [1] * missing_axes + rope_sizes  # time axis
+        head_dim = self.model.hidden_size // self.model.heads_num
+        rope_dim_list = self.model.rope_dim_list
+        if rope_dim_list is None:
+            # Even split of the head dimension across the axes
+            rope_dim_list = [head_dim // target_ndim] * target_ndim
+        assert (
+            sum(rope_dim_list) == head_dim
+        ), "sum(rope_dim_list) should equal to head_dim of attention layer"
+        return get_nd_rotary_pos_embed(
+            rope_dim_list,
+            rope_sizes,
+            theta=256,
+            use_real=True,
+            theta_rescale_factor=1,
+            L_test = (video_length - 1) // 4 + 1,
+            enable_riflex = enable_riflex
+        )
+    def generate(
+        self,
+        input_prompt,
+        input_ref_images = None,
+        audio_guide = None,
+        input_frames = None,
+        input_masks = None,
+        input_video = None,
+        fps = 24,
+        height=192,
+        width=336,
+        frame_num=129,
+        seed=None,
+        n_prompt=None,
+        sampling_steps=50,
+        guide_scale=1.0,
+        shift=5.0,
+        embedded_guidance_scale=6.0,
+        batch_size=1,
+        num_videos_per_prompt=1,
+        i2v_resolution="720p",
+        image_start=None,
+        enable_RIFLEx = False,
+        i2v_condition_type: str = "token_replace",
+        i2v_stability=True,
+        VAE_tile_size = None,
+        joint_pass = False,
+        cfg_star_switch = False,
+        fit_into_canvas = True,
+        conditioning_latents_size = 0,
+        **kwargs,
+    ):
+        """Run one generation and return the sampled video.
+
+        The method validates the arguments, prepares the conditioning for the
+        active mode (image-to-video, reference image, avatar, background
+        video/frames, audio), builds the rotary position tables and calls
+        ``self.pipeline``.
+
+        Args:
+            input_prompt: Prompt text; must be a ``str``.
+            input_ref_images: Optional sequence of reference images; only the
+                first element is used.
+            audio_guide: Optional audio source handed to ``get_audio_feature``.
+            input_frames: Optional frames tensor used as background video.
+            input_masks: Masks matching ``input_frames``; read whenever
+                ``input_frames`` is given.
+            input_video: Optional video tensor placed before ``input_frames``
+                along the frame axis.
+            fps: Frame rate; used for the audio duration and passed to the
+                pipeline.
+            height, width: Requested size; aligned with ``align_to(..., 16)``
+                and possibly replaced by sizes derived from the inputs.
+            frame_num: Number of frames; ``frame_num - 1`` must be a multiple
+                of 4.
+            seed: ``None``, an int, a list/tuple of ints or a tensor.
+            n_prompt: Negative prompt; ``None``/empty selects the default one,
+                and it is blanked when the guidance scale is 1.0.
+            sampling_steps: Passed as ``num_inference_steps``.
+            guide_scale: Guidance scale; forced to 1.0 when ``self.enable_cfg``
+                is false.
+            shift: Shift of the ``FlowMatchDiscreteScheduler`` created here.
+            embedded_guidance_scale: Passed to the pipeline unchanged.
+            batch_size, num_videos_per_prompt: Determine how many seeds are
+                generated.
+            i2v_resolution: Must be "360p", "540p" or "720p" in image-to-video
+                mode; the bucket size it selects is not read afterwards in
+                this method.
+            image_start: Start image for image-to-video mode.
+            enable_RIFLEx: Forwarded to the rotary embedding helpers (not in
+                the avatar branch).
+            i2v_condition_type, i2v_stability, joint_pass: Passed to the
+                pipeline unchanged.
+            VAE_tile_size: Optional dict of tiling settings applied to
+                ``self.vae`` before tiling is enabled.
+            cfg_star_switch: Passed to the pipeline as ``cfg_star_rescale``.
+            fit_into_canvas: Forwarded to ``calculate_new_dimensions``.
+            conditioning_latents_size: Not read in this method.
+            **kwargs: ``callback`` and ``callback_steps`` are extracted and
+                passed to the pipeline.
+
+        Returns:
+            The first element of the pipeline result with its leading axis
+            squeezed, or ``None`` when that element is ``None``.
+
+        Raises:
+            ValueError: Invalid seed, non-positive sizes, a frame count that
+                is not 4k+1, or an unknown ``i2v_resolution``.
+            TypeError: ``input_prompt`` or the negative prompt is not a string.
+        """
+        # Optional VAE tiling configuration supplied by the caller
+        if VAE_tile_size != None:
+            self.vae.tile_sample_min_tsize = VAE_tile_size["tile_sample_min_tsize"]
+            self.vae.tile_latent_min_tsize = VAE_tile_size["tile_latent_min_tsize"]
+            self.vae.tile_sample_min_size = VAE_tile_size["tile_sample_min_size"]
+            self.vae.tile_latent_min_size = VAE_tile_size["tile_latent_min_size"]
+            self.vae.tile_overlap_factor = VAE_tile_size["tile_overlap_factor"]
+            self.vae.enable_tiling()
+        i2v_mode= self.i2v_mode
+        # Without classifier-free guidance the scale is pinned to 1.0
+        if not self.enable_cfg:
+            guide_scale=1.0
+        # ========================================================================
+        # Arguments: seed
+        # ========================================================================
+        if isinstance(seed, torch.Tensor):
+            seed = seed.tolist()
+        if seed is None:
+            seeds = [
+                random.randint(0, 1_000_000)
+                for _ in range(batch_size * num_videos_per_prompt)
+            ]
+        elif isinstance(seed, int):
+            seeds = [
+                seed + i
+                for _ in range(batch_size)
+                for i in range(num_videos_per_prompt)
+            ]
+        elif isinstance(seed, (list, tuple)):
+            if len(seed) == batch_size:
+                seeds = [
+                    int(seed[i]) + j
+                    for i in range(batch_size)
+                    for j in range(num_videos_per_prompt)
+                ]
+            elif len(seed) == batch_size * num_videos_per_prompt:
+                seeds = [int(s) for s in seed]
+            else:
+                raise ValueError(
+                    f"Length of seed must be equal to number of prompt(batch_size) or "
+                    f"batch_size * num_videos_per_prompt ({batch_size} * {num_videos_per_prompt}), got {seed}."
+                )
+        else:
+            raise ValueError(
+                f"Seed must be an integer, a list of integers, or None, got {seed}."
+            )
+        from wan.utils.utils import seed_everything
+        # NOTE(review): this receives the caller's raw `seed` (possibly None or a list),
+        # not the `seeds` derived above - confirm seed_everything accepts those values.
+        seed_everything(seed)
+        # One generator per derived seed, always created on "cuda"
+        generator = [torch.Generator("cuda").manual_seed(seed) for seed in seeds]
+        # generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds]
+        # ========================================================================
+        # Arguments: target_width, target_height, target_frame_num
+        # ========================================================================
+        if width <= 0 or height <= 0 or frame_num <= 0:
+            raise ValueError(
+                f"`height` and `width` and `frame_num` must be positive integers, got height={height}, width={width}, frame_num={frame_num}"
+            )
+        if (frame_num - 1) % 4 != 0:
+            raise ValueError(
+                f"`frame_num-1` must be a multiple of 4, got {frame_num}"
+            )
+        target_height = align_to(height, 16)
+        target_width = align_to(width, 16)
+        target_frame_num = frame_num
+        audio_strength = 1
+        if input_ref_images  != None:
+            # NOTE(review): ip_cfg_scale and denoise_strength set here are reassigned to 0
+            # unconditionally in the "Reference condition" section below.
+            # ip_cfg_scale = 3.0
+            ip_cfg_scale = 0
+            denoise_strength = 1
+            # guide_scale=7.5
+            # shift=13
+            name = "person"
+            # Only the first reference image is used from here on
+            input_ref_images = input_ref_images[0]
+        # ========================================================================
+        # Arguments: prompt, new_prompt, negative_prompt
+        # ========================================================================
+        if not isinstance(input_prompt, str):
+            raise TypeError(f"`prompt` must be a string, but got {type(input_prompt)}")
+        input_prompt = [input_prompt.strip()]
+        # negative prompt
+        if n_prompt is None or n_prompt == "":
+            n_prompt = self.default_negative_prompt
+        # With a guidance scale of exactly 1.0 the negative prompt is blanked
+        if guide_scale == 1.0:
+            n_prompt = ""
+        if not isinstance(n_prompt, str):
+            raise TypeError(
+                f"`negative_prompt` must be a string, but got {type(n_prompt)}"
+            )
+        n_prompt = [n_prompt.strip()]
+        # ========================================================================
+        # Scheduler
+        # ========================================================================
+        # A new scheduler is installed on every call so the requested `shift` takes effect
+        scheduler = FlowMatchDiscreteScheduler(
+            shift=shift,
+            reverse=True,
+            solver="euler"
+        )
+        self.pipeline.scheduler = scheduler
+        # ---------------------------------
+        # Reference condition
+        # ---------------------------------
+        img_latents = None
+        semantic_images = None
+        denoise_strength = 0
+        ip_cfg_scale = 0
+        if i2v_mode:
+            # bucket_hw_base_size is only read by the commented-out bucketing code below
+            if i2v_resolution == "720p":
+                bucket_hw_base_size = 960
+            elif i2v_resolution == "540p":
+                bucket_hw_base_size = 720
+            elif i2v_resolution == "360p":
+                bucket_hw_base_size = 480
+            else:
+                raise ValueError(f"i2v_resolution: {i2v_resolution} must be in [360p, 540p, 720p]")
+            # semantic_images = [Image.open(i2v_image_path).convert('RGB')]
+            semantic_images = [image_start.convert('RGB')] #
+            origin_size = semantic_images[0].size
+            # NOTE(review): `.size` is unpacked as (h, w) here but as (w, h) in the avatar
+            # branch further down - confirm which order is intended.
+            h, w = origin_size
+            h, w = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
+            closest_size = (w, h)
+            # crop_size_list = generate_crop_size_list(bucket_hw_base_size, 32)
+            # aspect_ratios = np.array([round(float(h)/float(w), 5) for h, w in crop_size_list])
+            # closest_size, closest_ratio = get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
+            ref_image_transform = transforms.Compose([
+                transforms.Resize(closest_size),
+                transforms.CenterCrop(closest_size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5])
+            ])
+            semantic_image_pixel_values = [ref_image_transform(semantic_image) for semantic_image in semantic_images]
+            semantic_image_pixel_values = torch.cat(semantic_image_pixel_values).unsqueeze(0).unsqueeze(2).to(self.device)
+            # Encode the start image and scale the latents by the VAE scaling factor
+            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
+                img_latents = self.pipeline.vae.encode(semantic_image_pixel_values).latent_dist.mode() # B, C, F, H, W
+                img_latents.mul_(self.pipeline.vae.config.scaling_factor)
+            target_height, target_width = closest_size
+        # ========================================================================
+        # Build Rope freqs
+        # ========================================================================
+        if input_ref_images == None:
+            freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_RIFLEx)
+        else:
+            if self.avatar:
+                # Resize the reference image to the computed target size when it differs
+                w, h = input_ref_images.size
+                target_height, target_width = calculate_new_dimensions(target_height, target_width, h, w, fit_into_canvas)
+                if target_width != w or target_height != h:
+                    input_ref_images = input_ref_images.resize((target_width,target_height), resample=Image.Resampling.LANCZOS)
+                concat_dict = {'mode': 'timecat', 'bias': -1}
+                # The avatar branch always uses a length of 129 frames and does not forward enable_RIFLEx
+                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(129, target_height, target_width, concat_dict)
+            else:
+                # The target size follows the supplied frames/video when present
+                if input_frames != None:
+                    target_height, target_width = input_frames.shape[-3:-1]
+                elif input_video != None:
+                    target_height, target_width = input_video.shape[-2:]
+                concat_dict = {'mode': 'timecat-w', 'bias': -1}
+                freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict, enable_RIFLEx)
+        n_tokens = freqs_cos.shape[0]
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+        # ========================================================================
+        # Pipeline inference
+        # ========================================================================
+        pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref =  None, None, None
+        if input_ref_images  == None:
+            name = None
+        else:
+            pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref =  DataPreprocess().get_batch(input_ref_images, (target_width, target_height), pad = self.custom)
+        ref_latents, uncond_audio_prompts, audio_prompts, face_masks, motion_exp, motion_pose = None, None, None, None, None, None
+        bg_latents = None
+        # Background conditioning: input_video comes first, input_frames are appended along dim 2
+        if input_video != None:
+            pixel_value_bg = input_video.unsqueeze(0)
+            pixel_value_mask =  torch.zeros_like(input_video).unsqueeze(0)
+        if input_frames != None:
+            pixel_value_video_bg = input_frames.permute(-1,0,1,2).unsqueeze(0).float()
+            pixel_value_video_mask = input_masks.unsqueeze(-1).repeat(1,1,1,3).permute(-1,0,1,2).unsqueeze(0).float()
+            # Frames are rescaled with /127.5 - 1 (0..255 maps to -1..1)
+            pixel_value_video_bg = pixel_value_video_bg.div_(127.5).add_(-1.)
+            if input_video != None:
+                pixel_value_bg = torch.cat([pixel_value_bg, pixel_value_video_bg], dim=2)
+                pixel_value_mask = torch.cat([ pixel_value_mask, pixel_value_video_mask], dim=2)
+            else:
+                pixel_value_bg = pixel_value_video_bg
+                pixel_value_mask = pixel_value_video_mask
+            # Drop the temporaries
+            pixel_value_video_mask, pixel_value_video_bg  = None, None
+        if input_video != None or input_frames != None:
+            # Pad up to frame_num: background with -1, mask with 255
+            if pixel_value_bg.shape[2] < frame_num:
+                padding_shape = list(pixel_value_bg.shape[0:2]) + [frame_num-pixel_value_bg.shape[2]] +  list(pixel_value_bg.shape[3:])
+                pixel_value_bg = torch.cat([pixel_value_bg, torch.full(padding_shape, -1, dtype=pixel_value_bg.dtype, device= pixel_value_bg.device ) ], dim=2)
+                pixel_value_mask = torch.cat([ pixel_value_mask, torch.full(padding_shape, 255, dtype=pixel_value_mask.dtype, device= pixel_value_mask.device ) ], dim=2)
+            bg_latents = self.vae.encode(pixel_value_bg).latent_dist.sample()
+            pixel_value_mask = pixel_value_mask.div_(127.5).add_(-1.)
+            mask_latents = self.vae.encode(pixel_value_mask).latent_dist.sample()
+            # Background and mask latents are stacked along dim 1, then scaled
+            bg_latents = torch.cat([bg_latents, mask_latents], dim=1)
+            bg_latents.mul_(self.vae.config.scaling_factor)
+        if self.avatar:
+            # NOTE(review): n_prompt is already a one-element list at this point, so this
+            # condition looks like it can never be true - confirm the intended fallback.
+            if n_prompt == None or len(n_prompt) == 0:
+                n_prompt = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, Lens changes"
+            # In avatar mode the unconditional LLaVA input is a copy of the conditional one
+            uncond_pixel_value_llava = pixel_value_llava.clone()
+            pixel_value_ref = pixel_value_ref.unsqueeze(0)
+            # The face detector is moved to CUDA only for the mask computation
+            self.align_instance.facedet.model.to("cuda")
+            face_masks = get_facemask(pixel_value_ref.to("cuda")*255, self.align_instance, area=3.0)
+            # iii = (face_masks.squeeze(0).squeeze(0).permute(1,2,0).repeat(1,1,3)*255).cpu().numpy().astype(np.uint8)
+            # image = Image.fromarray(iii)
+            # image.save("mask.png")
+            # jjj = (pixel_value_ref.squeeze(0).squeeze(0).permute(1,2,0)*255).cpu().numpy().astype(np.uint8)
+            self.align_instance.facedet.model.to("cpu")
+            # pixel_value_ref = pixel_value_ref.clone().repeat(1,129,1,1,1)
+            # Repeat the reference 9 times along dim 1 and rescale with *2 - 1
+            pixel_value_ref = pixel_value_ref.repeat(1,1+4*2,1,1,1)
+            pixel_value_ref = pixel_value_ref * 2 - 1
+            pixel_value_ref_for_vae = rearrange(pixel_value_ref, "b f c h w -> b c f h w")
+            vae_dtype = self.vae.dtype
+            with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32):
+                ref_latents = self.vae.encode(pixel_value_ref_for_vae).latent_dist.sample()
+                # Keep the first and last latent frame and repeat the second one 31 times in between
+                ref_latents = torch.cat( [ref_latents[:,:, :1], ref_latents[:,:, 1:2].repeat(1,1,31,1,1),  ref_latents[:,:, -1:]], dim=2)
+                pixel_value_ref, pixel_value_ref_for_vae = None, None
+                if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
+                    ref_latents.sub_(self.vae.config.shift_factor).mul_(self.vae.config.scaling_factor)
+                else:
+                    ref_latents.mul_(self.vae.config.scaling_factor)
+                # out_latents= ref_latents / self.vae.config.scaling_factor
+                # image = self.vae.decode(out_latents, return_dict=False, generator=generator)[0]
+                # image = image.clamp(-1, 1)
+                # from wan.utils.utils import cache_video
+                # cache_video( tensor=image, save_file="decode.mp4", fps=25, nrow=1, normalize=True, value_range=(-1, 1))
+            # Fixed motion conditioning values
+            motion_pose = np.array([25] * 4)
+            motion_exp = np.array([30] * 4)
+            motion_pose = torch.from_numpy(motion_pose).unsqueeze(0)
+            motion_exp = torch.from_numpy(motion_exp).unsqueeze(0)
+            # Resize the face mask to the spatial size of the reference latents
+            face_masks = torch.nn.functional.interpolate(face_masks.float().squeeze(2),
+                                                    (ref_latents.shape[-2],
+                                                    ref_latents.shape[-1]),
+                                                    mode="bilinear").unsqueeze(2).to(dtype=ref_latents.dtype)
+        if audio_guide != None:
+            audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_guide, duration = frame_num/fps )
+            audio_prompts = audio_input[0]
+            # NOTE(review): weight_dtype is not read again in this method
+            weight_dtype = audio_prompts.dtype
+            if self.custom:
+                audio_len = min(audio_len, frame_num)
+                # NOTE(review): audio_input is not read again after this slice
+                audio_input = audio_input[:, :audio_len]
+            audio_prompts = encode_audio(self.wav2vec, audio_prompts.to(dtype=self.wav2vec.dtype), fps, num_frames=audio_len)
+            audio_prompts = audio_prompts.to(self.model.dtype)
+            segment_size = 129 if self.avatar else frame_num
+            # Zero-pad up to one segment, or append 5 zero entries when the audio is longer
+            if audio_prompts.shape[1] <= segment_size:
+                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1,segment_size-audio_prompts.shape[1], 1, 1, 1)], dim=1)
+            else:
+                audio_prompts = torch.cat([audio_prompts, torch.zeros_like(audio_prompts[:, :1]).repeat(1, 5, 1, 1, 1)], dim=1)
+            uncond_audio_prompts = torch.zeros_like(audio_prompts[:,:129])
+        samples = self.pipeline(
+            prompt=input_prompt,
+            height=target_height,
+            width=target_width,
+            video_length=target_frame_num,
+            num_inference_steps=sampling_steps,
+            guidance_scale=guide_scale,
+            negative_prompt=n_prompt,
+            num_videos_per_prompt=num_videos_per_prompt,
+            generator=generator,
+            output_type="pil",
+            name = name,
+            pixel_value_ref = pixel_value_ref,
+            ref_latents=ref_latents,                            # [1, 16, 1, h//8, w//8]
+            pixel_value_llava=pixel_value_llava,                # [1, 3, 336, 336]
+            uncond_pixel_value_llava=uncond_pixel_value_llava,
+            face_masks=face_masks,                              # [b f h w]
+            audio_prompts=audio_prompts,
+            uncond_audio_prompts=uncond_audio_prompts,
+            motion_exp=motion_exp,
+            motion_pose=motion_pose,
+            fps= torch.from_numpy(np.array(fps)),
+            bg_latents = bg_latents,
+            audio_strength = audio_strength,
+            denoise_strength=denoise_strength,
+            ip_cfg_scale=ip_cfg_scale,
+            freqs_cis=(freqs_cos, freqs_sin),
+            n_tokens=n_tokens,
+            embedded_guidance_scale=embedded_guidance_scale,
+            data_type="video" if target_frame_num > 1 else "image",
+            is_progress_bar=True,
+            vae_ver="884-16c-hy",
+            enable_tiling=True,
+            i2v_mode=i2v_mode,
+            i2v_condition_type=i2v_condition_type,
+            i2v_stability=i2v_stability,
+            img_latents=img_latents,
+            semantic_images=semantic_images,
+            joint_pass = joint_pass,
+            cfg_star_rescale = cfg_star_switch,
+            callback = callback,
+            callback_steps = callback_steps,
+        )[0]
+        # The first element of the pipeline result can be None; pass that on to the caller
+        if samples == None:
+            return None
+        samples = samples.squeeze(0)
+        return samples

hyvideo/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from .models import HYVideoDiffusionTransformer, HUNYUAN_VIDEO_CONFIG
+def load_model(model, i2v_condition_type, in_channels, out_channels, factor_kwargs):
+    """load hunyuan video model
+    Args:
+        args (dict): model args
+        in_channels (int): input channels number
+        out_channels (int): output channels number
+        factor_kwargs (dict): factor kwargs
+    Returns:
+        model (nn.Module): The hunyuan video model
+    """
+    if model in HUNYUAN_VIDEO_CONFIG.keys():
+        model = HYVideoDiffusionTransformer(
+            i2v_condition_type = i2v_condition_type,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            **HUNYUAN_VIDEO_CONFIG[model],
+            **factor_kwargs,
+        )
+        return model
+    else:
+        raise NotImplementedError()

hyvideo/modules/activation_layers.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import torch.nn as nn
+def get_activation_layer(act_type):
+    """get activation layer
+    Args:
+        act_type (str): the activation type
+    Returns:
+        torch.nn.functional: the activation layer
+    """
+    if act_type == "gelu":
+        return lambda: nn.GELU()
+    elif act_type == "gelu_tanh":
+        # Approximate `tanh` requires torch >= 1.13
+        return lambda: nn.GELU(approximate="tanh")
+    elif act_type == "relu":
+        return nn.ReLU
+    elif act_type == "silu":
+        return nn.SiLU
+    else:
+        raise ValueError(f"Unknown activation type: {act_type}")

hyvideo/modules/attenion.py ADDED Viewed

	@@ -0,0 +1,362 @@

+import importlib.metadata
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from importlib.metadata import version
+def clear_list(l):
+    for i in range(len(l)):
+        l[i] = None
+try:
+    import flash_attn
+    from flash_attn.flash_attn_interface import _flash_attn_forward
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+except ImportError:
+    flash_attn = None
+    flash_attn_varlen_func = None
+    _flash_attn_forward = None
+try:
+    from xformers.ops import memory_efficient_attention
+except ImportError:
+    memory_efficient_attention = None
+# Optional backend: the variable-length SageAttention kernel.
+# When the package is missing, sageattn_varlen_wrapper is set to None so that
+# get_attention_modes() can detect that the "sage" mode is unavailable.
+try:
+    from sageattention import sageattn_varlen
+    def sageattn_varlen_wrapper(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            max_seqlen_q,
+            max_seqlen_kv,
+        ):
+        # Plain pass-through: all arguments are forwarded positionally, unchanged.
+        return sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
+except ImportError:
+    sageattn_varlen_wrapper = None
+# Optional backend: the fixed-length SageAttention kernel (used by the "sage2" mode).
+# NOTE(review): on ImportError only `sageattn` is set to None; `sageattn_wrapper` stays
+# undefined. get_attention_modes() only advertises "sage2" when sageattn is available.
+try:
+    from sageattention import sageattn
+    @torch.compiler.disable()
+    def sageattn_wrapper(
+            qkv_list,
+            attention_length
+        ):
+        # Attend over the first `attention_length` positions of dim 1 only, then
+        # restore the original dim-1 length at the end.
+        q,k, v = qkv_list
+        padding_length = q.shape[1] -attention_length
+        q = q[:, :attention_length, :, : ]
+        k = k[:, :attention_length, :, : ]
+        v = v[:, :attention_length, :, : ]
+        o = sageattn(q, k, v, tensor_layout="NHD")
+        # Drop the local references and empty the caller's list.
+        del q, k ,v
+        clear_list(qkv_list)
+        if padding_length > 0:
+            # The restored tail is allocated with torch.empty, i.e. it is uninitialized.
+            # presumably callers ignore those positions — TODO confirm
+            o = torch.cat([o, torch.empty( (o.shape[0], padding_length, *o.shape[-2:]), dtype= o.dtype, device=o.device  ) ], 1)
+        return o
+except ImportError:
+    sageattn = None
+def get_attention_modes():
+    ret = ["sdpa", "auto"]
+    if flash_attn != None:
+        ret.append("flash")
+    if memory_efficient_attention != None:
+        ret.append("xformers")
+    if sageattn_varlen_wrapper != None:
+        ret.append("sage")
+    if sageattn != None and version("sageattention").startswith("2") :
+        ret.append("sage2")
+    return ret
+# Per attention mode: a (pre, post) pair of layout transforms.
+# attention() applies `pre` to each of q, k and v before calling the backend and
+# `post` to the backend output before flattening the last two dimensions.
+MEMORY_LAYOUT = {
+    # swap dims 1 and 2 on the way in and back again on the way out
+    "sdpa": (
+        lambda x: x.transpose(1, 2),
+        lambda x: x.transpose(1, 2),
+    ),
+    # tensors are handed over and taken back unchanged
+    "xformers": (
+        lambda x: x,
+        lambda x: x,
+    ),
+    "sage2": (
+        lambda x: x,
+        lambda x: x,
+    ),
+    # the first two dims are merged into one on the way in;
+    # attention() reshapes the output back using batch_size and max_seqlen_q
+    "sage": (
+        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
+        lambda x: x,
+    ),
+    "flash": (
+        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
+        lambda x: x,
+    ),
+    "torch": (
+        lambda x: x.transpose(1, 2),
+        lambda x: x.transpose(1, 2),
+    ),
+    "vanilla": (
+        lambda x: x.transpose(1, 2),
+        lambda x: x.transpose(1, 2),
+    ),
+}
+@torch.compiler.disable()
+def sdpa_wrapper(
+        qkv_list,
+        attention_length
+    ):
+    """Run unmasked scaled-dot-product attention on the first ``attention_length`` positions.
+    Args:
+        qkv_list (list): ``[q, k, v]``; every entry is set to None before returning.
+            The slicing below treats dim 2 as the sequence axis.
+        attention_length: number of leading positions along dim 2 to attend over.
+    Returns:
+        torch.Tensor: attention output with dim 2 restored to the input length. The
+        restored tail is allocated with ``torch.empty`` and is therefore uninitialized.
+    """
+    q,k, v = qkv_list
+    padding_length = q.shape[2] -attention_length
+    q = q[:, :, :attention_length, :]
+    k = k[:, :, :attention_length, :]
+    v = v[:, :, :attention_length, :]
+    o = F.scaled_dot_product_attention(
+        q, k, v, attn_mask=None, is_causal=False
+    )
+    # Drop the local references and empty the caller's list.
+    del q, k ,v
+    clear_list(qkv_list)
+    if padding_length > 0:
+        # presumably callers ignore the padded positions — TODO confirm
+        o = torch.cat([o, torch.empty( (*o.shape[:2], padding_length, o.shape[-1]), dtype= o.dtype, device=o.device  ) ], 2)
+    return o
+def get_cu_seqlens(text_mask, img_len):
+    """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len
+    Args:
+        text_mask (torch.Tensor): the mask of text
+        img_len (int): the length of image
+    Returns:
+        torch.Tensor: the calculated cu_seqlens for flash attention
+    """
+    batch_size = text_mask.shape[0]
+    text_len = text_mask.sum(dim=1)
+    max_len = text_mask.shape[1] + img_len
+    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
+    for i in range(batch_size):
+        s = text_len[i] + img_len
+        s1 = i * max_len + s
+        s2 = (i + 1) * max_len
+        cu_seqlens[2 * i + 1] = s1
+        cu_seqlens[2 * i + 2] = s2
+    return cu_seqlens
+def attention(
+    qkv_list,
+    mode="flash",
+    drop_rate=0,
+    attn_mask=None,
+    causal=False,
+    cu_seqlens_q=None,
+    cu_seqlens_kv=None,
+    max_seqlen_q=None,
+    max_seqlen_kv=None,
+    batch_size=1,
+):
+    """
+    Perform QKV self attention with the backend selected by ``mode``.
+    Args:
+        qkv_list (list): ``[q, k, v]``. The entries are set to None as soon as they are
+            unpacked, so the caller's list no longer keeps the tensors alive.
+            q: Query tensor with shape [b, s, a, d], where a is the number of heads.
+            k: Key tensor with shape [b, s1, a, d]
+            v: Value tensor with shape [b, s1, a, d]
+        mode (str): Attention backend; must be a key of MEMORY_LAYOUT ('sdpa', 'xformers',
+            'sage2', 'sage', 'flash', 'torch' or 'vanilla').
+        drop_rate (float): Dropout rate in attention map; only read by the 'torch' and
+            'vanilla' branches. (default: 0)
+        attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
+            Passed as ``attn_bias`` in 'xformers' mode; must be None in 'sdpa' mode.
+            (default: None)
+        causal (bool): Whether to use causal attention; only read by the 'torch' and
+            'vanilla' branches. (default: False)
+        cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
+            used to index into q. In 'sdpa' and 'sage2' mode this value is instead handed to
+            the wrapper as its ``attention_length`` argument.
+        cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
+            used to index into kv.
+        max_seqlen_q (int): The maximum sequence length in the batch of q.
+        max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
+        batch_size (int): only used in 'sage' and 'flash' mode, to reshape the backend
+            output back to [b, s, a, d]. (default: 1)
+    Returns:
+        torch.Tensor: Output tensor after self attention with shape [b, s, ad]
+    Raises:
+        KeyError: if ``mode`` is not a key of MEMORY_LAYOUT (raised by the lookup below,
+            before the NotImplementedError branch can be reached).
+    """
+    pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
+    q , k , v = qkv_list
+    # Empty the caller's list and forget it: only the local names keep q/k/v alive now.
+    clear_list(qkv_list)
+    del qkv_list
+    # Stays 0 in the live code (only the commented-out block below would change it),
+    # so the padding branch at the end of this function never runs.
+    padding_length = 0
+    # if attn_mask == None and mode == "sdpa":
+    #     padding_length  = q.shape[1] - cu_seqlens_q
+    #     q = q[:, :cu_seqlens_q, ... ]
+    #     k = k[:, :cu_seqlens_kv, ... ]
+    #     v = v[:, :cu_seqlens_kv, ... ]
+    q = pre_attn_layout(q)
+    k = pre_attn_layout(k)
+    v = pre_attn_layout(v)
+    if mode == "torch":
+        # Non-boolean masks are cast to the dtype of q before being handed to SDPA.
+        if attn_mask is not None and attn_mask.dtype != torch.bool:
+            attn_mask = attn_mask.to(q.dtype)
+        x = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
+        )
+    elif mode == "sdpa":
+        # if attn_mask is not None and attn_mask.dtype != torch.bool:
+        #     attn_mask = attn_mask.to(q.dtype)
+        # x = F.scaled_dot_product_attention(
+        #     q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
+        # )
+        # NOTE(review): this check is an assert, so it is stripped under `python -O`.
+        assert attn_mask==None
+        # Hand ownership of the tensors to the wrapper, which clears the list itself.
+        qkv_list = [q, k, v]
+        del q, k , v
+        x = sdpa_wrapper( qkv_list, cu_seqlens_q )
+    elif mode == "xformers":
+        x = memory_efficient_attention(
+            q, k, v , attn_bias= attn_mask
+        )
+    elif mode == "sage2":
+        # Same ownership hand-over as in the 'sdpa' branch.
+        qkv_list = [q, k, v]
+        del q, k , v
+        x = sageattn_wrapper(qkv_list, cu_seqlens_q)
+    elif mode == "sage":
+        x = sageattn_varlen_wrapper(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            max_seqlen_q,
+            max_seqlen_kv,
+        )
+        # x with shape [(bxs), a, d]
+        x = x.view(
+            batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]
+        )  # reshape x to [b, s, a, d]
+    elif mode == "flash":
+        x = flash_attn_varlen_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            max_seqlen_q,
+            max_seqlen_kv,
+        )
+        # x with shape [(bxs), a, d]
+        x = x.view(
+            batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]
+        )  # reshape x to [b, s, a, d]
+    elif mode == "vanilla":
+        # Reference implementation: softmax(q @ k^T * scale + bias) @ v.
+        scale_factor = 1 / math.sqrt(q.size(-1))
+        b, a, s, _ = q.shape
+        s1 = k.size(2)
+        attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
+        if causal:
+            # Only applied to self attention
+            assert (
+                attn_mask is None
+            ), "Causal mask and attn_mask cannot be used together"
+            temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
+                diagonal=0
+            )
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+            # NOTE(review): the result of .to() is discarded, so this line has no effect.
+            attn_bias.to(q.dtype)
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+            else:
+                attn_bias += attn_mask
+        # TODO: Maybe force q and k to be float32 to avoid numerical overflow
+        attn = (q @ k.transpose(-2, -1)) * scale_factor
+        attn += attn_bias
+        attn = attn.softmax(dim=-1)
+        # NOTE(review): train=True is hard-coded, so dropout is active whenever drop_rate > 0.
+        attn = torch.dropout(attn, p=drop_rate, train=True)
+        x = attn @ v
+    else:
+        raise NotImplementedError(f"Unsupported attention mode: {mode}")
+    x = post_attn_layout(x)
+    # Merge the last two dims: [b, s, a, d] -> [b, s, a*d].
+    b, s, a, d = x.shape
+    out = x.reshape(b, s, -1)
+    if padding_length > 0 :
+        out = torch.cat([out, torch.empty( (out.shape[0], padding_length, out.shape[2]), dtype= out.dtype, device=out.device  ) ], 1)
+    return out
+def parallel_attention(
+    hybrid_seq_parallel_attn,
+    q,
+    k,
+    v,
+    img_q_len,
+    img_kv_len,
+    cu_seqlens_q,
+    cu_seqlens_kv
+):
+    attn1 = hybrid_seq_parallel_attn(
+        None,
+        q[:, :img_q_len, :, :],
+        k[:, :img_kv_len, :, :],
+        v[:, :img_kv_len, :, :],
+        dropout_p=0.0,
+        causal=False,
+        joint_tensor_query=q[:,img_q_len:cu_seqlens_q[1]],
+        joint_tensor_key=k[:,img_kv_len:cu_seqlens_kv[1]],
+        joint_tensor_value=v[:,img_kv_len:cu_seqlens_kv[1]],
+        joint_strategy="rear",
+    )
+    if flash_attn.__version__ >= '2.7.0':
+        attn2, *_ = _flash_attn_forward(
+            q[:,cu_seqlens_q[1]:],
+            k[:,cu_seqlens_kv[1]:],
+            v[:,cu_seqlens_kv[1]:],
+            dropout_p=0.0,
+            softmax_scale=q.shape[-1] ** (-0.5),
+            causal=False,
+            window_size_left=-1,
+            window_size_right=-1,
+            softcap=0.0,
+            alibi_slopes=None,
+            return_softmax=False,
+        )
+    else:
+        attn2, *_ = _flash_attn_forward(
+            q[:,cu_seqlens_q[1]:],
+            k[:,cu_seqlens_kv[1]:],
+            v[:,cu_seqlens_kv[1]:],
+            dropout_p=0.0,
+            softmax_scale=q.shape[-1] ** (-0.5),
+            causal=False,
+            window_size=(-1, -1),
+            softcap=0.0,
+            alibi_slopes=None,
+            return_softmax=False,
+        )
+    attn = torch.cat([attn1, attn2], dim=1)
+    b, s, a, d = attn.shape
+    attn = attn.reshape(b, s, -1)
+    return attn

hyvideo/modules/audio_adapters.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""
+This module provides the implementation of an Audio Projection Model, which is designed for
+audio processing tasks. The model takes audio embeddings as input and outputs context tokens
+that can be used for various downstream applications, such as audio analysis or synthesis.
+The AudioProjModel class is based on the ModelMixin class from the diffusers library, which
+provides a foundation for building custom models. This implementation includes multiple linear
+layers with ReLU activation functions and a LayerNorm for normalization.
+Key Features:
+- Audio embedding input with flexible sequence length and block structure.
+- Multiple linear layers for feature transformation.
+- ReLU activation for non-linear transformation.
+- LayerNorm for stabilizing and speeding up training.
+- Rearrangement of input embeddings to match the model's expected input shape.
+- Customizable number of blocks, channels, and context tokens for adaptability.
+The module is structured to be easily integrated into larger systems or used as a standalone
+component for audio feature extraction and processing.
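+Classes:
+- AudioProjNet2: A class representing the audio projection model with configurable parameters.
+- PerceiverAttentionCA: A cross-attention layer in which latent features attend to conditioning features.
+Functions:
+- reshape_tensor: Splits the last dimension of a (batch, length, width) tensor into attention heads.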
+Dependencies:
+- torch: For tensor operations and neural network components.
+- diffusers: For the ModelMixin base class.
+- einops: For tensor rearrangement operations.
+"""
+import torch
+from diffusers import ModelMixin
+from einops import rearrange
+import math
+import torch.nn as nn
+class AudioProjNet2(ModelMixin):
+    """Audio Projection Model
+    This class defines an audio projection model that takes audio embeddings as input
+    and produces context tokens as output. The model is based on the ModelMixin class
+    and consists of multiple linear layers and activation functions. It can be used
+    for various audio processing tasks.
+    Attributes:
+        seq_len (int): The length of the audio sequence.
+        blocks (int): The number of blocks in the audio projection model.
+        channels (int): The number of channels in the audio projection model.
+        intermediate_dim (int): The intermediate dimension of the model.
+        context_tokens (int): The number of context tokens in the output.
+        output_dim (int): The output dimension of the context tokens.
+    Methods:
+        __init__(self, seq_len=5, blocks=12, channels=768, intermediate_dim=512, context_tokens=32, output_dim=768):
+            Initializes the AudioProjModel with the given parameters.
+        forward(self, audio_embeds):
+            Defines the forward pass for the AudioProjModel.
+            Parameters:
+            audio_embeds (torch.Tensor): The input audio embeddings with shape (batch_size, video_length, blocks, channels).
+            Returns:
+            context_tokens (torch.Tensor): The output context tokens with shape (batch_size, video_length, context_tokens, output_dim).
+    """
+    def __init__(
+        self,
+        seq_len=5,
+        blocks=12,  # add a new parameter blocks
+        channels=768,  # add a new parameter channels
+        intermediate_dim=512,
+        output_dim=768,
+        context_tokens=4,
+    ):
+        super().__init__()
+        self.seq_len = seq_len
+        self.blocks = blocks
+        self.channels = channels
+        self.input_dim = (
+            seq_len * blocks * channels
+        )
+        self.intermediate_dim = intermediate_dim
+        self.context_tokens = context_tokens
+        self.output_dim = output_dim
+        # define multiple linear layers
+        self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
+        self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
+        self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
+        self.norm = nn.LayerNorm(output_dim)
+    def forward(self, audio_embeds):
+        video_length = audio_embeds.shape[1]
+        audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
+        batch_size, window_size, blocks, channels = audio_embeds.shape
+        audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
+        audio_embeds = torch.relu(self.proj1(audio_embeds))
+        audio_embeds = torch.relu(self.proj2(audio_embeds))
+        context_tokens = self.proj3(audio_embeds).reshape(
+            batch_size, self.context_tokens, self.output_dim
+        )
+        context_tokens = self.norm(context_tokens)
+        out_all = rearrange(
+            context_tokens, "(bz f) m c -> bz f m c", f=video_length
+        )
+        return out_all
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+class PerceiverAttentionCA(nn.Module):
+    def __init__(self, *, dim=3072, dim_head=1024, heads=33):
+        super().__init__()
+        self.scale = dim_head ** -0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head #* heads
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+        import torch.nn.init as init
+        init.zeros_(self.to_out.weight)
+        if self.to_out.bias is not None:
+            init.zeros_(self.to_out.bias)
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, t, aa, D)
+            latent (torch.Tensor): latent features
+                shape (b, t, hw, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+        # print("latents shape: ", latents.shape)
+        # print("x shape: ", x.shape)
+        q = self.to_q(latents)
+        k, v = self.to_kv(x).chunk(2, dim=-1)
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+        # out = out.permute(0, 2, 1, 3)
+        return self.to_out(out)
+    #def forward(self, x, latents):
+    #    """
+    #    Args:
+    #        x (torch.Tensor): image features
+    #            shape (b, t, aa, D)
+    #        latent (torch.Tensor): latent features
+    #            shape (b, t, hw, D)
+    #    """
+    #    if get_sequence_parallel_state():
+    #        sp_size = nccl_info.sp_size
+    #        sp_rank = nccl_info.rank_within_group
+    #        print("rank:", latents.shape, sp_size, sp_rank)
+    #        latents = torch.chunk(latents, sp_size, dim=1)[sp_rank]
+    #    x = self.norm1(x)
+    #    latents = self.norm2(latents)
+    #    # print("latents shape: ", latents.shape)
+    #    # print("x shape: ", x.shape)
+    #    q = self.to_q(latents)
+    #    k, v = self.to_kv(x).chunk(2, dim=-1)
+    #    # print("q, k, v: ", q.shape, k.shape, v.shape)
+    #    # attention
+    #    #scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+    #    #weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+    #    #weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+    #    #out = weight @ v
+    #    def shrink_head(encoder_state, dim):
+    #        local_heads = encoder_state.shape[dim] // nccl_info.sp_size
+    #        return encoder_state.narrow(dim, nccl_info.rank_within_group * local_heads, local_heads)
+    #    if get_sequence_parallel_state():
+    #    # batch_size, seq_len, attn_heads, head_dim
+    #        q = all_to_all_4D(q, scatter_dim=2, gather_dim=1)  # [2, 32256, 24, 128]
+    #        k = shrink_head(k ,dim=2)
+    #        v = shrink_head(v ,dim=2)
+    #    qkv = torch.stack([query, key, value], dim=2)
+    #    attn = flash_attn_no_pad(qkv, causal=False, dropout_p=0.0, softmax_scale=None)
+    #    # out = out.permute(0, 2, 1, 3)
+    #    #b, s, a, d = attn.shape
+    #    #attn = attn.reshape(b, s, -1)
+    #
+    #    out = self.to_out(attn)
+    #    if get_sequence_parallel_state():
+    #        out = all_gather(out, dim=1)
+    #    return out

hyvideo/modules/embed_layers.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import math
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+from ..utils.helpers import to_2tuple
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding
+    Image to Patch Embedding using Conv2d
+    A convolution based approach to patchifying a 2D image w/ embedding projection.
+    Based on the impl in https://github.com/google-research/vision_transformer
+    Hacked together by / Copyright 2020 Ross Wightman
+    Remove the _assert function in forward function to be compatible with multi-resolution images.
+    """
+    def __init__(
+        self,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+        bias=True,
+        dtype=None,
+        device=None,
+    ):
+        factory_kwargs = {"dtype": dtype, "device": device}
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+        self.flatten = flatten
+        self.proj = nn.Conv3d(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=bias,
+            **factory_kwargs
+        )
+        nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
+        if bias:
+            nn.init.zeros_(self.proj.bias)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x):
+        x = self.proj(x)
+        shape = x.shape
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        x = self.norm(x)
+        return x, shape
+class TextProjection(nn.Module):
+    """
+    Projects text embeddings. Also handles dropout for classifier-free guidance.
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+    def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
+        factory_kwargs = {"dtype": dtype, "device": device}
+        super().__init__()
+        self.linear_1 = nn.Linear(
+            in_features=in_channels,
+            out_features=hidden_size,
+            bias=True,
+            **factory_kwargs
+        )
+        self.act_1 = act_layer()
+        self.linear_2 = nn.Linear(
+            in_features=hidden_size,
+            out_features=hidden_size,
+            bias=True,
+            **factory_kwargs
+        )
+    def forward(self, caption):
+        hidden_states = self.linear_1(caption)
+        hidden_states = self.act_1(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+def timestep_embedding(t, dim, max_period=10000):
+    """
+    Create sinusoidal timestep embeddings.
+    Args:
+        t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        dim (int): the dimension of the output.
+        max_period (int): controls the minimum frequency of the embeddings.
+    Returns:
+        embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
+    .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period)
+        * torch.arange(start=0, end=half, dtype=torch.float32)
+        / half
+    ).to(device=t.device)
+    args = t[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(
+        self,
+        hidden_size,
+        act_layer,
+        frequency_embedding_size=256,
+        max_period=10000,
+        out_size=None,
+        dtype=None,
+        device=None,
+    ):
+        factory_kwargs = {"dtype": dtype, "device": device}
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.max_period = max_period
+        if out_size is None:
+            out_size = hidden_size
+        self.mlp = nn.Sequential(
+            nn.Linear(
+                frequency_embedding_size, hidden_size, bias=True, **factory_kwargs
+            ),
+            act_layer(),
+            nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
+        )
+        nn.init.normal_(self.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.mlp[2].weight, std=0.02)
+    def forward(self, t):
+        t_freq = timestep_embedding(
+            t, self.frequency_embedding_size, self.max_period
+        ).type(self.mlp[0].weight.dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb

hyvideo/modules/mlp_layers.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# Modified from timm library:
+# https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
+from functools import partial
+import torch
+import torch.nn as nn
+from .modulate_layers import modulate_
+from ..utils.helpers import to_2tuple
+class MLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        norm_layer=None,
+        bias=True,
+        drop=0.0,
+        use_conv=False,
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        out_features = out_features or in_channels
+        hidden_channels = hidden_channels or in_channels
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
+        self.fc1 = linear_layer(
+            in_channels, hidden_channels, bias=bias[0], **factory_kwargs
+        )
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.norm = (
+            norm_layer(hidden_channels, **factory_kwargs)
+            if norm_layer is not None
+            else nn.Identity()
+        )
+        self.fc2 = linear_layer(
+            hidden_channels, out_features, bias=bias[1], **factory_kwargs
+        )
+        self.drop2 = nn.Dropout(drop_probs[1])
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.norm(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+    def apply_(self, x, divide = 4):
+        x_shape = x.shape
+        x = x.view(-1, x.shape[-1])
+        chunk_size = int(x_shape[1]/divide)
+        x_chunks = torch.split(x, chunk_size)
+        for i, x_chunk  in enumerate(x_chunks):
+            mlp_chunk = self.fc1(x_chunk)
+            mlp_chunk = self.act(mlp_chunk)
+            mlp_chunk = self.drop1(mlp_chunk)
+            mlp_chunk = self.norm(mlp_chunk)
+            mlp_chunk = self.fc2(mlp_chunk)
+            x_chunk[...] = self.drop2(mlp_chunk)
+        return x
+#
+class MLPEmbedder(nn.Module):
+    """copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
+    def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))
+class FinalLayer(nn.Module):
+    """The final layer of DiT."""
+    def __init__(
+        self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        # Just use LayerNorm for the final layer
+        self.norm_final = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        if isinstance(patch_size, int):
+            self.linear = nn.Linear(
+                hidden_size,
+                patch_size * patch_size * out_channels,
+                bias=True,
+                **factory_kwargs
+            )
+        else:
+            self.linear = nn.Linear(
+                hidden_size,
+                patch_size[0] * patch_size[1] * patch_size[2] * out_channels,
+                bias=True,
+            )
+        nn.init.zeros_(self.linear.weight)
+        nn.init.zeros_(self.linear.bias)
+        # Here we don't distinguish between the modulate types. Just use the simple one.
+        self.adaLN_modulation = nn.Sequential(
+            act_layer(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
+        )
+        # Zero-initialize the modulation
+        nn.init.zeros_(self.adaLN_modulation[1].weight)
+        nn.init.zeros_(self.adaLN_modulation[1].bias)
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate_(self.norm_final(x), shift=shift, scale=scale)
+        x = self.linear(x)
+        return x

hyvideo/modules/models.py ADDED Viewed

	@@ -0,0 +1,1159 @@

+from typing import Any, List, Tuple, Optional, Union, Dict
+from einops import rearrange
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.models import ModelMixin
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from .activation_layers import get_activation_layer
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
+from .attenion import attention, parallel_attention, get_cu_seqlens
+from .posemb_layers import apply_rotary_emb
+from .mlp_layers import MLP, MLPEmbedder, FinalLayer
+from .modulate_layers import ModulateDiT, modulate, modulate_ , apply_gate, apply_gate_and_accumulate_
+from .token_refiner import SingleTokenRefiner
+import numpy as np
+from mmgp import offload
+from wan.modules.attention import pay_attention
+from .audio_adapters import AudioProjNet2, PerceiverAttentionCA
+def get_linear_split_map():
+    hidden_size = 3072
+    split_linear_modules_map =  {
+                                "img_attn_qkv" : {"mapped_modules" : ["img_attn_q", "img_attn_k", "img_attn_v"] , "split_sizes": [hidden_size, hidden_size, hidden_size]},
+                                "linear1" : {"mapped_modules" : ["linear1_attn_q", "linear1_attn_k", "linear1_attn_v", "linear1_mlp"] , "split_sizes":  [hidden_size, hidden_size, hidden_size, 7*hidden_size- 3*hidden_size]}
+                                }
+    return split_linear_modules_map
+try:
+    from xformers.ops.fmha.attn_bias import BlockDiagonalPaddedKeysMask
+except ImportError:
+    BlockDiagonalPaddedKeysMask = None
+class MMDoubleStreamBlock(nn.Module):
+    """
+    A multimodal dit block with separate modulation for
+    text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
+                                     (Flux.1): https://github.com/black-forest-labs/flux
+    The image and text streams each own a modulation, a QKV projection, an
+    output projection and an MLP; attention is computed once over the
+    concatenation of both token sequences (image tokens first).
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qkv_bias: bool = False,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attention_mode: str = "sdpa",
+    ):
+        """Create the image-stream and text-stream sub-layers.
+        Args:
+            hidden_size: Token feature size.
+            heads_num: Number of attention heads; the head size is ``hidden_size // heads_num``.
+            mlp_width_ratio: MLP hidden size as a multiple of ``hidden_size``.
+            mlp_act_type: Activation name resolved through ``get_activation_layer``.
+            qk_norm: When true, q and k get a norm layer; otherwise ``nn.Identity``.
+            qk_norm_type: Norm name resolved through ``get_norm_layer``.
+            qkv_bias: Bias flag for the QKV projections and for the attention output projections.
+            dtype: Forwarded to the created layers.
+            device: Forwarded to the created layers.
+            attention_mode: Stored on the instance; not read inside this class.
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.attention_mode = attention_mode
+        # Toggled by enable_deterministic/disable_deterministic; not read inside this class.
+        self.deterministic = False
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        # ---- image stream ----
+        # factor=6: forward() splits the modulation output into six chunks
+        # (shift/scale/gate for the attention part and for the MLP part).
+        self.img_mod = ModulateDiT(
+            hidden_size,
+            factor=6,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.img_norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        # NOTE(review): forward() calls self.img_attn_q / img_attn_k / img_attn_v, which are
+        # not created here; presumably this fused layer is split after loading according to
+        # get_linear_split_map() — confirm with the loading code.
+        self.img_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.img_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.img_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.img_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+        self.img_norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.img_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+        # ---- text stream (mirrors the image stream) ----
+        self.txt_mod = ModulateDiT(
+            hidden_size,
+            factor=6,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.txt_norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.txt_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        self.txt_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.txt_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.txt_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+        self.txt_norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.txt_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+        # Not read inside this class.
+        self.hybrid_seq_parallel_attn = None
+    def enable_deterministic(self):
+        """Set the ``deterministic`` flag to True."""
+        self.deterministic = True
+    def disable_deterministic(self):
+        """Set the ``deterministic`` flag to False."""
+        self.deterministic = False
+    def forward(
+        self,
+        img: torch.Tensor,
+        txt: torch.Tensor,
+        vec: torch.Tensor,
+        attn_mask = None,
+        seqlens_q: Optional[torch.Tensor] = None,
+        seqlens_kv: Optional[torch.Tensor] = None,
+        freqs_cis: tuple = None,
+        condition_type: str = None,
+        token_replace_vec: torch.Tensor = None,
+        frist_frame_token_num: int = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Run one double-stream block and return ``(img, txt)``.
+        Args:
+            img: Image/video tokens; the token axis is dim 1 (it is sliced as ``img[:, :n]``).
+            txt: Text tokens; concatenated after the image tokens along dim 1 for attention.
+            vec: Conditioning input fed to ``img_mod`` and ``txt_mod``.
+            attn_mask: Forwarded to ``pay_attention`` as ``attention_mask``.
+            seqlens_q: Forwarded to ``pay_attention`` as ``q_lens``.
+            seqlens_kv: Forwarded to ``pay_attention`` as ``k_lens``.
+            freqs_cis: Passed to ``apply_rotary_emb``, which is applied to the image q/k only.
+            condition_type: When equal to ``"token_replace"``, the first
+                ``frist_frame_token_num`` image tokens use the modulation that
+                ``img_mod`` derives from ``token_replace_vec``.
+            token_replace_vec: Conditioning input for those leading image tokens.
+            frist_frame_token_num: Number of leading image tokens handled separately.
+        Returns:
+            The ``img`` and ``txt`` objects that were passed in.
+            NOTE(review): the trailing-underscore helpers presumably update them
+            in place — confirm in modulate_layers.
+        """
+        # Image modulation: six chunks along the last dim, plus a second set for the
+        # token-replace tokens when that mode is active.
+        if condition_type == "token_replace":
+            img_mod1, token_replace_img_mod1 = self.img_mod(vec, condition_type=condition_type, \
+                                                            token_replace_vec=token_replace_vec)
+            (img_mod1_shift,
+             img_mod1_scale,
+             img_mod1_gate,
+             img_mod2_shift,
+             img_mod2_scale,
+             img_mod2_gate) = img_mod1.chunk(6, dim=-1)
+            (tr_img_mod1_shift,
+             tr_img_mod1_scale,
+             tr_img_mod1_gate,
+             tr_img_mod2_shift,
+             tr_img_mod2_scale,
+             tr_img_mod2_gate) = token_replace_img_mod1.chunk(6, dim=-1)
+        else:
+            (
+                img_mod1_shift,
+                img_mod1_scale,
+                img_mod1_gate,
+                img_mod2_shift,
+                img_mod2_scale,
+                img_mod2_gate,
+            ) = self.img_mod(vec).chunk(6, dim=-1)
+        # Text modulation always comes from vec alone.
+        (
+            txt_mod1_shift,
+            txt_mod1_scale,
+            txt_mod1_gate,
+            txt_mod2_shift,
+            txt_mod2_scale,
+            txt_mod2_gate,
+        ) = self.txt_mod(vec).chunk(6, dim=-1)
+        ##### Enjoy these spaghetti VRAM optimizations done by DeepBeepMeep !
+        # I am sure you are a nice person and as you copy this code, you will give me officially proper credits:
+        # Please link to https://github.com/deepbeepmeep/HunyuanVideoGP and @deepbeepmeep on twitter
+        # Prepare image for attention.
+        img_modulated = self.img_norm1(img)
+        img_modulated = img_modulated.to(torch.bfloat16)
+        if condition_type == "token_replace":
+            modulate_(img_modulated[:, :frist_frame_token_num], shift=tr_img_mod1_shift, scale=tr_img_mod1_scale)
+            modulate_(img_modulated[:, frist_frame_token_num:], shift=img_mod1_shift, scale=img_mod1_scale)
+        else:
+            modulate_( img_modulated, shift=img_mod1_shift, scale=img_mod1_scale )
+        # Target shape for q/k/v: the leading two dims of the input, then (heads, head_dim).
+        shape = (*img_modulated.shape[:2], self.heads_num, int(img_modulated.shape[-1] / self.heads_num) )
+        img_q = self.img_attn_q(img_modulated).view(*shape)
+        img_k = self.img_attn_k(img_modulated).view(*shape)
+        img_v = self.img_attn_v(img_modulated).view(*shape)
+        # Intermediates are dropped as soon as they are no longer needed.
+        del img_modulated
+        # Apply QK-Norm if needed
+        # NOTE(review): apply_ is not a method of nn.Identity, so qk_norm=False would fail here;
+        # the result of .to(img_v) is discarded, so only an in-place effect of apply_ can matter.
+        self.img_attn_q_norm.apply_(img_q).to(img_v)
+        # NOTE(review): img_q_len, img_kv_len and batch_size are assigned but never read below.
+        img_q_len = img_q.shape[1]
+        self.img_attn_k_norm.apply_(img_k).to(img_v)
+        img_kv_len= img_k.shape[1]
+        batch_size = img_k.shape[0]
+        # Apply RoPE if needed.
+        # q and k are handed over inside a list and the local names deleted first,
+        # presumably so the callee can release the inputs early — confirm in posemb_layers.
+        qklist = [img_q, img_k]
+        del img_q, img_k
+        img_q, img_k = apply_rotary_emb(qklist, freqs_cis, head_first=False)
+        # Prepare txt for attention.
+        txt_modulated = self.txt_norm1(txt)
+        modulate_(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale )
+        txt_qkv = self.txt_attn_qkv(txt_modulated)
+        del txt_modulated
+        txt_q, txt_k, txt_v = rearrange(
+            txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num
+        )
+        del txt_qkv
+        # Apply QK-Norm if needed.
+        self.txt_attn_q_norm.apply_(txt_q).to(txt_v)
+        self.txt_attn_k_norm.apply_(txt_k).to(txt_v)
+        # Run actual attention.
+        # Image tokens come first along dim 1, text tokens after them.
+        q = torch.cat((img_q, txt_q), dim=1)
+        del img_q, txt_q
+        k = torch.cat((img_k, txt_k), dim=1)
+        del img_k, txt_k
+        v = torch.cat((img_v, txt_v), dim=1)
+        del img_v, txt_v
+        # attention computation start
+        qkv_list = [q,k,v]
+        del q, k, v
+        attn = pay_attention(
+            qkv_list,
+            attention_mask=attn_mask,
+            q_lens=seqlens_q,
+            k_lens=seqlens_kv,
+        )
+        # Merge the last two dims (unpacked here as a, d) back into one feature dim.
+        b, s, a, d = attn.shape
+        attn = attn.reshape(b, s, -1)
+        del qkv_list
+        # attention computation end
+        # Split the joint result back into its image part and its text part.
+        img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]
+        del attn
+        # Calculate the img blocks.
+        if condition_type == "token_replace":
+            img_attn = self.img_attn_proj(img_attn)
+            apply_gate_and_accumulate_(img[:, :frist_frame_token_num], img_attn[:, :frist_frame_token_num], gate=tr_img_mod1_gate)
+            apply_gate_and_accumulate_(img[:, frist_frame_token_num:], img_attn[:, frist_frame_token_num:], gate=img_mod1_gate)
+            del img_attn
+            img_modulated = self.img_norm2(img)
+            img_modulated = img_modulated.to(torch.bfloat16)
+            modulate_( img_modulated[:, :frist_frame_token_num], shift=tr_img_mod2_shift, scale=tr_img_mod2_scale)
+            modulate_( img_modulated[:, frist_frame_token_num:], shift=img_mod2_shift, scale=img_mod2_scale)
+            # The return value is ignored, so img_mlp.apply_ must write its result into img_modulated.
+            self.img_mlp.apply_(img_modulated)
+            apply_gate_and_accumulate_(img[:, :frist_frame_token_num], img_modulated[:, :frist_frame_token_num], gate=tr_img_mod2_gate)
+            apply_gate_and_accumulate_(img[:, frist_frame_token_num:], img_modulated[:, frist_frame_token_num:], gate=img_mod2_gate)
+            del img_modulated
+        else:
+            img_attn = self.img_attn_proj(img_attn)
+            apply_gate_and_accumulate_(img, img_attn, gate=img_mod1_gate)
+            del img_attn
+            img_modulated = self.img_norm2(img)
+            img_modulated = img_modulated.to(torch.bfloat16)
+            modulate_( img_modulated , shift=img_mod2_shift, scale=img_mod2_scale)
+            # The return value is ignored, so img_mlp.apply_ must write its result into img_modulated.
+            self.img_mlp.apply_(img_modulated)
+            apply_gate_and_accumulate_(img, img_modulated, gate=img_mod2_gate)
+            del img_modulated
+        # Calculate the txt blocks.
+        txt_attn  = self.txt_attn_proj(txt_attn)
+        apply_gate_and_accumulate_(txt, txt_attn, gate=txt_mod1_gate)
+        del txt_attn
+        txt_modulated = self.txt_norm2(txt)
+        txt_modulated = txt_modulated.to(torch.bfloat16)
+        modulate_(txt_modulated, shift=txt_mod2_shift, scale=txt_mod2_scale)
+        # Unlike the image stream, the text MLP is called normally and returns a new tensor.
+        txt_mlp = self.txt_mlp(txt_modulated)
+        del txt_modulated
+        apply_gate_and_accumulate_(txt, txt_mlp, gate=txt_mod2_gate)
+        return img, txt
+class MMSingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    Also refer to (SD3): https://arxiv.org/abs/2403.03206
+                  (Flux.1): https://github.com/black-forest-labs/flux
+    Image and text tokens share the same weights here: one pre-norm, one
+    modulation, one fused input projection (``linear1``) and one fused output
+    projection (``linear2``).
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qk_scale: float = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attention_mode: str = "sdpa",
+    ):
+        """Create the shared sub-layers of the block.
+        Args:
+            hidden_size: Token feature size.
+            heads_num: Number of attention heads; the head size is ``hidden_size // heads_num``.
+            mlp_width_ratio: MLP hidden size as a multiple of ``hidden_size``.
+            mlp_act_type: Activation name resolved through ``get_activation_layer``.
+            qk_norm: When true, q and k get a norm layer; otherwise ``nn.Identity``.
+            qk_norm_type: Norm name resolved through ``get_norm_layer``.
+            qk_scale: Optional scale stored as ``self.scale``; falls back to ``head_dim ** -0.5``.
+            dtype: Forwarded to the created layers.
+            device: Forwarded to the created layers.
+            attention_mode: Stored on the instance; not read inside this class.
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.attention_mode = attention_mode
+        # Toggled by enable_deterministic/disable_deterministic; not read inside this class.
+        self.deterministic = False
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        self.mlp_hidden_dim = mlp_hidden_dim
+        # Stored only; forward() does not pass it to pay_attention.
+        self.scale = qk_scale or head_dim ** -0.5
+        # qkv and mlp_in
+        # NOTE(review): forward() calls self.linear1_attn_q / _k / _v and self.linear1_mlp, which
+        # are not created here; presumably this fused layer is split after loading according to
+        # get_linear_split_map() — confirm with the loading code.
+        self.linear1 = nn.Linear(
+            hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs
+        )
+        # proj and mlp_out
+        self.linear2 = nn.Linear(
+            hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.pre_norm = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.mlp_act = get_activation_layer(mlp_act_type)()
+        # factor=3: forward() splits the modulation output into shift, scale and gate.
+        self.modulation = ModulateDiT(
+            hidden_size,
+            factor=3,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        # Not read inside this class.
+        self.hybrid_seq_parallel_attn = None
+    def enable_deterministic(self):
+        """Set the ``deterministic`` flag to True."""
+        self.deterministic = True
+    def disable_deterministic(self):
+        """Set the ``deterministic`` flag to False."""
+        self.deterministic = False
+    def forward(
+        self,
+        # x: torch.Tensor,
+        img: torch.Tensor,
+        txt: torch.Tensor,
+        vec: torch.Tensor,
+        txt_len: int,
+        attn_mask= None,
+        seqlens_q: Optional[torch.Tensor] = None,
+        seqlens_kv: Optional[torch.Tensor] = None,
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+        condition_type: str = None,
+        token_replace_vec: torch.Tensor = None,
+        frist_frame_token_num: int = None,
+    ) -> torch.Tensor:
+        """Run one single-stream block and return ``(img, txt)``.
+        Args:
+            img: Image/video tokens; the token axis is dim 1.
+            txt: Text tokens; placed after the image tokens along dim 1.
+            vec: Conditioning input fed to ``self.modulation``.
+            txt_len: Number of trailing text tokens, used to slice the joint result.
+                NOTE(review): ``txt_len == 0`` would make the ``[:, :-txt_len]`` slices empty.
+            attn_mask: Forwarded to ``pay_attention`` as ``attention_mask``.
+            seqlens_q: Forwarded to ``pay_attention`` as ``q_lens``.
+            seqlens_kv: Forwarded to ``pay_attention`` as ``k_lens``.
+            freqs_cis: Passed to ``apply_rotary_emb``, which is applied to the image q/k only.
+            condition_type: When equal to ``"token_replace"``, the first
+                ``frist_frame_token_num`` image tokens use the modulation that
+                ``self.modulation`` derives from ``token_replace_vec``.
+            token_replace_vec: Conditioning input for those leading image tokens.
+            frist_frame_token_num: Number of leading image tokens handled separately.
+        Returns:
+            The ``img`` and ``txt`` objects that were passed in (a tuple, although the
+            annotation says ``torch.Tensor``).
+            NOTE(review): the trailing-underscore helpers presumably update them
+            in place — confirm in modulate_layers.
+        """
+        ##### More spaghetti VRAM optimizations done by DeepBeepMeep !
+        # I am sure you are a nice person and as you copy this code, you will give me proper credits:
+        # Please link to https://github.com/deepbeepmeep/HunyuanVideoGP and @deepbeepmeep on twitter
+        if condition_type == "token_replace":
+            mod, tr_mod = self.modulation(vec,
+                                          condition_type=condition_type,
+                                          token_replace_vec=token_replace_vec)
+            (mod_shift,
+             mod_scale,
+             mod_gate) = mod.chunk(3, dim=-1)
+            (tr_mod_shift,
+             tr_mod_scale,
+             tr_mod_gate) = tr_mod.chunk(3, dim=-1)
+        else:
+            mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
+        # Image and text go through the same pre-norm and (outside token-replace) the same modulation.
+        img_mod = self.pre_norm(img)
+        img_mod = img_mod.to(torch.bfloat16)
+        if condition_type == "token_replace":
+            modulate_(img_mod[:, :frist_frame_token_num], shift=tr_mod_shift, scale=tr_mod_scale)
+            modulate_(img_mod[:, frist_frame_token_num:], shift=mod_shift, scale=mod_scale)
+        else:
+            modulate_(img_mod, shift=mod_shift, scale=mod_scale)
+        txt_mod = self.pre_norm(txt)
+        txt_mod = txt_mod.to(torch.bfloat16)
+        modulate_(txt_mod, shift=mod_shift, scale=mod_scale)
+        # Target shape for q/k/v: the leading two dims of the input, then (heads, head_dim).
+        shape = (*img_mod.shape[:2], self.heads_num, int(img_mod.shape[-1] / self.heads_num) )
+        img_q = self.linear1_attn_q(img_mod).view(*shape)
+        img_k = self.linear1_attn_k(img_mod).view(*shape)
+        img_v = self.linear1_attn_v(img_mod).view(*shape)
+        shape = (*txt_mod.shape[:2], self.heads_num, int(txt_mod.shape[-1] / self.heads_num) )
+        txt_q = self.linear1_attn_q(txt_mod).view(*shape)
+        txt_k = self.linear1_attn_k(txt_mod).view(*shape)
+        txt_v = self.linear1_attn_v(txt_mod).view(*shape)
+        # NOTE(review): batch_size, img_q_len and img_kv_len are assigned but never read below.
+        batch_size = img_mod.shape[0]
+        # Apply QK-Norm if needed.
+        # q = self.q_norm(q).to(v)
+        # NOTE(review): apply_ is not a method of nn.Identity, so qk_norm=False would fail here.
+        self.q_norm.apply_(img_q)
+        self.k_norm.apply_(img_k)
+        self.q_norm.apply_(txt_q)
+        self.k_norm.apply_(txt_k)
+        # Rotary embedding is applied to the image q/k only.
+        qklist = [img_q, img_k]
+        del img_q, img_k
+        img_q, img_k = apply_rotary_emb(qklist, freqs_cis, head_first=False)
+        img_q_len=img_q.shape[1]
+        # Image tokens come first along dim 1, text tokens after them.
+        q = torch.cat((img_q, txt_q), dim=1)
+        del img_q, txt_q
+        k = torch.cat((img_k, txt_k), dim=1)
+        img_kv_len=img_k.shape[1]
+        del img_k, txt_k
+        v = torch.cat((img_v, txt_v), dim=1)
+        del img_v, txt_v
+        # attention computation start
+        qkv_list = [q,k,v]
+        del q, k, v
+        attn = pay_attention(
+            qkv_list,
+            attention_mask=attn_mask,
+            q_lens = seqlens_q,
+            k_lens = seqlens_kv,
+        )
+        # Merge the last two dims (unpacked here as a, d) back into one feature dim.
+        b, s, a, d = attn.shape
+        attn = attn.reshape(b, s, -1)
+        del qkv_list
+        # attention computation end
+        # x_mod holds the modulated inputs now and is overwritten chunk by chunk with the
+        # block output below, so no second full-size buffer is needed.
+        x_mod =  torch.cat((img_mod, txt_mod), 1)
+        del img_mod, txt_mod
+        x_mod_shape = x_mod.shape
+        # Flatten the leading dims so rows can be processed in fixed-size groups.
+        x_mod = x_mod.view(-1, x_mod.shape[-1])
+        # Rows per chunk: one sixth of the joint sequence length.
+        # NOTE(review): this is 0 when the joint sequence has fewer than 6 tokens — confirm
+        # that callers never hit that case.
+        chunk_size = int(x_mod_shape[1]/6)
+        x_chunks = torch.split(x_mod, chunk_size)
+        attn = attn.view(-1, attn.shape[-1])
+        attn_chunks =torch.split(attn, chunk_size)
+        for x_chunk, attn_chunk in zip(x_chunks, attn_chunks):
+            mlp_chunk = self.linear1_mlp(x_chunk)
+            mlp_chunk = self.mlp_act(mlp_chunk)
+            attn_mlp_chunk = torch.cat((attn_chunk, mlp_chunk), -1)
+            del attn_chunk, mlp_chunk
+            # torch.split returns views, so this assignment writes into x_mod itself.
+            x_chunk[...] = self.linear2(attn_mlp_chunk)
+            del attn_mlp_chunk
+        x_mod = x_mod.view(x_mod_shape)
+        # Gate the result and add it to the inputs; the last txt_len positions belong to the text.
+        if condition_type == "token_replace":
+            apply_gate_and_accumulate_(img[:, :frist_frame_token_num, :], x_mod[:, :frist_frame_token_num, :], gate=tr_mod_gate)
+            apply_gate_and_accumulate_(img[:, frist_frame_token_num:, :], x_mod[:, frist_frame_token_num:-txt_len, :], gate=mod_gate)
+        else:
+            apply_gate_and_accumulate_(img, x_mod[:, :-txt_len, :], gate=mod_gate)
+        apply_gate_and_accumulate_(txt, x_mod[:, -txt_len:, :], gate=mod_gate)
+        return img, txt
+class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
+    def preprocess_loras(self, model_filename, sd):
+        if not "i2v" in model_filename:
+            return sd
+        new_sd = {}
+        for k,v in sd.items():
+            repl_list = ["double_blocks", "single_blocks", "final_layer", "img_mlp", "img_attn_qkv", "img_attn_proj","img_mod", "txt_mlp", "txt_attn_qkv","txt_attn_proj", "txt_mod", "linear1",
+                        "linear2", "modulation",  "mlp_fc1"]
+            src_list = [k +"_" for k in repl_list] +  ["_" + k for k in repl_list]
+            tgt_list = [k +"." for k in repl_list] +  ["." + k for k in repl_list]
+            if k.startswith("Hunyuan_video_I2V_lora_"):
+                # crappy conversion script for non reversible lora naming
+                k = k.replace("Hunyuan_video_I2V_lora_","diffusion_model.")
+                k = k.replace("lora_up","lora_B")
+                k = k.replace("lora_down","lora_A")
+                if "txt_in_individual" in k:
+                    pass
+                for s,t in zip(src_list, tgt_list):
+                    k = k.replace(s,t)
+                if  "individual_token_refiner" in k:
+                    k = k.replace("txt_in_individual_token_refiner_blocks_", "txt_in.individual_token_refiner.blocks.")
+                    k = k.replace("_mlp_fc", ".mlp.fc",)
+                    k = k.replace(".mlp_fc", ".mlp.fc",)
+            new_sd[k] = v
+        return new_sd
+    """
+    HunyuanVideo Transformer backbone
+    Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.
+    Reference:
+    [1] Flux.1: https://github.com/black-forest-labs/flux
+    [2] MMDiT: http://arxiv.org/abs/2403.03206
+    Parameters
+    ----------
+    args: argparse.Namespace
+        The arguments parsed by argparse.
+    patch_size: list
+        The size of the patch.
+    in_channels: int
+        The number of input channels.
+    out_channels: int
+        The number of output channels.
+    hidden_size: int
+        The hidden size of the transformer backbone.
+    heads_num: int
+        The number of attention heads.
+    mlp_width_ratio: float
+        The ratio of the hidden size of the MLP in the transformer block.
+    mlp_act_type: str
+        The activation function of the MLP in the transformer block.
+    mm_double_blocks_depth: int
+        The number of double-stream transformer blocks.
+    mm_single_blocks_depth: int
+        The number of single-stream transformer blocks.
+    rope_dim_list: list
+        The dimension of the rotary embedding for t, h, w.
+    qkv_bias: bool
+        Whether to use bias in the qkv linear layer.
+    qk_norm: bool
+        Whether to use qk norm.
+    qk_norm_type: str
+        The type of qk norm.
+    guidance_embed: bool
+        Whether to use guidance embedding for distillation.
+    text_projection: str
+        The type of the text projection, default is single_refiner.
+    use_attention_mask: bool
+        Whether to use attention mask for text encoder.
+    dtype: torch.dtype
+        The dtype of the model.
+    device: torch.device
+        The device of the model.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        i2v_condition_type,
+        patch_size: list = [1, 2, 2],
+        in_channels: int = 4,  # Should be VAE.config.latent_channels.
+        out_channels: int = None,
+        hidden_size: int = 3072,
+        heads_num: int = 24,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        mm_double_blocks_depth: int = 20,
+        mm_single_blocks_depth: int = 40,
+        rope_dim_list: List[int] = [16, 56, 56],
+        qkv_bias: bool = True,
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        guidance_embed: bool = False,  # For modulation.
+        text_projection: str = "single_refiner",
+        use_attention_mask: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attention_mode: Optional[str] = "sdpa",
+        video_condition: bool = False,
+        audio_condition: bool = False,
+        avatar = False,
+        custom = False,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        # mm_double_blocks_depth , mm_single_blocks_depth = 5, 5
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.unpatchify_channels = self.out_channels
+        self.guidance_embed = guidance_embed
+        self.rope_dim_list = rope_dim_list
+        self.i2v_condition_type = i2v_condition_type
+        self.attention_mode = attention_mode
+        self.video_condition = video_condition
+        self.audio_condition = audio_condition
+        self.avatar = avatar
+        self.custom = custom
+        # Text projection. Default to linear projection.
+        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
+        self.use_attention_mask = use_attention_mask
+        self.text_projection = text_projection
+        self.text_states_dim = 4096
+        self.text_states_dim_2 = 768
+        if hidden_size % heads_num != 0:
+            raise ValueError(
+                f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}"
+            )
+        pe_dim = hidden_size // heads_num
+        if sum(rope_dim_list) != pe_dim:
+            raise ValueError(
+                f"Got {rope_dim_list} but expected positional dim {pe_dim}"
+            )
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        # image projection
+        self.img_in = PatchEmbed(
+            self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
+        )
+        # text projection
+        if self.text_projection == "linear":
+            self.txt_in = TextProjection(
+                self.text_states_dim,
+                self.hidden_size,
+                get_activation_layer("silu"),
+                **factory_kwargs,
+            )
+        elif self.text_projection == "single_refiner":
+            self.txt_in = SingleTokenRefiner(
+                self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs
+            )
+        else:
+            raise NotImplementedError(
+                f"Unsupported text_projection: {self.text_projection}"
+            )
+        # time modulation
+        self.time_in = TimestepEmbedder(
+            self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+        )
+        # text modulation
+        self.vector_in = MLPEmbedder(
+            self.text_states_dim_2, self.hidden_size, **factory_kwargs
+        )
+        # guidance modulation
+        self.guidance_in = (
+            TimestepEmbedder(
+                self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+            )
+            if guidance_embed
+            else None
+        )
+        # double blocks
+        self.double_blocks = nn.ModuleList(
+            [
+                MMDoubleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    qkv_bias=qkv_bias,
+                    attention_mode = attention_mode,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_double_blocks_depth)
+            ]
+        )
+        # single blocks
+        self.single_blocks = nn.ModuleList(
+            [
+                MMSingleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    attention_mode = attention_mode,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_single_blocks_depth)
+            ]
+        )
+        self.final_layer = FinalLayer(
+            self.hidden_size,
+            self.patch_size,
+            self.out_channels,
+            get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        if self.video_condition:
+            self.bg_in = PatchEmbed(
+                self.patch_size, self.in_channels * 2, self.hidden_size, **factory_kwargs
+            )
+            self.bg_proj = nn.Linear(self.hidden_size, self.hidden_size)
+        if audio_condition:
+            if avatar:
+                self.ref_in = PatchEmbed(
+                    self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
+                    )
+                # -------------------- audio_proj_model --------------------
+                self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)
+                # -------------------- motion-embeder --------------------
+                self.motion_exp = TimestepEmbedder(
+                        self.hidden_size // 4,
+                        get_activation_layer("silu"),
+                        **factory_kwargs
+                    )
+                self.motion_pose = TimestepEmbedder(
+                        self.hidden_size // 4,
+                        get_activation_layer("silu"),
+                        **factory_kwargs
+                    )
+                self.fps_proj = TimestepEmbedder(
+                        self.hidden_size,
+                        get_activation_layer("silu"),
+                        **factory_kwargs
+                    )
+                self.before_proj = nn.Linear(self.hidden_size, self.hidden_size)
+                # -------------------- audio_insert_model --------------------
+                self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
+                audio_block_name = "audio_adapter_blocks"
+            elif custom:
+                self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)
+                self.double_stream_list = [1, 3, 5, 7, 9, 11]
+                audio_block_name = "audio_models"
+            self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
+            self.single_stream_list = []
+            self.single_stream_map = {str(i): j+len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}
+            setattr(self, audio_block_name,  nn.ModuleList([
+                PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33) for _ in range(len(self.double_stream_list) + len(self.single_stream_list))
+            ]))
+    def lock_layers_dtypes(self, dtype = torch.float32):
+        layer_list = [self.final_layer, self.final_layer.linear, self.final_layer.adaLN_modulation[1]]
+        target_dype= dtype
+        for current_layer_list, current_dtype in zip([layer_list], [target_dype]):
+            for layer in current_layer_list:
+                layer._lock_dtype = dtype
+                if hasattr(layer, "weight") and layer.weight.dtype != current_dtype :
+                    layer.weight.data = layer.weight.data.to(current_dtype)
+                    if hasattr(layer, "bias"):
+                        layer.bias.data = layer.bias.data.to(current_dtype)
+        self._lock_dtype = dtype
+    def enable_deterministic(self):
+        for block in self.double_blocks:
+            block.enable_deterministic()
+        for block in self.single_blocks:
+            block.enable_deterministic()
+    def disable_deterministic(self):
+        for block in self.double_blocks:
+            block.disable_deterministic()
+        for block in self.single_blocks:
+            block.disable_deterministic()
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,  # Should be in range(0, 1000).
+        ref_latents: torch.Tensor=None,
+        text_states: torch.Tensor = None,
+        text_mask: torch.Tensor = None,  # Row sums give the per-sample text length (see text_len below).
+        text_states_2: Optional[torch.Tensor] = None,  # Text embedding for modulation.
+        freqs_cos: Optional[torch.Tensor] = None,
+        freqs_sin: Optional[torch.Tensor] = None,
+        guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
+        pipeline=None,
+        x_id = 0,
+        step_no = 0,
+        callback = None,
+        audio_prompts = None,
+        motion_exp = None,
+        motion_pose = None,
+        fps = None,
+        face_mask = None,
+        audio_strength = None,
+        bg_latents = None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Run the transformer on a batch of latents and return the unpatchified output.
+        The timestep embedding, the pooled text embedding and the optional
+        guidance / motion / fps signals are summed into one modulation vector.
+        Latents and text are embedded, passed sample by sample through the
+        double-stream blocks and then the single-stream blocks, and projected by
+        the final layer. When ``self.enable_teacache`` is set, the block stack may
+        be skipped and a cached residual added instead.
+        Args:
+            x: Latent input, unpacked as (batch, channels, frames, height, width).
+            t: Timesteps fed to ``time_in``.
+            ref_latents: Optional reference latents; embedded and prepended to the image tokens.
+            text_states: Text token embeddings given to ``txt_in``.
+            text_mask: Text mask; its row sums are used as text lengths.
+            text_states_2: Embedding given to ``vector_in`` and added to the modulation vector.
+            freqs_cos, freqs_sin: Rotary embedding tables forwarded to the blocks.
+            guidance: Guidance value; required when ``self.guidance_embed`` is set.
+            pipeline: Object whose ``_interrupt`` flag is polled before each block call.
+            x_id: Slot index into ``self.previous_residual``; the TeaCache skip decision is only re-evaluated when it is 0.
+            step_no: Current step number, compared against the TeaCache start/last step.
+            callback: Optional callable invoked as ``callback(-1, None, False, True)`` before each block call.
+            audio_prompts: Optional audio features, projected with ``audio_proj``.
+            motion_exp, motion_pose, fps: Optional conditioning added to the modulation vector.
+            face_mask: Mask multiplied into the audio-adapter output (resized here when ``self.avatar`` is set).
+            audio_strength: Optional multiplier on the audio-adapter output.
+            bg_latents: Optional background latents, used when ``self.custom`` and ``self.video_condition`` are set.
+        Returns:
+            The tensor produced by ``unpatchify``, or None when ``pipeline._interrupt`` is set.
+        Raises:
+            ValueError: If ``self.guidance_embed`` is set and ``guidance`` is None.
+            NotImplementedError: If ``self.text_projection`` is neither "linear" nor "single_refiner".
+        """
+        img = x
+        bsz, _, ot, oh, ow = x.shape
+        del x
+        txt = text_states
+        # Patch-grid size along the temporal, height and width axes.
+        tt, th, tw = (
+            ot // self.patch_size[0],
+            oh // self.patch_size[1],
+            ow // self.patch_size[2],
+        )
+        # Prepare modulation vectors.
+        vec = self.time_in(t)
+        if motion_exp != None:
+            vec += self.motion_exp(motion_exp.view(-1)).view(bsz, -1)     # (b, 3072)
+        if motion_pose != None:
+            vec += self.motion_pose(motion_pose.view(-1)).view(bsz, -1)  # (b, 3072)
+        if fps != None:
+            vec += self.fps_proj(fps)   # (b, 3072)
+        if audio_prompts != None:
+            audio_feature_all = self.audio_proj(audio_prompts)
+            # Prepend three copies of the first audio feature along dim 1, then
+            # reshape to one group of 16 x 3072 features per latent frame.
+            audio_feature_pad = audio_feature_all[:,:1].repeat(1,3,1,1)
+            audio_feature_all_insert = torch.cat([audio_feature_pad, audio_feature_all], dim=1).view(bsz, ot, 16, 3072)
+            audio_feature_all = None
+        if self.i2v_condition_type == "token_replace":
+            # Second modulation vector built from timestep 0; it is handed to the
+            # blocks together with the token count of one frame (th * tw).
+            token_replace_t = torch.zeros_like(t)
+            token_replace_vec = self.time_in(token_replace_t)
+            frist_frame_token_num = th * tw
+        else:
+            token_replace_vec = None
+            frist_frame_token_num = None
+            # token_replace_mask_img = None
+            # token_replace_mask_txt = None
+        # text modulation
+        vec_2 = self.vector_in(text_states_2)
+        del text_states_2
+        vec += vec_2
+        if self.i2v_condition_type == "token_replace":
+            token_replace_vec += vec_2
+        del vec_2
+        # guidance modulation
+        if self.guidance_embed:
+            if guidance is None:
+                raise ValueError(
+                    "Didn't get guidance strength for guidance distilled model."
+                )
+            # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
+            vec += self.guidance_in(guidance)
+        # Embed image and text.
+        img, shape_mask = self.img_in(img)
+        if self.avatar:
+            # The first reference frame is embedded with img_in and later prepended
+            # as extra tokens; the full reference goes through ref_in.
+            ref_latents_first = ref_latents[:, :, :1].clone()
+            ref_latents,_ = self.ref_in(ref_latents)
+            ref_latents_first,_ = self.img_in(ref_latents_first)
+        elif self.custom:
+            if ref_latents != None:
+                ref_latents, _ = self.img_in(ref_latents)
+            if bg_latents is not None and self.video_condition:
+                bg_latents, _ = self.bg_in(bg_latents)
+                img += self.bg_proj(bg_latents)
+        if self.text_projection == "linear":
+            txt = self.txt_in(txt)
+        elif self.text_projection == "single_refiner":
+            txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
+        else:
+            raise NotImplementedError(
+                f"Unsupported text_projection: {self.text_projection}"
+            )
+        if self.avatar:
+            img += self.before_proj(ref_latents)
+            ref_length = ref_latents_first.shape[-2]          # [b s c]
+            img = torch.cat([ref_latents_first, img], dim=-2) # t c
+            img_len = img.shape[1]
+            mask_len = img_len - ref_length
+            # NOTE(review): face_mask is dereferenced unconditionally here, so the
+            # avatar path appears to require a face_mask - confirm against callers.
+            if face_mask.shape[2] == 1:
+                face_mask = face_mask.repeat(1,1,ot,1,1)  # repeat if number of mask frame is 1
+            face_mask = torch.nn.functional.interpolate(face_mask, size=[ot, shape_mask[-2], shape_mask[-1]], mode="nearest")
+            # face_mask = face_mask.view(-1,mask_len,1).repeat(1,1,img.shape[-1]).type_as(img)
+            face_mask = face_mask.view(-1,mask_len,1).type_as(img)
+        elif ref_latents == None:
+            ref_length  = None
+        else:
+            ref_length = ref_latents.shape[-2]
+            img = torch.cat([ref_latents, img], dim=-2) # t c
+        txt_seq_len = txt.shape[1]
+        img_seq_len = img.shape[1]
+        # Per-sample sequence length = text tokens counted by the mask + image tokens.
+        text_len = text_mask.sum(1)
+        total_len = text_len + img_seq_len
+        seqlens_q = seqlens_kv = total_len
+        attn_mask = None
+        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+        # TeaCache: on the first sub-call of a step (x_id == 0), decide whether the
+        # block stack can be skipped, based on the accumulated relative L1 change
+        # of the first double block's modulated input (sample 0 only).
+        if self.enable_teacache:
+            if x_id == 0:
+                self.should_calc = True
+                inp = img[0:1]
+                vec_ = vec[0:1]
+                ( img_mod1_shift, img_mod1_scale, _ , _ , _ , _ , ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
+                normed_inp = self.double_blocks[0].img_norm1(inp)
+                normed_inp = normed_inp.to(torch.bfloat16)
+                modulated_inp = modulate( normed_inp, shift=img_mod1_shift, scale=img_mod1_scale )
+                del normed_inp, img_mod1_shift, img_mod1_scale
+                # Steps up to teacache_start_step and the last step always compute.
+                if step_no <= self.teacache_start_step or step_no == self.num_steps-1:
+                    self.accumulated_rel_l1_distance = 0
+                else:
+                    # Polynomial that rescales the relative L1 distance before it is accumulated.
+                    coefficients = [7.33226126e+02, -4.01131952e+02,  6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
+                    rescale_func = np.poly1d(coefficients)
+                    self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
+                    if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
+                        self.should_calc = False
+                        self.teacache_skipped_steps += 1
+                    else:
+                        self.accumulated_rel_l1_distance = 0
+                self.previous_modulated_input = modulated_inp
+        else:
+            self.should_calc = True
+        if not self.should_calc:
+            # Skipped step: reuse the residual cached for this slot.
+            img += self.previous_residual[x_id]
+        else:
+            if self.enable_teacache:
+                self.previous_residual[x_id] = None
+                # Pre-block state of sample 0, kept to rebuild the residual after the blocks.
+                ori_img = img[0:1].clone()
+            # --------------------- Pass through DiT blocks ------------------------
+            for layer_num, block in enumerate(self.double_blocks):
+                for i in range(len(img)):
+                    if callback != None:
+                        callback(-1, None, False, True)
+                    # NOTE(review): pipeline defaults to None but is dereferenced
+                    # unconditionally here - callers presumably always pass it; confirm.
+                    if pipeline._interrupt:
+                        return None
+                    double_block_args = [
+                        img[i:i+1],
+                        txt[i:i+1],
+                        vec[i:i+1],
+                        attn_mask,
+                        seqlens_q[i:i+1],
+                        seqlens_kv[i:i+1],
+                        freqs_cis,
+                        self.i2v_condition_type,
+                        token_replace_vec,
+                        frist_frame_token_num,
+                    ]
+                    img[i], txt[i] = block(*double_block_args)
+                    double_block_args = None
+                    # insert audio feature to img
+                    if audio_prompts != None:
+                        audio_adapter = getattr(self.double_blocks[layer_num], "audio_adapter", None)
+                        if audio_adapter != None:
+                            # Only the tokens after the reference prefix receive audio features.
+                            real_img = img[i:i+1,ref_length:].view(1, ot, -1, 3072)
+                            real_img = audio_adapter(audio_feature_all_insert[i:i+1], real_img).view(1, -1, 3072)
+                            if face_mask != None:
+                                real_img *= face_mask[i:i+1]
+                            if audio_strength != None and audio_strength != 1:
+                                real_img *= audio_strength
+                            img[i:i+1, ref_length:] += real_img
+                            real_img = None
+            for _, block in enumerate(self.single_blocks):
+                for i in range(len(img)):
+                    if callback != None:
+                        callback(-1, None, False, True)
+                    if pipeline._interrupt:
+                        return None
+                    single_block_args = [
+                        # x,
+                        img[i:i+1],
+                        txt[i:i+1],
+                        vec[i:i+1],
+                        txt_seq_len,
+                        attn_mask,
+                        seqlens_q[i:i+1],
+                        seqlens_kv[i:i+1],
+                        (freqs_cos, freqs_sin),
+                        self.i2v_condition_type,
+                        token_replace_vec,
+                        frist_frame_token_num,
+                    ]
+                    img[i], txt[i] = block(*single_block_args)
+                    single_block_args = None
+            # img = x[:, :img_seq_len, ...]
+            if self.enable_teacache:
+                # Cache (block output - ori_img) so a later skipped step can add it back.
+                # NOTE(review): for batches larger than 1 every sample's residual is
+                # taken against sample 0's input (ori_img = img[0:1]) and stored in
+                # slot 0 regardless of x_id - confirm this is intended.
+                if len(img) > 1:
+                    self.previous_residual[0] = torch.empty_like(img)
+                    for i, (x, residual) in enumerate(zip(img, self.previous_residual[0])):
+                        if i < len(img) - 1:
+                            residual[...] = torch.sub(x, ori_img)
+                        else:
+                            residual[...] = ori_img
+                            torch.sub(x, ori_img, out=residual)
+                    x = None
+                else:
+                    # Reuse ori_img's storage for the residual instead of allocating a new tensor.
+                    self.previous_residual[x_id] = ori_img
+                    torch.sub(img, ori_img, out=self.previous_residual[x_id])
+        # Drop the reference-token prefix before the final projection.
+        if ref_length != None:
+            img = img[:, ref_length:]
+        # ---------------------------- Final layer ------------------------------
+        out_dtype = self.final_layer.linear.weight.dtype
+        vec = vec.to(out_dtype)
+        img_list  = []
+        # The final layer is applied one sample at a time, in its own weight dtype.
+        for img_chunk, vec_chunk in zip(img,vec):
+             img_list.append( self.final_layer(img_chunk.to(out_dtype).unsqueeze(0), vec_chunk.unsqueeze(0))) # (N, T, patch_size ** 2 * out_channels)
+        img = torch.cat(img_list)
+        img_list = None
+        # img = self.unpatchify(img, tt, th, tw)
+        img = self.unpatchify(img, tt, th, tw)
+        return img
+    def unpatchify(self, x, t, h, w):
+        """Fold a sequence of patch tokens back into a 5-D latent volume.
+        Args:
+            x (torch.Tensor): Tokens of shape (N, t * h * w, C * pt * ph * pw), where C is
+                ``self.unpatchify_channels`` and (pt, ph, pw) is ``self.patch_size``.
+            t, h, w (int): Number of patches along the temporal, height and width axes.
+        Returns:
+            torch.Tensor: Tensor of shape (N, C, t * pt, h * ph, w * pw).
+        """
+        c = self.unpatchify_channels
+        pt, ph, pw = self.patch_size
+        assert t * h * w == x.shape[1], (
+            f"expected {t} * {h} * {w} = {t * h * w} tokens, got {x.shape[1]}"
+        )
+        x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
+        # (n, t, h, w, c, pt, ph, pw) -> (n, c, t, pt, h, ph, w, pw): each patch axis
+        # is moved next to the grid axis it expands, so the reshape below merges them.
+        x = torch.einsum("nthwcopq->nctohpwq", x)
+        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
+        return imgs
+    def params_count(self):
+        """Count parameters of the attention/MLP projections and of the whole model.
+        Returns:
+            dict: "double" (image and text qkv, proj and mlp of every double block),
+            "single" (linear1 and linear2 of every single block), "total" (every
+            parameter of the model) and "attn+mlp" ("double" + "single").
+        """
+        def numel(module):
+            return sum(p.numel() for p in module.parameters())
+        double_parts = (
+            "img_attn_qkv", "img_attn_proj", "img_mlp",
+            "txt_attn_qkv", "txt_attn_proj", "txt_mlp",
+        )
+        single_parts = ("linear1", "linear2")
+        double_total = sum(
+            numel(getattr(blk, part)) for blk in self.double_blocks for part in double_parts
+        )
+        single_total = sum(
+            numel(getattr(blk, part)) for blk in self.single_blocks for part in single_parts
+        )
+        counts = {
+            "double": double_total,
+            "single": single_total,
+            "total": numel(self),
+        }
+        counts["attn+mlp"] = double_total + single_total
+        return counts
+#################################################################################
+#                             HunyuanVideo Configs                              #
+#################################################################################
+# Architecture hyper-parameters keyed by model variant name.
+HUNYUAN_VIDEO_CONFIG = {
+    "HYVideo-T/2": dict(
+        mm_double_blocks_depth=20,
+        mm_single_blocks_depth=40,
+        rope_dim_list=[16, 56, 56],
+        hidden_size=3072,
+        heads_num=24,
+        mlp_width_ratio=4,
+    ),
+    "HYVideo-T/2-cfgdistill": dict(
+        mm_double_blocks_depth=20,
+        mm_single_blocks_depth=40,
+        rope_dim_list=[16, 56, 56],
+        hidden_size=3072,
+        heads_num=24,
+        mlp_width_ratio=4,
+        guidance_embed=True,
+    ),
+    "HYVideo-S/2": dict(
+        mm_double_blocks_depth=6,
+        mm_single_blocks_depth=12,
+        rope_dim_list=[12, 42, 42],
+        hidden_size=480,
+        heads_num=5,
+        mlp_width_ratio=4,
+    ),
+    # 9.0B / 12.5B
+    "HYVideo-T/2-custom": dict(
+        mm_double_blocks_depth=20,
+        mm_single_blocks_depth=40,
+        rope_dim_list=[16, 56, 56],
+        hidden_size=3072,
+        heads_num=24,
+        mlp_width_ratio=4,
+        custom=True,
+    ),
+    # 9.0B / 12.5B
+    "HYVideo-T/2-custom-audio": dict(
+        mm_double_blocks_depth=20,
+        mm_single_blocks_depth=40,
+        rope_dim_list=[16, 56, 56],
+        hidden_size=3072,
+        heads_num=24,
+        mlp_width_ratio=4,
+        custom=True,
+        audio_condition=True,
+    ),
+    # 9.0B / 12.5B
+    "HYVideo-T/2-custom-edit": dict(
+        mm_double_blocks_depth=20,
+        mm_single_blocks_depth=40,
+        rope_dim_list=[16, 56, 56],
+        hidden_size=3072,
+        heads_num=24,
+        mlp_width_ratio=4,
+        custom=True,
+        video_condition=True,
+    ),
+    # 9.0B / 12.5B
+    "HYVideo-T/2-avatar": dict(
+        mm_double_blocks_depth=20,
+        mm_single_blocks_depth=40,
+        rope_dim_list=[16, 56, 56],
+        hidden_size=3072,
+        heads_num=24,
+        mlp_width_ratio=4,
+        avatar=True,
+        audio_condition=True,
+    ),
+}

hyvideo/modules/modulate_layers.py ADDED Viewed

	@@ -0,0 +1,136 @@

+from typing import Callable
+import torch
+import torch.nn as nn
+import math
+class ModulateDiT(nn.Module):
+    """Zero-initialised ``activation -> linear`` projection that produces DiT modulation parameters."""
+    def __init__(
+        self,
+        hidden_size: int,
+        factor: int,
+        act_layer: Callable,
+        dtype=None,
+        device=None,
+    ):
+        """
+        Args:
+            hidden_size (int): Width of the conditioning vector.
+            factor (int): Output multiplier; the linear layer maps hidden_size to factor * hidden_size.
+            act_layer (Callable): Zero-argument factory for the activation applied before the linear layer.
+            dtype, device: Forwarded to nn.Linear.
+        """
+        super().__init__()
+        self.act = act_layer()
+        self.linear = nn.Linear(
+            hidden_size, factor * hidden_size, bias=True, dtype=dtype, device=device
+        )
+        # Weight and bias both start at zero, so a freshly built layer outputs zeros.
+        for param in (self.linear.weight, self.linear.bias):
+            nn.init.zeros_(param)
+    def forward(self, x: torch.Tensor, condition_type=None, token_replace_vec=None) -> torch.Tensor:
+        """Project x, and for "token_replace" also token_replace_vec, through act + linear.
+        Returns:
+            The projection of x; when condition_type == "token_replace", a tuple
+            (projection of x, projection of token_replace_vec).
+        """
+        x_out = self.linear(self.act(x))
+        if condition_type != "token_replace":
+            return x_out
+        return x_out, self.linear(self.act(token_replace_vec))
+def modulate(x, shift=None, scale=None):
+    """Apply x * (1 + scale) + shift, skipping whichever term is None.
+    Note: a later definition of ``modulate`` in this module takes the same
+    leading arguments and replaces this one once the module has been imported.
+    Args:
+        x (torch.Tensor): input tensor.
+        shift (torch.Tensor, optional): shift tensor, unsqueezed at dim 1 before use. Defaults to None.
+        scale (torch.Tensor, optional): scale tensor, unsqueezed at dim 1 before use. Defaults to None.
+    Returns:
+        torch.Tensor: the modulated tensor (x itself when both shift and scale are None).
+    """
+    if scale is not None:
+        x = x * (1 + scale.unsqueeze(1))
+    if shift is not None:
+        x = x + shift.unsqueeze(1)
+    return x
+def modulate_(x, shift=None, scale=None):
+    """In-place variant of modulate: overwrite x with x * (1 + scale) + shift.
+    Args:
+        x (torch.Tensor): tensor that is modified in place.
+        shift (torch.Tensor, optional): shift tensor, unsqueezed at dim 1 before use. Defaults to None.
+        scale (torch.Tensor, optional): scale tensor, unsqueezed at dim 1 before use. Defaults to None.
+    Returns:
+        torch.Tensor: x itself, after the update.
+    """
+    if scale is None and shift is None:
+        return x
+    if shift is None:
+        return x.mul_((scale + 1).unsqueeze(1))
+    if scale is None:
+        # Mutate x like every other branch, so callers that ignore the return
+        # value still see the shift applied.
+        return x.add_(shift.unsqueeze(1))
+    # x <- shift + x * (1 + scale), written straight into x's storage.
+    torch.addcmul(shift.unsqueeze(1), x, (scale + 1).unsqueeze(1), out=x)
+    return x
+def modulate(x, shift=None, scale=None, condition_type=None,
+             tr_shift=None, tr_scale=None,
+             frist_frame_token_num=None):
+    """Apply x * (1 + scale) + shift, with a separate shift/scale for the leading tokens.
+    Args:
+        x (torch.Tensor): input tensor; dim 1 is the token axis.
+        shift (torch.Tensor, optional): shift tensor. Defaults to None.
+        scale (torch.Tensor, optional): scale tensor. Defaults to None.
+        condition_type (str, optional): "token_replace" enables the split behaviour below.
+        tr_shift, tr_scale (torch.Tensor, optional): shift/scale used for the first
+            frist_frame_token_num tokens when condition_type == "token_replace".
+        frist_frame_token_num (int, optional): number of leading tokens modulated with tr_shift/tr_scale.
+    Returns:
+        torch.Tensor: the modulated tensor (x itself when nothing is applied).
+    """
+    if condition_type == "token_replace":
+        head = x[:, :frist_frame_token_num]
+        tail = x[:, frist_frame_token_num:]
+        head = head * (1 + tr_scale.unsqueeze(1)) + tr_shift.unsqueeze(1)
+        tail = tail * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        return torch.concat((head, tail), dim=1)
+    # Plain path: apply whichever of scale / shift was provided.
+    if scale is not None:
+        x = x * (1 + scale.unsqueeze(1))
+    if shift is not None:
+        x = x + shift.unsqueeze(1)
+    return x
+def apply_gate(x, gate=None, tanh=False, condition_type=None, tr_gate=None, frist_frame_token_num=None):
+    """Multiply x by a gate, optionally with a separate gate for the leading tokens.
+    Args:
+        x (torch.Tensor): input tensor; dim 1 is the token axis.
+        gate (torch.Tensor, optional): gate tensor, unsqueezed at dim 1 before use. When None, x is returned unchanged.
+        tanh (bool, optional): whether to pass the gate(s) through tanh first. Defaults to False.
+        condition_type (str, optional): "token_replace" gates the first frist_frame_token_num tokens with tr_gate.
+        tr_gate (torch.Tensor, optional): gate for the leading tokens in "token_replace" mode.
+        frist_frame_token_num (int, optional): number of leading tokens gated with tr_gate.
+    Returns:
+        torch.Tensor: the output tensor after apply gate.
+    """
+    if gate is None:
+        return x
+    if condition_type == "token_replace":
+        head_gate = tr_gate.unsqueeze(1)
+        tail_gate = gate.unsqueeze(1)
+        if tanh:
+            head_gate = head_gate.tanh()
+            tail_gate = tail_gate.tanh()
+        head = x[:, :frist_frame_token_num] * head_gate
+        tail = x[:, frist_frame_token_num:] * tail_gate
+        return torch.concat((head, tail), dim=1)
+    g = gate.unsqueeze(1)
+    if tanh:
+        g = g.tanh()
+    return x * g
+def apply_gate_and_accumulate_(accumulator, x, gate=None, tanh=False):
+    """Add x * gate (or x * tanh(gate)) to accumulator in place and return accumulator.
+    When gate is None the accumulator is returned untouched.
+    """
+    if gate is None:
+        return accumulator
+    g = gate.unsqueeze(1)
+    if tanh:
+        g = g.tanh()
+    return accumulator.addcmul_(x, g)
+def ckpt_wrapper(module):
+    """Return a plain function that forwards its positional inputs to module."""
+    def ckpt_forward(*inputs):
+        return module(*inputs)
+    return ckpt_forward

hyvideo/modules/norm_layers.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import torch
+import torch.nn as nn
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension, with an optional learnable scale."""
+    def __init__(
+        self,
+        dim: int,
+        elementwise_affine=True,
+        eps: float = 1e-6,
+        device=None,
+        dtype=None,
+    ):
+        """
+        Build the RMSNorm layer.
+        Args:
+            dim (int): Size of the last dimension of the input tensor.
+            elementwise_affine (bool, optional): When True, a learnable ``weight`` of ones is created. Default is True.
+            eps (float, optional): Value added to the mean square for numerical stability. Default is 1e-6.
+            device, dtype: Forwarded to the ``weight`` tensor constructor.
+        Attributes:
+            eps (float): Value added to the mean square for numerical stability.
+            weight (nn.Parameter): Learnable scale; only present when elementwise_affine is True.
+        """
+        super().__init__()
+        self.eps = eps
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))
+    def _norm(self, x):
+        """
+        Divide x by the root mean square of its last dimension.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The normalized tensor.
+        """
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+    def forward(self, x):
+        """
+        Normalize x in float32, cast back to x's type, then apply ``weight`` if present.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+        """
+        normed = self._norm(x.float()).type_as(x)
+        if not hasattr(self, "weight"):
+            return normed
+        return normed * self.weight
+    def apply_(self, x):
+        """In-place variant: normalize x in its own dtype, overwrite it, and return it."""
+        inv_rms = x.pow(2).mean(-1, keepdim=True)
+        inv_rms.add_(self.eps).rsqrt_()
+        x.mul_(inv_rms)
+        del inv_rms
+        if hasattr(self, "weight"):
+            x.mul_(self.weight)
+        return x
+def get_norm_layer(norm_layer):
+    """
+    Look up a normalization layer class by name.
+    Args:
+        norm_layer (str): "layer" for nn.LayerNorm or "rms" for RMSNorm.
+    Returns:
+        norm_layer (nn.Module): The normalization layer class (not an instance).
+    Raises:
+        NotImplementedError: If the name is not recognised.
+    """
+    if norm_layer == "rms":
+        return RMSNorm
+    if norm_layer == "layer":
+        return nn.LayerNorm
+    raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")

hyvideo/modules/original models.py ADDED Viewed

	@@ -0,0 +1,760 @@

+from typing import Any, List, Tuple, Optional, Union, Dict
+from einops import rearrange
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.models import ModelMixin
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from .activation_layers import get_activation_layer
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
+from .attenion import attention, parallel_attention, get_cu_seqlens
+from .posemb_layers import apply_rotary_emb
+from .mlp_layers import MLP, MLPEmbedder, FinalLayer
+from .modulate_layers import ModulateDiT, modulate, apply_gate
+from .token_refiner import SingleTokenRefiner
+class MMDoubleStreamBlock(nn.Module):
+    """
+    A multimodal dit block with separate modulation for
+    text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
+                                     (Flux.1): https://github.com/black-forest-labs/flux
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qkv_bias: bool = False,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        """
+        Build the image and text branches (modulation, norms, qkv/proj linears, MLP).
+        Args:
+            hidden_size (int): Token width; the per-head width is hidden_size // heads_num.
+            heads_num (int): Number of attention heads.
+            mlp_width_ratio (float): MLP hidden width is int(hidden_size * mlp_width_ratio).
+            mlp_act_type (str): Activation name passed to get_activation_layer for the MLPs.
+            qk_norm (bool): When True, q and k are normalized per head; otherwise nn.Identity is used.
+            qk_norm_type (str): Norm name passed to get_norm_layer.
+            qkv_bias (bool): Bias flag for the qkv linears; also used for the attention output projections.
+            dtype, device: Forwarded to every sub-layer.
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.deterministic = False
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        # ---- image branch ----
+        self.img_mod = ModulateDiT(
+            hidden_size,
+            factor=6,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.img_norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.img_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.img_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.img_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.img_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+        self.img_norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.img_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+        # ---- text branch (mirrors the image branch) ----
+        self.txt_mod = ModulateDiT(
+            hidden_size,
+            factor=6,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.txt_norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.txt_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        self.txt_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.txt_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.txt_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+        self.txt_norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.txt_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+        # When set, forward() routes attention through parallel_attention instead of attention.
+        self.hybrid_seq_parallel_attn = None
+    def enable_deterministic(self):
+        """Set the ``deterministic`` flag."""
+        self.deterministic = True
+    def disable_deterministic(self):
+        """Clear the ``deterministic`` flag."""
+        self.deterministic = False
+    def forward(
+        self,
+        img: torch.Tensor,
+        txt: torch.Tensor,
+        vec: torch.Tensor,
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        freqs_cis: tuple = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Run joint attention over image and text tokens, then a gated MLP per stream.
+        Args:
+            img (torch.Tensor): Image tokens, laid out as (B, L, hidden) by the rearrange pattern below.
+            txt (torch.Tensor): Text tokens, same layout.
+            vec (torch.Tensor): Conditioning vector fed to both modulation layers.
+            cu_seqlens_q, cu_seqlens_kv: Cumulative sequence lengths forwarded to the attention
+                call; cu_seqlens_q must have 2 * B + 1 entries (asserted below).
+            max_seqlen_q, max_seqlen_kv: Maximum sequence lengths forwarded to ``attention``.
+            freqs_cis (tuple, optional): Rotary embedding tables; when given, RoPE is applied to the image q/k only.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: The updated (img, txt).
+        """
+        # Each modulation layer yields shift/scale/gate for the attention step (mod1)
+        # and for the MLP step (mod2).
+        (
+            img_mod1_shift,
+            img_mod1_scale,
+            img_mod1_gate,
+            img_mod2_shift,
+            img_mod2_scale,
+            img_mod2_gate,
+        ) = self.img_mod(vec).chunk(6, dim=-1)
+        (
+            txt_mod1_shift,
+            txt_mod1_scale,
+            txt_mod1_gate,
+            txt_mod2_shift,
+            txt_mod2_scale,
+            txt_mod2_gate,
+        ) = self.txt_mod(vec).chunk(6, dim=-1)
+        # Prepare image for attention.
+        img_modulated = self.img_norm1(img)
+        img_modulated = modulate(
+            img_modulated, shift=img_mod1_shift, scale=img_mod1_scale
+        )
+        img_qkv = self.img_attn_qkv(img_modulated)
+        img_q, img_k, img_v = rearrange(
+            img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num
+        )
+        # Apply QK-Norm if needed
+        img_q = self.img_attn_q_norm(img_q).to(img_v)
+        img_k = self.img_attn_k_norm(img_k).to(img_v)
+        # Apply RoPE if needed.
+        if freqs_cis is not None:
+            img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+            assert (
+                img_qq.shape == img_q.shape and img_kk.shape == img_k.shape
+            ), f"img_qq: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}"
+            img_q, img_k = img_qq, img_kk
+        # Prepare txt for attention.
+        txt_modulated = self.txt_norm1(txt)
+        txt_modulated = modulate(
+            txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale
+        )
+        txt_qkv = self.txt_attn_qkv(txt_modulated)
+        txt_q, txt_k, txt_v = rearrange(
+            txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num
+        )
+        # Apply QK-Norm if needed.
+        txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
+        txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
+        # Run actual attention.
+        # Image tokens come first along the sequence axis, text tokens after them.
+        q = torch.cat((img_q, txt_q), dim=1)
+        k = torch.cat((img_k, txt_k), dim=1)
+        v = torch.cat((img_v, txt_v), dim=1)
+        assert (
+            cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
+        ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, img.shape[0]:{img.shape[0]}"
+        # attention computation start
+        if not self.hybrid_seq_parallel_attn:
+            attn = attention(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                batch_size=img_k.shape[0],
+            )
+        else:
+            attn = parallel_attention(
+                self.hybrid_seq_parallel_attn,
+                q,
+                k,
+                v,
+                img_q_len=img_q.shape[1],
+                img_kv_len=img_k.shape[1],
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv
+            )
+        # attention computation end
+        # Split the joint attention output back into its image and text parts.
+        img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]
+        # Calculate the img blocks.
+        img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
+        img = img + apply_gate(
+            self.img_mlp(
+                modulate(
+                    self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale
+                )
+            ),
+            gate=img_mod2_gate,
+        )
+        # Calculate the txt blocks.
+        txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
+        txt = txt + apply_gate(
+            self.txt_mlp(
+                modulate(
+                    self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale
+                )
+            ),
+            gate=txt_mod2_gate,
+        )
+        return img, txt
+class MMSingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    Also refer to (SD3): https://arxiv.org/abs/2403.03206
+                  (Flux.1): https://github.com/black-forest-labs/flux
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qk_scale: float = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.deterministic = False
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        self.mlp_hidden_dim = mlp_hidden_dim
+        self.scale = qk_scale or head_dim ** -0.5
+        # qkv and mlp_in
+        self.linear1 = nn.Linear(
+            hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs
+        )
+        # proj and mlp_out
+        self.linear2 = nn.Linear(
+            hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.pre_norm = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.mlp_act = get_activation_layer(mlp_act_type)()
+        self.modulation = ModulateDiT(
+            hidden_size,
+            factor=3,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.hybrid_seq_parallel_attn = None
+    def enable_deterministic(self):
+        self.deterministic = True
+    def disable_deterministic(self):
+        self.deterministic = False
+    def forward(
+        self,
+        x: torch.Tensor,
+        vec: torch.Tensor,
+        txt_len: int,
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+    ) -> torch.Tensor:
+        mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
+        x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
+        qkv, mlp = torch.split(
+            self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
+        )
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+        # Apply QK-Norm if needed.
+        q = self.q_norm(q).to(v)
+        k = self.k_norm(k).to(v)
+        # Apply RoPE if needed.
+        if freqs_cis is not None:
+            img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
+            img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
+            img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+            assert (
+                img_qq.shape == img_q.shape and img_kk.shape == img_k.shape
+            ), f"img_kk: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}"
+            img_q, img_k = img_qq, img_kk
+            q = torch.cat((img_q, txt_q), dim=1)
+            k = torch.cat((img_k, txt_k), dim=1)
+        # Compute attention.
+        assert (
+            cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1
+        ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
+        # attention computation start
+        if not self.hybrid_seq_parallel_attn:
+            attn = attention(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                batch_size=x.shape[0],
+            )
+        else:
+            attn = parallel_attention(
+                self.hybrid_seq_parallel_attn,
+                q,
+                k,
+                v,
+                img_q_len=img_q.shape[1],
+                img_kv_len=img_k.shape[1],
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv
+            )
+        # attention computation end
+        # Compute activation in mlp stream, cat again and run second linear layer.
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        return x + apply_gate(output, gate=mod_gate)
+class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
+    """
+    HunyuanVideo Transformer backbone
+    Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.
+    Reference:
+    [1] Flux.1: https://github.com/black-forest-labs/flux
+    [2] MMDiT: http://arxiv.org/abs/2403.03206
+    Parameters
+    ----------
+    args: argparse.Namespace
+        The arguments parsed by argparse.
+    patch_size: list
+        The size of the patch.
+    in_channels: int
+        The number of input channels.
+    out_channels: int
+        The number of output channels.
+    hidden_size: int
+        The hidden size of the transformer backbone.
+    heads_num: int
+        The number of attention heads.
+    mlp_width_ratio: float
+        The ratio of the hidden size of the MLP in the transformer block.
+    mlp_act_type: str
+        The activation function of the MLP in the transformer block.
+    depth_double_blocks: int
+        The number of transformer blocks in the double blocks.
+    depth_single_blocks: int
+        The number of transformer blocks in the single blocks.
+    rope_dim_list: list
+        The dimension of the rotary embedding for t, h, w.
+    qkv_bias: bool
+        Whether to use bias in the qkv linear layer.
+    qk_norm: bool
+        Whether to use qk norm.
+    qk_norm_type: str
+        The type of qk norm.
+    guidance_embed: bool
+        Whether to use guidance embedding for distillation.
+    text_projection: str
+        The type of the text projection, default is single_refiner.
+    use_attention_mask: bool
+        Whether to use attention mask for text encoder.
+    dtype: torch.dtype
+        The dtype of the model.
+    device: torch.device
+        The device of the model.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        args: Any,
+        patch_size: list = [1, 2, 2],
+        in_channels: int = 4,  # Should be VAE.config.latent_channels.
+        out_channels: int = None,
+        hidden_size: int = 3072,
+        heads_num: int = 24,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        mm_double_blocks_depth: int = 20,
+        mm_single_blocks_depth: int = 40,
+        rope_dim_list: List[int] = [16, 56, 56],
+        qkv_bias: bool = True,
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        guidance_embed: bool = False,  # For modulation.
+        text_projection: str = "single_refiner",
+        use_attention_mask: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.unpatchify_channels = self.out_channels
+        self.guidance_embed = guidance_embed
+        self.rope_dim_list = rope_dim_list
+        # Text projection. Default to linear projection.
+        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
+        self.use_attention_mask = use_attention_mask
+        self.text_projection = text_projection
+        self.text_states_dim = args.text_states_dim
+        self.text_states_dim_2 = args.text_states_dim_2
+        if hidden_size % heads_num != 0:
+            raise ValueError(
+                f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}"
+            )
+        pe_dim = hidden_size // heads_num
+        if sum(rope_dim_list) != pe_dim:
+            raise ValueError(
+                f"Got {rope_dim_list} but expected positional dim {pe_dim}"
+            )
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        # image projection
+        self.img_in = PatchEmbed(
+            self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
+        )
+        # text projection
+        if self.text_projection == "linear":
+            self.txt_in = TextProjection(
+                self.text_states_dim,
+                self.hidden_size,
+                get_activation_layer("silu"),
+                **factory_kwargs,
+            )
+        elif self.text_projection == "single_refiner":
+            self.txt_in = SingleTokenRefiner(
+                self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs
+            )
+        else:
+            raise NotImplementedError(
+                f"Unsupported text_projection: {self.text_projection}"
+            )
+        # time modulation
+        self.time_in = TimestepEmbedder(
+            self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+        )
+        # text modulation
+        self.vector_in = MLPEmbedder(
+            self.text_states_dim_2, self.hidden_size, **factory_kwargs
+        )
+        # guidance modulation
+        self.guidance_in = (
+            TimestepEmbedder(
+                self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+            )
+            if guidance_embed
+            else None
+        )
+        # double blocks
+        self.double_blocks = nn.ModuleList(
+            [
+                MMDoubleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    qkv_bias=qkv_bias,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_double_blocks_depth)
+            ]
+        )
+        # single blocks
+        self.single_blocks = nn.ModuleList(
+            [
+                MMSingleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_single_blocks_depth)
+            ]
+        )
+        self.final_layer = FinalLayer(
+            self.hidden_size,
+            self.patch_size,
+            self.out_channels,
+            get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+    def enable_deterministic(self):
+        for block in self.double_blocks:
+            block.enable_deterministic()
+        for block in self.single_blocks:
+            block.enable_deterministic()
+    def disable_deterministic(self):
+        for block in self.double_blocks:
+            block.disable_deterministic()
+        for block in self.single_blocks:
+            block.disable_deterministic()
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,  # Should be in range(0, 1000).
+        text_states: torch.Tensor = None,
+        text_mask: torch.Tensor = None,  # Now we don't use it.
+        text_states_2: Optional[torch.Tensor] = None,  # Text embedding for modulation.
+        freqs_cos: Optional[torch.Tensor] = None,
+        freqs_sin: Optional[torch.Tensor] = None,
+        guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
+        return_dict: bool = True,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        out = {}
+        img = x
+        txt = text_states
+        _, _, ot, oh, ow = x.shape
+        tt, th, tw = (
+            ot // self.patch_size[0],
+            oh // self.patch_size[1],
+            ow // self.patch_size[2],
+        )
+        # Prepare modulation vectors.
+        vec = self.time_in(t)
+        # text modulation
+        vec = vec + self.vector_in(text_states_2)
+        # guidance modulation
+        if self.guidance_embed:
+            if guidance is None:
+                raise ValueError(
+                    "Didn't get guidance strength for guidance distilled model."
+                )
+            # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
+            vec = vec + self.guidance_in(guidance)
+        # Embed image and text.
+        img = self.img_in(img)
+        if self.text_projection == "linear":
+            txt = self.txt_in(txt)
+        elif self.text_projection == "single_refiner":
+            txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
+        else:
+            raise NotImplementedError(
+                f"Unsupported text_projection: {self.text_projection}"
+            )
+        txt_seq_len = txt.shape[1]
+        img_seq_len = img.shape[1]
+        # Compute cu_squlens and max_seqlen for flash attention
+        cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
+        cu_seqlens_kv = cu_seqlens_q
+        max_seqlen_q = img_seq_len + txt_seq_len
+        max_seqlen_kv = max_seqlen_q
+        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+        # --------------------- Pass through DiT blocks ------------------------
+        for _, block in enumerate(self.double_blocks):
+            double_block_args = [
+                img,
+                txt,
+                vec,
+                cu_seqlens_q,
+                cu_seqlens_kv,
+                max_seqlen_q,
+                max_seqlen_kv,
+                freqs_cis,
+            ]
+            img, txt = block(*double_block_args)
+        # Merge txt and img to pass through single stream blocks.
+        x = torch.cat((img, txt), 1)
+        if len(self.single_blocks) > 0:
+            for _, block in enumerate(self.single_blocks):
+                single_block_args = [
+                    x,
+                    vec,
+                    txt_seq_len,
+                    cu_seqlens_q,
+                    cu_seqlens_kv,
+                    max_seqlen_q,
+                    max_seqlen_kv,
+                    (freqs_cos, freqs_sin),
+                ]
+                x = block(*single_block_args)
+        img = x[:, :img_seq_len, ...]
+        # ---------------------------- Final layer ------------------------------
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.unpatchify(img, tt, th, tw)
+        if return_dict:
+            out["x"] = img
+            return out
+        return img
+    def unpatchify(self, x, t, h, w):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.unpatchify_channels
+        pt, ph, pw = self.patch_size
+        assert t * h * w == x.shape[1]
+        x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
+        x = torch.einsum("nthwcopq->nctohpwq", x)
+        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
+        return imgs
+    def params_count(self):
+        counts = {
+            "double": sum(
+                [
+                    sum(p.numel() for p in block.img_attn_qkv.parameters())
+                    + sum(p.numel() for p in block.img_attn_proj.parameters())
+                    + sum(p.numel() for p in block.img_mlp.parameters())
+                    + sum(p.numel() for p in block.txt_attn_qkv.parameters())
+                    + sum(p.numel() for p in block.txt_attn_proj.parameters())
+                    + sum(p.numel() for p in block.txt_mlp.parameters())
+                    for block in self.double_blocks
+                ]
+            ),
+            "single": sum(
+                [
+                    sum(p.numel() for p in block.linear1.parameters())
+                    + sum(p.numel() for p in block.linear2.parameters())
+                    for block in self.single_blocks
+                ]
+            ),
+            "total": sum(p.numel() for p in self.parameters()),
+        }
+        counts["attn+mlp"] = counts["double"] + counts["single"]
+        return counts
+#################################################################################
+#                             HunyuanVideo Configs                              #
+#################################################################################
+# Named presets; each value is a dict whose keys match keyword arguments of
+# HYVideoDiffusionTransformer.__init__.
+# For both presets hidden_size // heads_num == 128 == sum(rope_dim_list), which
+# is the consistency the constructor checks before building the model.
+HUNYUAN_VIDEO_CONFIG = {
+    # Base preset: guidance_embed is not set, so the constructor default
+    # (False) applies.
+    "HYVideo-T/2": {
+        "mm_double_blocks_depth": 20,
+        "mm_single_blocks_depth": 40,
+        "rope_dim_list": [16, 56, 56],
+        "hidden_size": 3072,
+        "heads_num": 24,
+        "mlp_width_ratio": 4,
+    },
+    # Same architecture as above, plus the guidance embedding (the model then
+    # requires a `guidance` tensor in forward()).
+    "HYVideo-T/2-cfgdistill": {
+        "mm_double_blocks_depth": 20,
+        "mm_single_blocks_depth": 40,
+        "rope_dim_list": [16, 56, 56],
+        "hidden_size": 3072,
+        "heads_num": 24,
+        "mlp_width_ratio": 4,
+        "guidance_embed": True,
+    },
+}

hyvideo/modules/placement.py ADDED Viewed

	@@ -0,0 +1,389 @@

+import torch
+import triton
+import triton.language as tl
+def hunyuan_token_reorder_to_token_major(tensor, fix_len, reorder_len, reorder_num_frame, frame_size):
+    """Reorder it from frame major to token major!"""
+    assert reorder_len == reorder_num_frame * frame_size
+    assert tensor.shape[2] == fix_len + reorder_len
+    tensor[:, :, :-fix_len, :] = tensor[:, :, :-fix_len:, :].reshape(tensor.shape[0], tensor.shape[1], reorder_num_frame, frame_size, tensor.shape[3]) \
+                                                         .transpose(2, 3).reshape(tensor.shape[0], tensor.shape[1], reorder_len, tensor.shape[3])
+    return tensor
+def hunyuan_token_reorder_to_frame_major(tensor, fix_len, reorder_len, reorder_num_frame, frame_size):
+    """Reorder it from token major to frame major!"""
+    assert reorder_len == reorder_num_frame * frame_size
+    assert tensor.shape[2] == fix_len + reorder_len
+    tensor[:, :, :-fix_len:, :] = tensor[:, :, :-fix_len:, :].reshape(tensor.shape[0], tensor.shape[1], frame_size, reorder_num_frame, tensor.shape[3]) \
+                                                         .transpose(2, 3).reshape(tensor.shape[0], tensor.shape[1], reorder_len, tensor.shape[3])
+    return tensor
+# Triton kernel: copy query/key/value into the output buffers, one
+# (cfg, head, token-block) tile per program. Heads whose best-mask index is
+# non-zero ("temporal") get their first `seq_len - context_length` tokens
+# permuted from frame-major to token-major; all other heads, and the trailing
+# `context_length` tokens of every head, are copied to the same position.
+# NOTE(review): all six tensors are addressed with the query strides, so they
+# are presumably expected to share query's memory layout -- confirm at callers.
+@triton.jit
+def hunyuan_sparse_head_placement_kernel(
+    query_ptr, key_ptr, value_ptr, # [cfg, num_heads, seq_len, head_dim] seq_len = context_length + num_frame * frame_size
+    query_out_ptr, key_out_ptr, value_out_ptr, # [cfg, num_heads, seq_len, head_dim]
+    best_mask_idx_ptr, # [cfg, num_heads]
+    query_stride_b, query_stride_h, query_stride_s, query_stride_d,
+    mask_idx_stride_b, mask_idx_stride_h,
+    seq_len: tl.constexpr,
+    head_dim: tl.constexpr,
+    context_length: tl.constexpr,
+    num_frame: tl.constexpr,
+    frame_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr
+):
+    # Copy query, key, value to output
+    # range: [b, h, block_id * block_size: block_id * block_size + block_size, :]
+    cfg = tl.program_id(0)
+    head = tl.program_id(1)
+    block_id = tl.program_id(2)
+    start_id = block_id * BLOCK_SIZE
+    # NOTE(review): end_id is computed but never read below; out-of-range
+    # tokens are handled by offset_mask instead.
+    end_id = start_id + BLOCK_SIZE
+    end_id = tl.where(end_id > seq_len, seq_len, end_id)
+    # Load best mask idx (0 is spatial, 1 is temporal)
+    is_temporal = tl.load(best_mask_idx_ptr + cfg * mask_idx_stride_b + head * mask_idx_stride_h)
+    # Token indices handled by this program and the mask for the last,
+    # possibly partial, block.
+    offset_token = tl.arange(0, BLOCK_SIZE) + start_id
+    offset_mask = offset_token < seq_len
+    offset_d = tl.arange(0, head_dim)
+    if is_temporal:
+        # Source index is frame-major: token = frame_id * frame_size + patch_id.
+        frame_id = offset_token // frame_size
+        patch_id = offset_token - frame_id * frame_size
+        # Destination index is token-major (patch_id * num_frame + frame_id),
+        # except for the trailing context tokens, which keep their position.
+        offset_store_token = tl.where(offset_token >= seq_len - context_length, offset_token, patch_id * num_frame + frame_id)
+        offset_load = (cfg * query_stride_b + head * query_stride_h + offset_token[:,None] * query_stride_s) + offset_d[None,:] * query_stride_d
+        offset_query = query_ptr + offset_load
+        offset_key = key_ptr + offset_load
+        offset_value = value_ptr + offset_load
+        offset_store = (cfg * query_stride_b + head * query_stride_h + offset_store_token[:,None] * query_stride_s) + offset_d[None,:] * query_stride_d
+        offset_query_out = query_out_ptr + offset_store
+        offset_key_out = key_out_ptr + offset_store
+        offset_value_out = value_out_ptr + offset_store
+        # Maybe tune the pipeline here
+        query = tl.load(offset_query, mask=offset_mask[:,None])
+        tl.store(offset_query_out, query, mask=offset_mask[:,None])
+        key = tl.load(offset_key, mask=offset_mask[:,None])
+        tl.store(offset_key_out, key, mask=offset_mask[:,None])
+        value = tl.load(offset_value, mask=offset_mask[:,None])
+        tl.store(offset_value_out, value, mask=offset_mask[:,None])
+    else:
+        # Spatial head: straight copy, load and store offsets are identical.
+        offset_load = (cfg * query_stride_b + head * query_stride_h + offset_token[:,None] * query_stride_s) + offset_d[None,:] * query_stride_d
+        offset_query = query_ptr + offset_load
+        offset_key = key_ptr + offset_load
+        offset_value = value_ptr + offset_load
+        offset_store = offset_load
+        offset_query_out = query_out_ptr + offset_store
+        offset_key_out = key_out_ptr + offset_store
+        offset_value_out = value_out_ptr + offset_store
+        # Maybe tune the pipeline here
+        query = tl.load(offset_query, mask=offset_mask[:,None])
+        tl.store(offset_query_out, query, mask=offset_mask[:,None])
+        key = tl.load(offset_key, mask=offset_mask[:,None])
+        tl.store(offset_key_out, key, mask=offset_mask[:,None])
+        value = tl.load(offset_value, mask=offset_mask[:,None])
+        tl.store(offset_value_out, value, mask=offset_mask[:,None])
+def hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = query.shape
+    BLOCK_SIZE = 128
+    assert seq_len == context_length + num_frame * frame_size
+    grid = (cfg, num_heads, (seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
+    hunyuan_sparse_head_placement_kernel[grid](
+        query, key, value,
+        query_out, key_out, value_out,
+        best_mask_idx,
+        query.stride(0), query.stride(1), query.stride(2), query.stride(3),
+        best_mask_idx.stride(0), best_mask_idx.stride(1),
+        seq_len, head_dim, context_length, num_frame, frame_size,
+        BLOCK_SIZE
+    )
+def ref_hunyuan_sparse_head_placement(query, key, value, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = query.shape
+    assert seq_len == context_length + num_frame * frame_size
+    query_out = query.clone()
+    key_out = key.clone()
+    value_out = value.clone()
+    # Spatial
+    query_out[best_mask_idx == 0], key_out[best_mask_idx == 0], value_out[best_mask_idx == 0] = \
+        query[best_mask_idx == 0], key[best_mask_idx == 0], value[best_mask_idx == 0]
+    # Temporal
+    query_out[best_mask_idx == 1], key_out[best_mask_idx == 1], value_out[best_mask_idx == 1] = \
+            hunyuan_token_reorder_to_token_major(query[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0), \
+            hunyuan_token_reorder_to_token_major(key[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0), \
+            hunyuan_token_reorder_to_token_major(value[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0)
+    return query_out, key_out, value_out
+def test_hunyuan_sparse_head_placement():
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+    cfg = 2
+    num_heads = 48
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    query = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    key = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    value = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+    query_out = torch.empty_like(query)
+    key_out = torch.empty_like(key)
+    value_out = torch.empty_like(value)
+    hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size)
+    ref_query_out, ref_key_out, ref_value_out = ref_hunyuan_sparse_head_placement(query, key, value, best_mask_idx, context_length, num_frame, frame_size)
+    torch.testing.assert_close(query_out, ref_query_out)
+    torch.testing.assert_close(key_out, ref_key_out)
+    torch.testing.assert_close(value_out, ref_value_out)
+def benchmark_hunyuan_sparse_head_placement():
+    import time
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+    cfg = 2
+    num_heads = 48
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    query = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    key = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    value = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+    query_out = torch.empty_like(query)
+    key_out = torch.empty_like(key)
+    value_out = torch.empty_like(value)
+    warmup = 10
+    all_iter = 1000
+    # warmup
+    for _ in range(warmup):
+        hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+    print(f"Triton Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Triton Total Bandwidth: {query.nelement() * query.element_size() * 3 * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        ref_hunyuan_sparse_head_placement(query, key, value, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+    print(f"Reference Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Reference Total Bandwidth: {query.nelement() * query.element_size() * 3 * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+# Triton kernel: copy hidden_states into hidden_states_out, one
+# (cfg, head, token-block) tile per program. Heads whose best-mask index is
+# non-zero ("temporal") get their first `seq_len - context_length` tokens
+# permuted from token-major back to frame-major; all other heads, and the
+# trailing `context_length` tokens of every head, are copied to the same
+# position.
+# NOTE(review): the output is addressed with the input's strides, so both
+# tensors are presumably expected to share one memory layout -- confirm at callers.
+@triton.jit
+def hunyuan_hidden_states_placement_kernel(
+    hidden_states_ptr, # [cfg, num_heads, seq_len, head_dim] seq_len = context_length + num_frame * frame_size
+    hidden_states_out_ptr, # [cfg, num_heads, seq_len, head_dim]
+    best_mask_idx_ptr, # [cfg, num_heads]
+    hidden_states_stride_b, hidden_states_stride_h, hidden_states_stride_s, hidden_states_stride_d,
+    mask_idx_stride_b, mask_idx_stride_h,
+    seq_len: tl.constexpr,
+    head_dim: tl.constexpr,
+    context_length: tl.constexpr,
+    num_frame: tl.constexpr,
+    frame_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr
+):
+    # Copy hidden_states to output
+    # range: [b, h, block_id * block_size: block_id * block_size + block_size, :]
+    cfg = tl.program_id(0)
+    head = tl.program_id(1)
+    block_id = tl.program_id(2)
+    start_id = block_id * BLOCK_SIZE
+    # NOTE(review): end_id is computed but never read below; out-of-range
+    # tokens are handled by offset_mask instead.
+    end_id = start_id + BLOCK_SIZE
+    end_id = tl.where(end_id > seq_len, seq_len, end_id)
+    # Load best mask idx (0 is spatial, 1 is temporal)
+    is_temporal = tl.load(best_mask_idx_ptr + cfg * mask_idx_stride_b + head * mask_idx_stride_h)
+    # Token indices handled by this program and the mask for the last,
+    # possibly partial, block.
+    offset_token = tl.arange(0, BLOCK_SIZE) + start_id
+    offset_mask = offset_token < seq_len
+    offset_d = tl.arange(0, head_dim)
+    if is_temporal:
+        # Source index is token-major: token = patch_id * num_frame + frame_id.
+        patch_id = offset_token // num_frame
+        frame_id = offset_token - patch_id * num_frame
+        # Destination index is frame-major (frame_id * frame_size + patch_id),
+        # except for the trailing context tokens, which keep their position.
+        offset_store_token = tl.where(offset_token >= seq_len - context_length, offset_token, frame_id * frame_size + patch_id)
+        offset_load = (cfg * hidden_states_stride_b + head * hidden_states_stride_h + offset_token[:,None] * hidden_states_stride_s) + offset_d[None,:] * hidden_states_stride_d
+        offset_hidden_states = hidden_states_ptr + offset_load
+        offset_store = (cfg * hidden_states_stride_b + head * hidden_states_stride_h + offset_store_token[:,None] * hidden_states_stride_s) + offset_d[None,:] * hidden_states_stride_d
+        offset_hidden_states_out = hidden_states_out_ptr + offset_store
+        # Maybe tune the pipeline here
+        hidden_states = tl.load(offset_hidden_states, mask=offset_mask[:,None])
+        tl.store(offset_hidden_states_out, hidden_states, mask=offset_mask[:,None])
+    else:
+        # Spatial head: straight copy, load and store offsets are identical.
+        offset_load = (cfg * hidden_states_stride_b + head * hidden_states_stride_h + offset_token[:,None] * hidden_states_stride_s) + offset_d[None,:] * hidden_states_stride_d
+        offset_hidden_states = hidden_states_ptr + offset_load
+        offset_store = offset_load
+        offset_hidden_states_out = hidden_states_out_ptr + offset_store
+        # Maybe tune the pipeline here
+        hidden_states = tl.load(offset_hidden_states, mask=offset_mask[:,None])
+        tl.store(offset_hidden_states_out, hidden_states, mask=offset_mask[:,None])
+def hunyuan_hidden_states_placement(hidden_states, hidden_states_out, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = hidden_states.shape
+    BLOCK_SIZE = 128
+    assert seq_len == context_length + num_frame * frame_size
+    grid = (cfg, num_heads, (seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
+    hunyuan_hidden_states_placement_kernel[grid](
+        hidden_states,
+        hidden_states_out,
+        best_mask_idx,
+        hidden_states.stride(0), hidden_states.stride(1), hidden_states.stride(2), hidden_states.stride(3),
+        best_mask_idx.stride(0), best_mask_idx.stride(1),
+        seq_len, head_dim, context_length, num_frame, frame_size,
+        BLOCK_SIZE
+    )
+    return hidden_states_out
+def ref_hunyuan_hidden_states_placement(hidden_states, output_hidden_states, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = hidden_states.shape
+    assert seq_len == context_length + num_frame * frame_size
+    # Spatial
+    output_hidden_states[best_mask_idx == 0] = hidden_states[best_mask_idx == 0]
+    # Temporal
+    output_hidden_states[best_mask_idx == 1] = hunyuan_token_reorder_to_frame_major(hidden_states[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0)
+def test_hunyuan_hidden_states_placement():
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+    cfg = 2
+    num_heads = 48
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    hidden_states = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+    hidden_states_out1 = torch.empty_like(hidden_states)
+    hidden_states_out2 = torch.empty_like(hidden_states)
+    hunyuan_hidden_states_placement(hidden_states, hidden_states_out1, best_mask_idx, context_length, num_frame, frame_size)
+    ref_hunyuan_hidden_states_placement(hidden_states, hidden_states_out2, best_mask_idx, context_length, num_frame, frame_size)
+    torch.testing.assert_close(hidden_states_out1, hidden_states_out2)
+def benchmark_hunyuan_hidden_states_placement():
+    import time
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+    cfg = 2
+    num_heads = 48
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    hidden_states = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+    hidden_states_out = torch.empty_like(hidden_states)
+    warmup = 10
+    all_iter = 1000
+    # warmup
+    for _ in range(warmup):
+        hunyuan_hidden_states_placement(hidden_states, hidden_states_out, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        hunyuan_hidden_states_placement(hidden_states, hidden_states_out, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+    print(f"Triton Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Triton Total Bandwidth: {hidden_states.nelement() * hidden_states.element_size() * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        ref_hunyuan_hidden_states_placement(hidden_states, hidden_states.clone(), best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+    print(f"Reference Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Reference Total Bandwidth: {hidden_states.nelement() * hidden_states.element_size() * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+if __name__ == "__main__":
+    test_hunyuan_sparse_head_placement()
+    benchmark_hunyuan_sparse_head_placement()
+    test_hunyuan_hidden_states_placement()
+    benchmark_hunyuan_hidden_states_placement()