lllyasviel committed · Commit 06fccba · 0 Parent(s)
.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ hf_token.txt
2
+ hf_download/
3
+ results/
4
+ *.csv
5
+ *.onnx
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ .pybuilder/
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ # For a library or package, you might want to ignore these files since the code is
93
+ # intended to run in multiple environments; otherwise, check them in:
94
+ # .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ .idea/
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,111 @@
1
+ # Paints-Undo
2
+
3
+ PaintsUndo: A Base Model of Drawing Behaviors in Digital Paintings
4
+
5
+ Paints-Undo is a project aimed at providing base models of human drawing behaviors, in the hope that future AI models can better align with the real needs of human artists.
6
+
7
+ The name "Paints-Undo" comes from the fact that the model's outputs look like pressing the "undo" button (usually Ctrl+Z) many times in digital painting software.
8
+
9
+ Paints-Undo presents a family of models that take an image as input and then output the drawing sequence of that image. The model displays all kinds of human behaviors, including but not limited to sketching, inking, coloring, shading, transforming, left-right flipping, color curve tuning, changing the visibility of layers, and even changing the overall idea during the drawing process.
10
+
11
+ **This page does not contain any examples. All examples are on the Git page below:**
12
+
13
+ [>>> Click Here to See the Example Page <<<](https://lllyasviel.github.io/pages/paints_undo/)
14
+
15
+ # Get Started
16
+
17
+ You can deploy PaintsUndo locally via:
18
+
19
+ git clone https://github.com/lllyasviel/Paints-UNDO.git
20
+ cd Paints-UNDO
21
+ conda create -n paints_undo python=3.10
22
+ conda activate paints_undo
23
+ pip install xformers
24
+ pip install -r requirements.txt
25
+ python gradio_app.py
26
+
27
+ (If you do not know how to use these commands, you can paste them into ChatGPT and ask it to explain and give more detailed instructions.)
28
+
29
+ Inference has been tested with 24GB VRAM on an Nvidia 4090 and a 3090 Ti. It may also work with 16GB VRAM, but it does not work with 8GB. My estimate is that, under extreme optimization (including weight offloading and sliced attention), the theoretical minimum VRAM requirement is about 10~12.5 GB.
30
+
31
+ You can expect to process one image in about 5 to 10 minutes, depending on your settings. A typical result is a 25-second video at 4 FPS, with a resolution of 320x512, 512x320, 384x448, or 448x384.
32
+
33
+ Because the processing time is, in most cases, significantly longer than the typical task/quota of a HuggingFace Space, I personally do not recommend deploying this to HuggingFace Space, to avoid placing an unnecessary burden on the HF servers.
34
+
35
+ If you do not have the required computation devices and still want an online solution, one option is to wait for us to release a Colab notebook (though I am not sure whether the Colab free tier will work).
36
+
37
+ # Model Notes
38
+
39
+ We currently release two models, `paints_undo_single_frame` and `paints_undo_multi_frame`. Let's call them the single-frame model and the multi-frame model.
40
+
41
+ The single-frame model takes one image and an `operation step` as inputs, and outputs a single image. We assume that an artwork can always be created with 1000 human operations (for example, one brush stroke is one operation), so the `operation step` is an integer from 0 to 999. Step 0 is the finished final artwork, and step 999 is the first brush stroke drawn on the pure white canvas. You can think of this model as an "undo" (Ctrl+Z) model: you input the final image and indicate how many times you want to press "Ctrl+Z", and the model gives you a "simulated" screenshot after those "Ctrl+Z"s are pressed. If your `operation step` is 100, it means you want to simulate pressing "Ctrl+Z" 100 times on this image to get its appearance after the 100th "Ctrl+Z".
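+
+ For intuition, here is a tiny, purely illustrative helper (not part of this repo) that turns a rough "how finished is the painting" fraction into an `operation step` under the convention above (0 = finished, 999 = first stroke):
+
+ ```python
+ # Illustrative only: map a completion fraction to an `operation step`,
+ # following the convention that 0 = finished artwork and 999 = first stroke.
+
+ def completion_to_operation_step(completion: float, total_steps: int = 1000) -> int:
+     completion = min(max(completion, 0.0), 1.0)           # clamp to [0, 1]
+     return round((1.0 - completion) * (total_steps - 1))  # 1.0 -> 0, 0.0 -> 999
+
+ print(completion_to_operation_step(1.0))  # 0   (finished artwork)
+ print(completion_to_operation_step(0.5))  # 500 (about half-way, ~500 "Ctrl+Z" presses)
+ print(completion_to_operation_step(0.0))  # 999 (first brush stroke on the white canvas)
+ ```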
42
+
43
+ The multi-frame model takes two images as inputs and outputs 16 intermediate frames between the two input images. The result is much more consistent than the single-frame model, but it is also much slower, less "creative", and limited to 16 frames.
44
+
45
+ In this repo, the default method is to use them together. We first run the single-frame model about 5-7 times to get 5-7 "keyframes", and then use the multi-frame model to "interpolate" those keyframes into a relatively long video.
46
+
47
+ In theory this system can be used in many ways and can even produce infinitely long videos, but in practice results are good when the final frame count is about 100-500; a rough frame-count sketch follows.
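+
+ As a quick sanity check on the numbers above, here is a minimal sketch of the frame-count arithmetic (an illustration only, not the repo's actual pipeline code; it assumes each adjacent keyframe pair is expanded into 16 frames by the multi-frame model):
+
+ ```python
+ # Rough frame-count estimate for the default "keyframes + interpolation" pipeline.
+ def estimate_total_frames(num_keyframes: int, frames_per_segment: int = 16) -> int:
+     # (num_keyframes - 1) adjacent pairs, each interpolated into 16 frames
+     return (num_keyframes - 1) * frames_per_segment
+
+ for k in (5, 6, 7):
+     total = estimate_total_frames(k)
+     print(f"{k} keyframes -> ~{total} frames -> ~{total / 4:.0f} s at 4 FPS")
+ # 7 keyframes -> ~96 frames -> ~24 s at 4 FPS, which matches the ~25-second videos above.
+ ```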
48
+
49
+ ### Model Architecture (paints_undo_single_frame)
50
+
51
+ The model is a modified SD1.5 architecture trained with a different beta schedule, a different CLIP skip, and the aforementioned `operation step` condition. To be specific, the model is trained with the following betas:
52
+
53
+ `betas = torch.linspace(0.00085, 0.020, 1000, dtype=torch.float64)`
54
+
55
+ For comparison, the original SD1.5 is trained with the following betas:
56
+
57
+ `betas = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float64) ** 2`
58
+
59
+ You can notice the difference in the ending betas and the removed square. The choice of this scheduler is based on our internal user study.
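+
+ To make the difference concrete, the following standalone snippet (using only the two formulas quoted above) prints both schedules' endpoints:
+
+ ```python
+ import torch
+
+ # Beta schedule used by paints_undo_single_frame, as quoted above.
+ betas_paints_undo = torch.linspace(0.00085, 0.020, 1000, dtype=torch.float64)
+
+ # Original SD1.5 schedule, as quoted above (a squared ramp over sqrt-betas).
+ betas_sd15 = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float64) ** 2
+
+ print(betas_paints_undo[0].item(), betas_paints_undo[-1].item())  # 0.00085 ... 0.02
+ print(betas_sd15[0].item(), betas_sd15[-1].item())                # 0.00085 ... 0.012
+ # Both start at 0.00085, but Paints-Undo ends at a larger beta (0.020 vs 0.012)
+ # and grows linearly rather than as a squared ramp.
+ ```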
60
+
61
+ The last layer of the text encoder (CLIP ViT-L/14) is permanently removed, so it is now mathematically consistent to always set CLIP Skip to 2 (if you use diffusers).
62
+
63
+ The `operation step` condition is added to layer embeddings in a way similar to SDXL's extra embeddings.
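+
+ Concretely, the helper `unet_add_coded_conds` in `diffusers_helper/code_cond.py` (included in this commit) projects the integer step with a sinusoidal `Timesteps` embedding followed by a small `TimestepEmbedding` MLP, mirroring SDXL's add-embedding path. A toy shape check using the same layers:
+
+ ```python
+ import torch
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+
+ # Same layers as in diffusers_helper/code_cond.py: sinusoidal projection of the
+ # integer step, then an MLP to the UNet's time-embedding width (1280 for SD1.5).
+ add_time_proj = Timesteps(256, True, 0)
+ add_embedding = TimestepEmbedding(256, 1280)
+
+ operation_step = torch.tensor([100])  # a batch of one `operation step`
+ emb = add_embedding(add_time_proj(operation_step.flatten()).reshape(1, -1))
+ print(emb.shape)  # torch.Size([1, 1280]) -> added to the UNet's time embedding
+ ```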
64
+
65
+ Also, since the sole purpose of this model is to process existing images, the model is strictly aligned with the WD14 tagger, without any other augmentations. You should always use the WD14 tagger (the one in this repo) to process the input image and get the prompt. Otherwise, the results may be defective. Human-written prompts are not tested.
66
+
67
+ ### Model Architecture (paints_undo_multi_frame)
68
+
69
+ This model is trained by resuming from the [VideoCrafter](https://github.com/AILab-CVC/VideoCrafter) family, but the original Crafter's `lvdm` is not used and all training/inference code is completely implemented from scratch. (The code is now based on modern Diffusers.) Although the initial weights are resumed from VideoCrafter, the topology of the neural network is modified a lot, and after extensive training the network's behavior is now largely different from the original Crafter.
70
+
71
+ The overall architecture is Crafter-like, with 5 components: a 3D-UNet, a VAE, CLIP, CLIP-Vision, and an Image Projection module.
72
+
73
+ **VAE**: The VAE is exactly the same anime VAE extracted from [ToonCrafter](https://github.com/ToonCrafter/ToonCrafter). Many thanks to ToonCrafter for providing an excellent anime temporal VAE for Crafters.
74
+
75
+ **3D-UNet**: The 3D-UNet is modified from Crafter's `lvdm` with revisions to the attention modules. Other than some minor code changes, the major change is that the UNet is now trained with, and supports, temporal windows in the Spatial Self-Attention layers. You can change `diffusers_vdm.attention.CrossAttention.temporal_window_for_spatial_self_attention` and `temporal_window_type` in the code to activate three types of attention windows (a usage sketch follows the list):
76
+
77
+ 1. "prv" mode: Each frame's Spatial Self-Attention also attends to the full spatial context of its previous frame. The first frame only attends to itself.
78
+ 2. "first": Each frame's Spatial Self-Attention also attends to the full spatial context of the first frame of the entire sequence. The first frame only attends to itself.
79
+ 3. "roll": Each frame's Spatial Self-Attention also attends to the full spatial contexts of its previous and next frames, based on the ordering of `torch.roll`.
80
+
81
+ Note that this is disabled by default during inference to save GPU memory.
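+
+ To see what the window does to the keys/values, here is a small standalone check using the `make_temporal_window` helper from `diffusers_vdm/basics.py` in this repo (toy shapes; batch, frames, spatial tokens, and channels are chosen arbitrarily):
+
+ ```python
+ import torch
+ from diffusers_vdm.basics import make_temporal_window
+
+ b, t, d, c = 2, 16, 64, 320      # toy sizes: batch, frames, spatial tokens, channels
+ k = torch.randn(b * t, d, c)     # keys as seen by Spatial Self-Attention: (b*t, d, c)
+
+ for method in ['prv', 'first', 'roll']:
+     out = make_temporal_window(k, t=t, method=method)
+     print(method, tuple(out.shape))
+ # prv   (32, 128, 320) -> each frame also attends to its previous frame's tokens
+ # first (32, 128, 320) -> each frame also attends to the first frame's tokens
+ # roll  (32, 192, 320) -> each frame also attends to its previous and next frames
+ ```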
82
+
83
+ **CLIP**: The CLIP text encoder from SD2.1.
84
+
85
+ **CLIP-Vision**: Our implementation of CLIP Vision (ViT-H) that supports arbitrary aspect ratios by interpolating the positional embedding. After experimenting with linear interpolation, nearest neighbor, and Rotary Positional Encoding (RoPE), our final choice is nearest neighbor. Note that this is different from Crafter methods, which resize or center-crop images to 224x224.
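+
+ The general idea (a minimal sketch, not this repo's exact implementation) is to reshape the pretrained positional-embedding grid and resize it with nearest-neighbor interpolation so that a non-square token grid can reuse the 224x224 embeddings:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def resize_vit_pos_embed(pos_embed: torch.Tensor, new_hw: tuple) -> torch.Tensor:
+     """Nearest-neighbor resize of a ViT positional embedding (illustrative sketch).
+
+     pos_embed: (1, 1 + H*W, C) with a leading class token, e.g. 1 + 16*16 tokens
+                for a ViT-H/14 at 224x224.
+     new_hw:    target token-grid size (new_h, new_w) for a non-square input.
+     """
+     cls_token, grid = pos_embed[:, :1], pos_embed[:, 1:]
+     old_size = int(grid.shape[1] ** 0.5)                                # square source grid
+     grid = grid.reshape(1, old_size, old_size, -1).permute(0, 3, 1, 2)  # (1, C, H, W)
+     grid = F.interpolate(grid, size=new_hw, mode="nearest")
+     grid = grid.permute(0, 2, 3, 1).reshape(1, new_hw[0] * new_hw[1], -1)
+     return torch.cat([cls_token, grid], dim=1)
+
+ # Example: adapt a 16x16 grid (224x224, patch 14) to a 16x24 grid (224x336 input).
+ pos = torch.randn(1, 1 + 16 * 16, 1280)
+ print(resize_vit_pos_embed(pos, (16, 24)).shape)  # torch.Size([1, 385, 1280])
+ ```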
86
+
87
+ **Image Projection**: Our implementation of a tiny transformer that takes two frames as inputs and outputs 16 image embeddings for each frame. Note that this is different from Crafter methods that only use one image.
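+
+ As a rough mental model only (hypothetical layer sizes and shapes, not the repo's actual module), such a projector can be a small transformer that cross-attends a fixed set of learned query tokens against the two frames' CLIP-Vision features:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class TinyImageProjector(nn.Module):
+     """Hypothetical sketch: two frames' CLIP-Vision features -> 16 embeddings each."""
+
+     def __init__(self, in_dim=1280, out_dim=1024, num_queries=16, num_layers=2):
+         super().__init__()
+         # 16 learned query tokens per frame, decoded against the concatenated features.
+         self.queries = nn.Parameter(torch.randn(2 * num_queries, out_dim) * 0.02)
+         self.in_proj = nn.Linear(in_dim, out_dim)
+         layer = nn.TransformerDecoderLayer(d_model=out_dim, nhead=8, batch_first=True)
+         self.decoder = nn.TransformerDecoder(layer, num_layers=num_layers)
+
+     def forward(self, frame_a, frame_b):  # each: (B, tokens, in_dim)
+         memory = self.in_proj(torch.cat([frame_a, frame_b], dim=1))
+         queries = self.queries.unsqueeze(0).expand(memory.shape[0], -1, -1)
+         out = self.decoder(queries, memory)  # (B, 32, out_dim)
+         return out.chunk(2, dim=1)           # 16 embeddings for each of the two frames
+
+ proj = TinyImageProjector()
+ a, b = torch.randn(1, 257, 1280), torch.randn(1, 257, 1280)
+ emb_a, emb_b = proj(a, b)
+ print(emb_a.shape, emb_b.shape)  # torch.Size([1, 16, 1024]) twice
+ ```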
88
+
89
+ # Tutorial
90
+
91
+ After you get into the Gradio interface:
92
+
93
+ Step 0: Upload an image, or just click an example image at the bottom of the page.
94
+
95
+ Step 1: In the UI titled "step 1", click "generate prompts" to get the global prompt.
96
+
97
+ Step 2: In the UI titled "step 2", click "Generate Key Frames". You can change seeds or other parameters on the left.
98
+
99
+ Step 3: In the UI titled "step 3", click "Generate Video". You can change seeds or other parameters on the left.
100
+
101
+ # Cite
102
+
103
+ @Misc{paintsundo,
104
+ author = {Paints-Undo Team},
105
+ title = {Paints-Undo GitHub Page},
106
+ year = {2024},
107
+ }
108
+
109
+ # Disclaimer
110
+
111
+ This project aims to develop base models of human drawing behaviors, facilitating future AI systems to better meet the real needs of human artists. Users are granted the freedom to create content using this tool, but they are expected to comply with local laws and use it responsibly. Users must not employ the tool to generate false information or incite confrontation. The developers do not assume any responsibility for potential misuse by users.
diffusers_helper/cat_cond.py ADDED
@@ -0,0 +1,24 @@
1
+ import torch
2
+
3
+
4
+ def unet_add_concat_conds(unet, new_channels=4):
5
+ with torch.no_grad():
6
+ new_conv_in = torch.nn.Conv2d(4 + new_channels, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
7
+ new_conv_in.weight.zero_()
8
+ new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
9
+ new_conv_in.bias = unet.conv_in.bias
10
+ unet.conv_in = new_conv_in
11
+
12
+ unet_original_forward = unet.forward
13
+
14
+ def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
15
+ cross_attention_kwargs = {k: v for k, v in kwargs['cross_attention_kwargs'].items()}
16
+ c_concat = cross_attention_kwargs.pop('concat_conds')
17
+ kwargs['cross_attention_kwargs'] = cross_attention_kwargs
18
+
19
+ c_concat = torch.cat([c_concat] * (sample.shape[0] // c_concat.shape[0]), dim=0).to(sample)
20
+ new_sample = torch.cat([sample, c_concat], dim=1)
21
+ return unet_original_forward(new_sample, timestep, encoder_hidden_states, **kwargs)
22
+
23
+ unet.forward = hooked_unet_forward
24
+ return
diffusers_helper/code_cond.py ADDED
@@ -0,0 +1,34 @@
1
+ import torch
2
+
3
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
4
+
5
+
6
+ def unet_add_coded_conds(unet, added_number_count=1):
7
+ unet.add_time_proj = Timesteps(256, True, 0)
8
+ unet.add_embedding = TimestepEmbedding(256 * added_number_count, 1280)
9
+
10
+ def get_aug_embed(emb, encoder_hidden_states, added_cond_kwargs):
11
+ coded_conds = added_cond_kwargs.get("coded_conds")
12
+ batch_size = coded_conds.shape[0]
13
+ time_embeds = unet.add_time_proj(coded_conds.flatten())
14
+ time_embeds = time_embeds.reshape((batch_size, -1))
15
+ time_embeds = time_embeds.to(emb)
16
+ aug_emb = unet.add_embedding(time_embeds)
17
+ return aug_emb
18
+
19
+ unet.get_aug_embed = get_aug_embed
20
+
21
+ unet_original_forward = unet.forward
22
+
23
+ def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
24
+ cross_attention_kwargs = {k: v for k, v in kwargs['cross_attention_kwargs'].items()}
25
+ coded_conds = cross_attention_kwargs.pop('coded_conds')
26
+ kwargs['cross_attention_kwargs'] = cross_attention_kwargs
27
+
28
+ coded_conds = torch.cat([coded_conds] * (sample.shape[0] // coded_conds.shape[0]), dim=0).to(sample.device)
29
+ kwargs['added_cond_kwargs'] = dict(coded_conds=coded_conds)
30
+ return unet_original_forward(sample, timestep, encoder_hidden_states, **kwargs)
31
+
32
+ unet.forward = hooked_unet_forward
33
+
34
+ return
diffusers_helper/k_diffusion.py ADDED
@@ -0,0 +1,145 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ from tqdm import tqdm
5
+
6
+
7
+ @torch.no_grad()
8
+ def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, progress_tqdm=None):
9
+ """DPM-Solver++(2M)."""
10
+ extra_args = {} if extra_args is None else extra_args
11
+ s_in = x.new_ones([x.shape[0]])
12
+ sigma_fn = lambda t: t.neg().exp()
13
+ t_fn = lambda sigma: sigma.log().neg()
14
+ old_denoised = None
15
+
16
+ bar = tqdm if progress_tqdm is None else progress_tqdm
17
+
18
+ for i in bar(range(len(sigmas) - 1)):
19
+ denoised = model(x, sigmas[i] * s_in, **extra_args)
20
+ if callback is not None:
21
+ callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
22
+ t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
23
+ h = t_next - t
24
+ if old_denoised is None or sigmas[i + 1] == 0:
25
+ x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised
26
+ else:
27
+ h_last = t - t_fn(sigmas[i - 1])
28
+ r = h_last / h
29
+ denoised_d = (1 + 1 / (2 * r)) * denoised - (1 / (2 * r)) * old_denoised
30
+ x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_d
31
+ old_denoised = denoised
32
+ return x
33
+
34
+
35
+ class KModel:
36
+ def __init__(self, unet, timesteps=1000, linear_start=0.00085, linear_end=0.012, linear=False):
37
+ if linear:
38
+ betas = torch.linspace(linear_start, linear_end, timesteps, dtype=torch.float64)
39
+ else:
40
+ betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, timesteps, dtype=torch.float64) ** 2
41
+
42
+ alphas = 1. - betas
43
+ alphas_cumprod = torch.tensor(np.cumprod(alphas, axis=0), dtype=torch.float32)
44
+
45
+ self.sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
46
+ self.log_sigmas = self.sigmas.log()
47
+ self.sigma_data = 1.0
48
+ self.unet = unet
49
+ return
50
+
51
+ @property
52
+ def sigma_min(self):
53
+ return self.sigmas[0]
54
+
55
+ @property
56
+ def sigma_max(self):
57
+ return self.sigmas[-1]
58
+
59
+ def timestep(self, sigma):
60
+ log_sigma = sigma.log()
61
+ dists = log_sigma.to(self.log_sigmas.device) - self.log_sigmas[:, None]
62
+ return dists.abs().argmin(dim=0).view(sigma.shape).to(sigma.device)
63
+
64
+ def get_sigmas_karras(self, n, rho=7.):
65
+ ramp = torch.linspace(0, 1, n)
66
+ min_inv_rho = self.sigma_min ** (1 / rho)
67
+ max_inv_rho = self.sigma_max ** (1 / rho)
68
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
69
+ return torch.cat([sigmas, sigmas.new_zeros([1])])
70
+
71
+ def __call__(self, x, sigma, **extra_args):
72
+ x_ddim_space = x / (sigma[:, None, None, None] ** 2 + self.sigma_data ** 2) ** 0.5
73
+ x_ddim_space = x_ddim_space.to(dtype=self.unet.dtype)
74
+ t = self.timestep(sigma)
75
+ cfg_scale = extra_args['cfg_scale']
76
+ eps_positive = self.unet(x_ddim_space, t, return_dict=False, **extra_args['positive'])[0]
77
+ eps_negative = self.unet(x_ddim_space, t, return_dict=False, **extra_args['negative'])[0]
78
+ noise_pred = eps_negative + cfg_scale * (eps_positive - eps_negative)
79
+ return x - noise_pred * sigma[:, None, None, None]
80
+
81
+
82
+ class KDiffusionSampler:
83
+ def __init__(self, unet, **kwargs):
84
+ self.unet = unet
85
+ self.k_model = KModel(unet=unet, **kwargs)
86
+
87
+ @torch.inference_mode()
88
+ def __call__(
89
+ self,
90
+ initial_latent = None,
91
+ strength = 1.0,
92
+ num_inference_steps = 25,
93
+ guidance_scale = 5.0,
94
+ batch_size = 1,
95
+ generator = None,
96
+ prompt_embeds = None,
97
+ negative_prompt_embeds = None,
98
+ cross_attention_kwargs = None,
99
+ same_noise_in_batch = False,
100
+ progress_tqdm = None,
101
+ ):
102
+
103
+ device = self.unet.device
104
+
105
+ # Sigmas
106
+
107
+ sigmas = self.k_model.get_sigmas_karras(int(num_inference_steps/strength))
108
+ sigmas = sigmas[-(num_inference_steps + 1):].to(device)
109
+
110
+ # Initial latents
111
+
112
+ if same_noise_in_batch:
113
+ noise = torch.randn(initial_latent.shape, generator=generator, device=device, dtype=self.unet.dtype).repeat(batch_size, 1, 1, 1)
114
+ initial_latent = initial_latent.repeat(batch_size, 1, 1, 1).to(device=device, dtype=self.unet.dtype)
115
+ else:
116
+ initial_latent = initial_latent.repeat(batch_size, 1, 1, 1).to(device=device, dtype=self.unet.dtype)
117
+ noise = torch.randn(initial_latent.shape, generator=generator, device=device, dtype=self.unet.dtype)
118
+
119
+ latents = initial_latent + noise * sigmas[0].to(initial_latent)
120
+
121
+ # Batch
122
+
123
+ latents = latents.to(device)
124
+ prompt_embeds = prompt_embeds.repeat(batch_size, 1, 1).to(device)
125
+ negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1).to(device)
126
+
127
+ # Feeds
128
+
129
+ sampler_kwargs = dict(
130
+ cfg_scale=guidance_scale,
131
+ positive=dict(
132
+ encoder_hidden_states=prompt_embeds,
133
+ cross_attention_kwargs=cross_attention_kwargs
134
+ ),
135
+ negative=dict(
136
+ encoder_hidden_states=negative_prompt_embeds,
137
+ cross_attention_kwargs=cross_attention_kwargs,
138
+ )
139
+ )
140
+
141
+ # Sample
142
+
143
+ results = sample_dpmpp_2m(self.k_model, latents, sigmas, extra_args=sampler_kwargs, progress_tqdm=progress_tqdm)
144
+
145
+ return results
diffusers_helper/utils.py ADDED
@@ -0,0 +1,136 @@
1
+ import os
2
+ import json
3
+ import random
4
+ import glob
5
+ import torch
6
+ import einops
7
+ import torchvision
8
+
9
+ import safetensors.torch as sf
10
+
11
+
12
+ def write_to_json(data, file_path):
13
+ temp_file_path = file_path + ".tmp"
14
+ with open(temp_file_path, 'wt', encoding='utf-8') as temp_file:
15
+ json.dump(data, temp_file, indent=4)
16
+ os.replace(temp_file_path, file_path)
17
+ return
18
+
19
+
20
+ def read_from_json(file_path):
21
+ with open(file_path, 'rt', encoding='utf-8') as file:
22
+ data = json.load(file)
23
+ return data
24
+
25
+
26
+ def get_active_parameters(m):
27
+ return {k:v for k, v in m.named_parameters() if v.requires_grad}
28
+
29
+
30
+ def cast_training_params(m, dtype=torch.float32):
31
+ for param in m.parameters():
32
+ if param.requires_grad:
33
+ param.data = param.to(dtype)
34
+ return
35
+
36
+
37
+ def set_attr_recursive(obj, attr, value):
38
+ attrs = attr.split(".")
39
+ for name in attrs[:-1]:
40
+ obj = getattr(obj, name)
41
+ setattr(obj, attrs[-1], value)
42
+ return
43
+
44
+
45
+ @torch.no_grad()
46
+ def batch_mixture(a, b, probability_a=0.5, mask_a=None):
47
+ assert a.shape == b.shape, "Tensors must have the same shape"
48
+ batch_size = a.size(0)
49
+
50
+ if mask_a is None:
51
+ mask_a = torch.rand(batch_size) < probability_a
52
+
53
+ mask_a = mask_a.to(a.device)
54
+ mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
55
+ result = torch.where(mask_a, a, b)
56
+ return result
57
+
58
+
59
+ @torch.no_grad()
60
+ def zero_module(module):
61
+ for p in module.parameters():
62
+ p.detach().zero_()
63
+ return module
64
+
65
+
66
+ def load_last_state(model, folder='accelerator_output'):
67
+ file_pattern = os.path.join(folder, '**', 'model.safetensors')
68
+ files = glob.glob(file_pattern, recursive=True)
69
+
70
+ if not files:
71
+ print("No model.safetensors files found in the specified folder.")
72
+ return
73
+
74
+ newest_file = max(files, key=os.path.getmtime)
75
+ state_dict = sf.load_file(newest_file)
76
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
77
+
78
+ if missing_keys:
79
+ print("Missing keys:", missing_keys)
80
+ if unexpected_keys:
81
+ print("Unexpected keys:", unexpected_keys)
82
+
83
+ print("Loaded model state from:", newest_file)
84
+ return
85
+
86
+
87
+ def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
88
+ tags = tags_str.split(', ')
89
+ tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
90
+ prompt = ', '.join(tags)
91
+ return prompt
92
+
93
+
94
+ def save_bcthw_as_mp4(x, output_filename, fps=10):
95
+ b, c, t, h, w = x.shape
96
+
97
+ per_row = b
98
+ for p in [6, 5, 4, 3, 2]:
99
+ if b % p == 0:
100
+ per_row = p
101
+ break
102
+
103
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
104
+ x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
105
+ x = x.detach().cpu().to(torch.uint8)
106
+ x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
107
+ torchvision.io.write_video(output_filename, x, fps=fps, video_codec='h264', options={'crf': '0'})
108
+ return x
109
+
110
+
111
+ def save_bcthw_as_png(x, output_filename):
112
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
113
+ x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
114
+ x = x.detach().cpu().to(torch.uint8)
115
+ x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
116
+ torchvision.io.write_png(x, output_filename)
117
+ return output_filename
118
+
119
+
120
+ def add_tensors_with_padding(tensor1, tensor2):
121
+ if tensor1.shape == tensor2.shape:
122
+ return tensor1 + tensor2
123
+
124
+ shape1 = tensor1.shape
125
+ shape2 = tensor2.shape
126
+
127
+ new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))
128
+
129
+ padded_tensor1 = torch.zeros(new_shape)
130
+ padded_tensor2 = torch.zeros(new_shape)
131
+
132
+ padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
133
+ padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2
134
+
135
+ result = padded_tensor1 + padded_tensor2
136
+ return result
diffusers_vdm/attention.py ADDED
@@ -0,0 +1,385 @@
1
+ import torch
2
+ import xformers.ops
3
+ import torch.nn.functional as F
4
+
5
+ from torch import nn
6
+ from einops import rearrange, repeat
7
+ from functools import partial
8
+ from diffusers_vdm.basics import zero_module, checkpoint, default, make_temporal_window
9
+
10
+
11
+ def sdp(q, k, v, heads):
12
+ b, _, C = q.shape
13
+ dim_head = C // heads
14
+
15
+ q, k, v = map(
16
+ lambda t: t.unsqueeze(3)
17
+ .reshape(b, t.shape[1], heads, dim_head)
18
+ .permute(0, 2, 1, 3)
19
+ .reshape(b * heads, t.shape[1], dim_head)
20
+ .contiguous(),
21
+ (q, k, v),
22
+ )
23
+
24
+ out = xformers.ops.memory_efficient_attention(q, k, v)
25
+
26
+ out = (
27
+ out.unsqueeze(0)
28
+ .reshape(b, heads, out.shape[1], dim_head)
29
+ .permute(0, 2, 1, 3)
30
+ .reshape(b, out.shape[1], heads * dim_head)
31
+ )
32
+
33
+ return out
34
+
35
+
36
+ class RelativePosition(nn.Module):
37
+ """ https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py """
38
+
39
+ def __init__(self, num_units, max_relative_position):
40
+ super().__init__()
41
+ self.num_units = num_units
42
+ self.max_relative_position = max_relative_position
43
+ self.embeddings_table = nn.Parameter(torch.Tensor(max_relative_position * 2 + 1, num_units))
44
+ nn.init.xavier_uniform_(self.embeddings_table)
45
+
46
+ def forward(self, length_q, length_k):
47
+ device = self.embeddings_table.device
48
+ range_vec_q = torch.arange(length_q, device=device)
49
+ range_vec_k = torch.arange(length_k, device=device)
50
+ distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
51
+ distance_mat_clipped = torch.clamp(distance_mat, -self.max_relative_position, self.max_relative_position)
52
+ final_mat = distance_mat_clipped + self.max_relative_position
53
+ final_mat = final_mat.long()
54
+ embeddings = self.embeddings_table[final_mat]
55
+ return embeddings
56
+
57
+
58
+ class CrossAttention(nn.Module):
59
+
60
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.,
61
+ relative_position=False, temporal_length=None, video_length=None, image_cross_attention=False,
62
+ image_cross_attention_scale=1.0, image_cross_attention_scale_learnable=False,
63
+ text_context_len=77, temporal_window_for_spatial_self_attention=False):
64
+ super().__init__()
65
+ inner_dim = dim_head * heads
66
+ context_dim = default(context_dim, query_dim)
67
+
68
+ self.scale = dim_head**-0.5
69
+ self.heads = heads
70
+ self.dim_head = dim_head
71
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
72
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
73
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
74
+
75
+ self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
76
+
77
+ self.is_temporal_attention = temporal_length is not None
78
+
79
+ self.relative_position = relative_position
80
+ if self.relative_position:
81
+ assert self.is_temporal_attention
82
+ self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
83
+ self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
84
+
85
+ self.video_length = video_length
86
+ self.temporal_window_for_spatial_self_attention = temporal_window_for_spatial_self_attention
87
+ self.temporal_window_type = 'prv'
88
+
89
+ self.image_cross_attention = image_cross_attention
90
+ self.image_cross_attention_scale = image_cross_attention_scale
91
+ self.text_context_len = text_context_len
92
+ self.image_cross_attention_scale_learnable = image_cross_attention_scale_learnable
93
+ if self.image_cross_attention:
94
+ self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
95
+ self.to_v_ip = nn.Linear(context_dim, inner_dim, bias=False)
96
+ if image_cross_attention_scale_learnable:
97
+ self.register_parameter('alpha', nn.Parameter(torch.tensor(0.)) )
98
+
99
+ def forward(self, x, context=None, mask=None):
100
+ if self.is_temporal_attention:
101
+ return self.temporal_forward(x, context=context, mask=mask)
102
+ else:
103
+ return self.spatial_forward(x, context=context, mask=mask)
104
+
105
+ def temporal_forward(self, x, context=None, mask=None):
106
+ assert mask is None, 'Attention mask not implemented!'
107
+ assert context is None, 'Temporal attention only supports self attention!'
108
+
109
+ q = self.to_q(x)
110
+ k = self.to_k(x)
111
+ v = self.to_v(x)
112
+
113
+ out = sdp(q, k, v, self.heads)
114
+
115
+ return self.to_out(out)
116
+
117
+ def spatial_forward(self, x, context=None, mask=None):
118
+ assert mask is None, 'Attention mask not implemented!'
119
+
120
+ spatial_self_attn = (context is None)
121
+ k_ip, v_ip, out_ip = None, None, None
122
+
123
+ q = self.to_q(x)
124
+ context = default(context, x)
125
+
126
+ if spatial_self_attn:
127
+ k = self.to_k(context)
128
+ v = self.to_v(context)
129
+
130
+ if self.temporal_window_for_spatial_self_attention:
131
+ k = make_temporal_window(k, t=self.video_length, method=self.temporal_window_type)
132
+ v = make_temporal_window(v, t=self.video_length, method=self.temporal_window_type)
133
+ elif self.image_cross_attention:
134
+ context, context_image = context
135
+ k = self.to_k(context)
136
+ v = self.to_v(context)
137
+ k_ip = self.to_k_ip(context_image)
138
+ v_ip = self.to_v_ip(context_image)
139
+ else:
140
+ raise NotImplementedError('Traditional prompt-only attention without IP-Adapter is illegal now.')
141
+
142
+ out = sdp(q, k, v, self.heads)
143
+
144
+ if k_ip is not None:
145
+ out_ip = sdp(q, k_ip, v_ip, self.heads)
146
+
147
+ if self.image_cross_attention_scale_learnable:
148
+ out = out + self.image_cross_attention_scale * out_ip * (torch.tanh(self.alpha) + 1)
149
+ else:
150
+ out = out + self.image_cross_attention_scale * out_ip
151
+
152
+ return self.to_out(out)
153
+
154
+
155
+ class BasicTransformerBlock(nn.Module):
156
+
157
+ def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
158
+ disable_self_attn=False, attention_cls=None, video_length=None, image_cross_attention=False, image_cross_attention_scale=1.0, image_cross_attention_scale_learnable=False, text_context_len=77):
159
+ super().__init__()
160
+ attn_cls = CrossAttention if attention_cls is None else attention_cls
161
+ self.disable_self_attn = disable_self_attn
162
+ self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
163
+ context_dim=context_dim if self.disable_self_attn else None, video_length=video_length)
164
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
165
+ self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, video_length=video_length, image_cross_attention=image_cross_attention, image_cross_attention_scale=image_cross_attention_scale, image_cross_attention_scale_learnable=image_cross_attention_scale_learnable,text_context_len=text_context_len)
166
+ self.image_cross_attention = image_cross_attention
167
+
168
+ self.norm1 = nn.LayerNorm(dim)
169
+ self.norm2 = nn.LayerNorm(dim)
170
+ self.norm3 = nn.LayerNorm(dim)
171
+ self.checkpoint = checkpoint
172
+
173
+
174
+ def forward(self, x, context=None, mask=None, **kwargs):
175
+ ## implementation tricks: because checkpointing doesn't support non-tensor (e.g. None or scalar) arguments
176
+ input_tuple = (x,) ## should not be (x), otherwise *input_tuple will decouple x into multiple arguments
177
+ if context is not None:
178
+ input_tuple = (x, context)
179
+ if mask is not None:
180
+ forward_mask = partial(self._forward, mask=mask)
181
+ return checkpoint(forward_mask, (x,), self.parameters(), self.checkpoint)
182
+ return checkpoint(self._forward, input_tuple, self.parameters(), self.checkpoint)
183
+
184
+
185
+ def _forward(self, x, context=None, mask=None):
186
+ x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None, mask=mask) + x
187
+ x = self.attn2(self.norm2(x), context=context, mask=mask) + x
188
+ x = self.ff(self.norm3(x)) + x
189
+ return x
190
+
191
+
192
+ class SpatialTransformer(nn.Module):
193
+ """
194
+ Transformer block for image-like data in spatial axis.
195
+ First, project the input (aka embedding)
196
+ and reshape to b, t, d.
197
+ Then apply standard transformer action.
198
+ Finally, reshape to image
199
+ NEW: use_linear for more efficiency instead of the 1x1 convs
200
+ """
201
+
202
+ def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
203
+ use_checkpoint=True, disable_self_attn=False, use_linear=False, video_length=None,
204
+ image_cross_attention=False, image_cross_attention_scale_learnable=False):
205
+ super().__init__()
206
+ self.in_channels = in_channels
207
+ inner_dim = n_heads * d_head
208
+ self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
209
+ if not use_linear:
210
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
211
+ else:
212
+ self.proj_in = nn.Linear(in_channels, inner_dim)
213
+
214
+ attention_cls = None
215
+ self.transformer_blocks = nn.ModuleList([
216
+ BasicTransformerBlock(
217
+ inner_dim,
218
+ n_heads,
219
+ d_head,
220
+ dropout=dropout,
221
+ context_dim=context_dim,
222
+ disable_self_attn=disable_self_attn,
223
+ checkpoint=use_checkpoint,
224
+ attention_cls=attention_cls,
225
+ video_length=video_length,
226
+ image_cross_attention=image_cross_attention,
227
+ image_cross_attention_scale_learnable=image_cross_attention_scale_learnable,
228
+ ) for d in range(depth)
229
+ ])
230
+ if not use_linear:
231
+ self.proj_out = zero_module(nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
232
+ else:
233
+ self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
234
+ self.use_linear = use_linear
235
+
236
+
237
+ def forward(self, x, context=None, **kwargs):
238
+ b, c, h, w = x.shape
239
+ x_in = x
240
+ x = self.norm(x)
241
+ if not self.use_linear:
242
+ x = self.proj_in(x)
243
+ x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
244
+ if self.use_linear:
245
+ x = self.proj_in(x)
246
+ for i, block in enumerate(self.transformer_blocks):
247
+ x = block(x, context=context, **kwargs)
248
+ if self.use_linear:
249
+ x = self.proj_out(x)
250
+ x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
251
+ if not self.use_linear:
252
+ x = self.proj_out(x)
253
+ return x + x_in
254
+
255
+
256
+ class TemporalTransformer(nn.Module):
257
+ """
258
+ Transformer block for image-like data in temporal axis.
259
+ First, reshape to b, t, d.
260
+ Then apply standard transformer action.
261
+ Finally, reshape to image
262
+ """
263
+ def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
264
+ use_checkpoint=True, use_linear=False, only_self_att=True, causal_attention=False, causal_block_size=1,
265
+ relative_position=False, temporal_length=None):
266
+ super().__init__()
267
+ self.only_self_att = only_self_att
268
+ self.relative_position = relative_position
269
+ self.causal_attention = causal_attention
270
+ self.causal_block_size = causal_block_size
271
+
272
+ self.in_channels = in_channels
273
+ inner_dim = n_heads * d_head
274
+ self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
275
+ self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
276
+ if not use_linear:
277
+ self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
278
+ else:
279
+ self.proj_in = nn.Linear(in_channels, inner_dim)
280
+
281
+ if relative_position:
282
+ assert(temporal_length is not None)
283
+ attention_cls = partial(CrossAttention, relative_position=True, temporal_length=temporal_length)
284
+ else:
285
+ attention_cls = partial(CrossAttention, temporal_length=temporal_length)
286
+ if self.causal_attention:
287
+ assert(temporal_length is not None)
288
+ self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))
289
+
290
+ if self.only_self_att:
291
+ context_dim = None
292
+ self.transformer_blocks = nn.ModuleList([
293
+ BasicTransformerBlock(
294
+ inner_dim,
295
+ n_heads,
296
+ d_head,
297
+ dropout=dropout,
298
+ context_dim=context_dim,
299
+ attention_cls=attention_cls,
300
+ checkpoint=use_checkpoint) for d in range(depth)
301
+ ])
302
+ if not use_linear:
303
+ self.proj_out = zero_module(nn.Conv1d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
304
+ else:
305
+ self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
306
+ self.use_linear = use_linear
307
+
308
+ def forward(self, x, context=None):
309
+ b, c, t, h, w = x.shape
310
+ x_in = x
311
+ x = self.norm(x)
312
+ x = rearrange(x, 'b c t h w -> (b h w) c t').contiguous()
313
+ if not self.use_linear:
314
+ x = self.proj_in(x)
315
+ x = rearrange(x, 'bhw c t -> bhw t c').contiguous()
316
+ if self.use_linear:
317
+ x = self.proj_in(x)
318
+
319
+ temp_mask = None
320
+ if self.causal_attention:
321
+             # slice the mask map down to the current temporal length
322
+ temp_mask = self.mask[:,:t,:t].to(x.device)
323
+
324
+ if temp_mask is not None:
325
+ mask = temp_mask.to(x.device)
326
+ mask = repeat(mask, 'l i j -> (l bhw) i j', bhw=b*h*w)
327
+ else:
328
+ mask = None
329
+
330
+ if self.only_self_att:
331
+ ## note: if no context is given, cross-attention defaults to self-attention
332
+ for i, block in enumerate(self.transformer_blocks):
333
+ x = block(x, mask=mask)
334
+ x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
335
+ else:
336
+ x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
337
+ context = rearrange(context, '(b t) l con -> b t l con', t=t).contiguous()
338
+ for i, block in enumerate(self.transformer_blocks):
339
+                 # calculate each batch one by one (since some packages cannot handle a shape dimension greater than 65,535)
340
+ for j in range(b):
341
+ context_j = repeat(
342
+ context[j],
343
+ 't l con -> (t r) l con', r=(h * w) // t, t=t).contiguous()
344
+ ## note: causal mask will not applied in cross-attention case
345
+ x[j] = block(x[j], context=context_j)
346
+
347
+ if self.use_linear:
348
+ x = self.proj_out(x)
349
+ x = rearrange(x, 'b (h w) t c -> b c t h w', h=h, w=w).contiguous()
350
+ if not self.use_linear:
351
+ x = rearrange(x, 'b hw t c -> (b hw) c t').contiguous()
352
+ x = self.proj_out(x)
353
+ x = rearrange(x, '(b h w) c t -> b c t h w', b=b, h=h, w=w).contiguous()
354
+
355
+ return x + x_in
356
+
357
+
358
+ class GEGLU(nn.Module):
359
+ def __init__(self, dim_in, dim_out):
360
+ super().__init__()
361
+ self.proj = nn.Linear(dim_in, dim_out * 2)
362
+
363
+ def forward(self, x):
364
+ x, gate = self.proj(x).chunk(2, dim=-1)
365
+ return x * F.gelu(gate)
366
+
367
+
368
+ class FeedForward(nn.Module):
369
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
370
+ super().__init__()
371
+ inner_dim = int(dim * mult)
372
+ dim_out = default(dim_out, dim)
373
+ project_in = nn.Sequential(
374
+ nn.Linear(dim, inner_dim),
375
+ nn.GELU()
376
+ ) if not glu else GEGLU(dim, inner_dim)
377
+
378
+ self.net = nn.Sequential(
379
+ project_in,
380
+ nn.Dropout(dropout),
381
+ nn.Linear(inner_dim, dim_out)
382
+ )
383
+
384
+ def forward(self, x):
385
+ return self.net(x)
diffusers_vdm/basics.py ADDED
@@ -0,0 +1,148 @@
1
+ # adopted from
2
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
3
+ # and
4
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
5
+ # and
6
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
7
+ #
8
+ # thanks!
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import einops
14
+
15
+ from inspect import isfunction
16
+
17
+
18
+ def zero_module(module):
19
+ """
20
+ Zero out the parameters of a module and return it.
21
+ """
22
+ for p in module.parameters():
23
+ p.detach().zero_()
24
+ return module
25
+
26
+ def scale_module(module, scale):
27
+ """
28
+ Scale the parameters of a module and return it.
29
+ """
30
+ for p in module.parameters():
31
+ p.detach().mul_(scale)
32
+ return module
33
+
34
+
35
+ def conv_nd(dims, *args, **kwargs):
36
+ """
37
+ Create a 1D, 2D, or 3D convolution module.
38
+ """
39
+ if dims == 1:
40
+ return nn.Conv1d(*args, **kwargs)
41
+ elif dims == 2:
42
+ return nn.Conv2d(*args, **kwargs)
43
+ elif dims == 3:
44
+ return nn.Conv3d(*args, **kwargs)
45
+ raise ValueError(f"unsupported dimensions: {dims}")
46
+
47
+
48
+ def linear(*args, **kwargs):
49
+ """
50
+ Create a linear module.
51
+ """
52
+ return nn.Linear(*args, **kwargs)
53
+
54
+
55
+ def avg_pool_nd(dims, *args, **kwargs):
56
+ """
57
+ Create a 1D, 2D, or 3D average pooling module.
58
+ """
59
+ if dims == 1:
60
+ return nn.AvgPool1d(*args, **kwargs)
61
+ elif dims == 2:
62
+ return nn.AvgPool2d(*args, **kwargs)
63
+ elif dims == 3:
64
+ return nn.AvgPool3d(*args, **kwargs)
65
+ raise ValueError(f"unsupported dimensions: {dims}")
66
+
67
+
68
+ def nonlinearity(type='silu'):
69
+ if type == 'silu':
70
+ return nn.SiLU()
71
+ elif type == 'leaky_relu':
72
+ return nn.LeakyReLU()
73
+
74
+
75
+ def normalization(channels, num_groups=32):
76
+ """
77
+ Make a standard normalization layer.
78
+ :param channels: number of input channels.
79
+ :return: an nn.Module for normalization.
80
+ """
81
+ return nn.GroupNorm(num_groups, channels)
82
+
83
+
84
+ def default(val, d):
85
+ if exists(val):
86
+ return val
87
+ return d() if isfunction(d) else d
88
+
89
+
90
+ def exists(val):
91
+ return val is not None
92
+
93
+
94
+ def extract_into_tensor(a, t, x_shape):
95
+ b, *_ = t.shape
96
+ out = a.gather(-1, t)
97
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
98
+
99
+
100
+ def make_temporal_window(x, t, method):
101
+ assert method in ['roll', 'prv', 'first']
102
+
103
+ if method == 'roll':
104
+ m = einops.rearrange(x, '(b t) d c -> b t d c', t=t)
105
+ l = torch.roll(m, shifts=1, dims=1)
106
+ r = torch.roll(m, shifts=-1, dims=1)
107
+
108
+ recon = torch.cat([l, m, r], dim=2)
109
+ del l, m, r
110
+
111
+ recon = einops.rearrange(recon, 'b t d c -> (b t) d c')
112
+ return recon
113
+
114
+ if method == 'prv':
115
+ x = einops.rearrange(x, '(b t) d c -> b t d c', t=t)
116
+ prv = torch.cat([x[:, :1], x[:, :-1]], dim=1)
117
+
118
+ recon = torch.cat([x, prv], dim=2)
119
+ del x, prv
120
+
121
+ recon = einops.rearrange(recon, 'b t d c -> (b t) d c')
122
+ return recon
123
+
124
+ if method == 'first':
125
+ x = einops.rearrange(x, '(b t) d c -> b t d c', t=t)
126
+ prv = x[:, [0], :, :].repeat(1, t, 1, 1)
127
+
128
+ recon = torch.cat([x, prv], dim=2)
129
+ del x, prv
130
+
131
+ recon = einops.rearrange(recon, 'b t d c -> (b t) d c')
132
+ return recon
133
+
134
+
135
+ def checkpoint(func, inputs, params, flag):
136
+ """
137
+ Evaluate a function without caching intermediate activations, allowing for
138
+ reduced memory at the expense of extra compute in the backward pass.
139
+ :param func: the function to evaluate.
140
+ :param inputs: the argument sequence to pass to `func`.
141
+ :param params: a sequence of parameters `func` depends on but does not
142
+ explicitly take as arguments.
143
+ :param flag: if False, disable gradient checkpointing.
144
+ """
145
+ if flag:
146
+ return torch.utils.checkpoint.checkpoint(func, *inputs, use_reentrant=False)
147
+ else:
148
+ return func(*inputs)
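A small shape check for make_temporal_window (illustrative only, not part of the commit; assumes the diffusers_vdm package above is importable and einops is installed). With method='roll', every frame's tokens are concatenated with those of the previous and next frame.

import torch
from diffusers_vdm.basics import make_temporal_window

b, t, d, c = 2, 8, 64, 320                       # illustrative sizes
x = torch.randn(b * t, d, c)                     # tokens laid out as (b t) d c
y = make_temporal_window(x, t=t, method='roll')
print(y.shape)                                   # torch.Size([16, 192, 320]): the token dim grows to 3*d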
diffusers_vdm/dynamic_tsnr_sampler.py ADDED
@@ -0,0 +1,177 @@
1
+ # everything that can improve a v-prediction model
2
+ # dynamic scaling + tsnr + beta modifier + dynamic cfg rescale + ...
3
+ # written by lvmin at stanford 2024
4
+
5
+ import torch
6
+ import numpy as np
7
+
8
+ from tqdm import tqdm
9
+ from functools import partial
10
+ from diffusers_vdm.basics import extract_into_tensor
11
+
12
+
13
+ to_torch = partial(torch.tensor, dtype=torch.float32)
14
+
15
+
16
+ def rescale_zero_terminal_snr(betas):
17
+ # Convert betas to alphas_bar_sqrt
18
+ alphas = 1.0 - betas
19
+ alphas_cumprod = np.cumprod(alphas, axis=0)
20
+ alphas_bar_sqrt = np.sqrt(alphas_cumprod)
21
+
22
+ # Store old values.
23
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].copy()
24
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].copy()
25
+
26
+ # Shift so the last timestep is zero.
27
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
28
+
29
+ # Scale so the first timestep is back to the old value.
30
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
31
+
32
+ # Convert alphas_bar_sqrt to betas
33
+ alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
34
+ alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
35
+ alphas = np.concatenate([alphas_bar[0:1], alphas])
36
+ betas = 1 - alphas
37
+
38
+ return betas
39
+
40
+
41
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
42
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
43
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
44
+
45
+ # rescale the results from guidance (fixes overexposure)
46
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
47
+
48
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
49
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
50
+
51
+ return noise_cfg
52
+
53
+
54
+ class SamplerDynamicTSNR(torch.nn.Module):
55
+ @torch.no_grad()
56
+ def __init__(self, unet, terminal_scale=0.7):
57
+ super().__init__()
58
+ self.unet = unet
59
+
60
+ self.is_v = True
61
+ self.n_timestep = 1000
62
+ self.guidance_rescale = 0.7
63
+
64
+ linear_start = 0.00085
65
+ linear_end = 0.012
66
+
67
+ betas = np.linspace(linear_start ** 0.5, linear_end ** 0.5, self.n_timestep, dtype=np.float64) ** 2
68
+ betas = rescale_zero_terminal_snr(betas)
69
+ alphas = 1. - betas
70
+
71
+ alphas_cumprod = np.cumprod(alphas, axis=0)
72
+
73
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod).to(unet.device))
74
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)).to(unet.device))
75
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)).to(unet.device))
76
+
77
+ # Dynamic TSNR
78
+ turning_step = 400
79
+ scale_arr = np.concatenate([
80
+ np.linspace(1.0, terminal_scale, turning_step),
81
+ np.full(self.n_timestep - turning_step, terminal_scale)
82
+ ])
83
+ self.register_buffer('scale_arr', to_torch(scale_arr).to(unet.device))
84
+
85
+ def predict_eps_from_z_and_v(self, x_t, t, v):
86
+ return self.sqrt_alphas_cumprod[t] * v + self.sqrt_one_minus_alphas_cumprod[t] * x_t
87
+
88
+ def predict_start_from_z_and_v(self, x_t, t, v):
89
+ return self.sqrt_alphas_cumprod[t] * x_t - self.sqrt_one_minus_alphas_cumprod[t] * v
90
+
91
+ def q_sample(self, x0, t, noise):
92
+ return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x0.shape) * x0 +
93
+ extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
94
+
95
+ def get_v(self, x0, t, noise):
96
+ return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x0.shape) * noise -
97
+ extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x0.shape) * x0)
98
+
99
+ def dynamic_x0_rescale(self, x0, t):
100
+ return x0 * extract_into_tensor(self.scale_arr, t, x0.shape)
101
+
102
+ @torch.no_grad()
103
+ def get_ground_truth(self, x0, noise, t):
104
+ x0 = self.dynamic_x0_rescale(x0, t)
105
+ xt = self.q_sample(x0, t, noise)
106
+ target = self.get_v(x0, t, noise) if self.is_v else noise
107
+ return xt, target
108
+
109
+ def get_uniform_trailing_steps(self, steps):
110
+ c = self.n_timestep / steps
111
+ ddim_timesteps = np.flip(np.round(np.arange(self.n_timestep, 0, -c))).astype(np.int64)
112
+ steps_out = ddim_timesteps - 1
113
+ return torch.tensor(steps_out, device=self.unet.device, dtype=torch.long)
114
+
115
+ @torch.no_grad()
116
+ def forward(self, latent_shape, steps, extra_args, progress_tqdm=None):
117
+ bar = tqdm if progress_tqdm is None else progress_tqdm
118
+
119
+ eta = 1.0
120
+
121
+ timesteps = self.get_uniform_trailing_steps(steps)
122
+ timesteps_prev = torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))
123
+
124
+ x = torch.randn(latent_shape, device=self.unet.device, dtype=self.unet.dtype)
125
+
126
+ alphas = self.alphas_cumprod[timesteps]
127
+ alphas_prev = self.alphas_cumprod[timesteps_prev]
128
+ scale_arr = self.scale_arr[timesteps]
129
+ scale_arr_prev = self.scale_arr[timesteps_prev]
130
+
131
+ sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
132
+ sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))
133
+
134
+ s_in = x.new_ones((x.shape[0]))
135
+ s_x = x.new_ones((x.shape[0], ) + (1, ) * (x.ndim - 1))
136
+ for i in bar(range(len(timesteps))):
137
+ index = len(timesteps) - 1 - i
138
+ t = timesteps[index].item()
139
+
140
+ model_output = self.model_apply(x, t * s_in, **extra_args)
141
+
142
+ if self.is_v:
143
+ e_t = self.predict_eps_from_z_and_v(x, t, model_output)
144
+ else:
145
+ e_t = model_output
146
+
147
+ a_prev = alphas_prev[index].item() * s_x
148
+ sigma_t = sigmas[index].item() * s_x
149
+
150
+ if self.is_v:
151
+ pred_x0 = self.predict_start_from_z_and_v(x, t, model_output)
152
+ else:
153
+ a_t = alphas[index].item() * s_x
154
+ sqrt_one_minus_at = sqrt_one_minus_alphas[index].item() * s_x
155
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
156
+
157
+ # dynamic rescale
158
+ scale_t = scale_arr[index].item() * s_x
159
+ prev_scale_t = scale_arr_prev[index].item() * s_x
160
+ rescale = (prev_scale_t / scale_t)
161
+ pred_x0 = pred_x0 * rescale
162
+
163
+ dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
164
+ noise = sigma_t * torch.randn_like(x)
165
+ x = a_prev.sqrt() * pred_x0 + dir_xt + noise
166
+
167
+ return x
168
+
169
+ @torch.no_grad()
170
+ def model_apply(self, x, t, **extra_args):
171
+ x = x.to(device=self.unet.device, dtype=self.unet.dtype)
172
+ cfg_scale = extra_args['cfg_scale']
173
+ p = self.unet(x, t, **extra_args['positive'])
174
+ n = self.unet(x, t, **extra_args['negative'])
175
+ o = n + cfg_scale * (p - n)
176
+ o_better = rescale_noise_cfg(o, p, guidance_rescale=self.guidance_rescale)
177
+ return o_better
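A quick numeric check of rescale_zero_terminal_snr (illustrative, not part of the commit; assumes the package and its dependencies are installed). The schedule constants match the ones used in SamplerDynamicTSNR.__init__.

import numpy as np
from diffusers_vdm.dynamic_tsnr_sampler import rescale_zero_terminal_snr

betas = np.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=np.float64) ** 2
alphas_cumprod = np.cumprod(1.0 - rescale_zero_terminal_snr(betas))
print(alphas_cumprod[0])    # ~0.99915, the first step is left unchanged
print(alphas_cumprod[-1])   # 0.0, the terminal SNR is zero after rescaling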
diffusers_vdm/improved_clip_vision.py ADDED
@@ -0,0 +1,58 @@
1
+ # A CLIP Vision supporting arbitrary aspect ratios, by lllyasviel
2
+ # The input range is changed to [-1, 1] rather than [0, 1] !!!! (same as VAE's range)
3
+
4
+ import torch
5
+ import types
6
+ import einops
7
+
8
+ from abc import ABCMeta
9
+ from transformers import CLIPVisionModelWithProjection
10
+
11
+
12
+ def preprocess(image):
13
+ mean = torch.tensor([0.48145466, 0.4578275, 0.40821073], device=image.device, dtype=image.dtype)[None, :, None, None]
14
+ std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=image.device, dtype=image.dtype)[None, :, None, None]
15
+
16
+ scale = 16 / min(image.shape[2], image.shape[3])
17
+ image = torch.nn.functional.interpolate(
18
+ image,
19
+ size=(14 * round(scale * image.shape[2]), 14 * round(scale * image.shape[3])),
20
+ mode="bicubic",
21
+ antialias=True
22
+ )
23
+
24
+ return (image - mean) / std
25
+
26
+
27
+ def arbitrary_positional_encoding(p, H, W):
28
+ weight = p.weight
29
+ cls = weight[:1]
30
+ pos = weight[1:]
31
+ pos = einops.rearrange(pos, '(H W) C -> 1 C H W', H=16, W=16)
32
+ pos = torch.nn.functional.interpolate(pos, size=(H, W), mode="nearest")
33
+ pos = einops.rearrange(pos, '1 C H W -> (H W) C')
34
+ weight = torch.cat([cls, pos])[None]
35
+ return weight
36
+
37
+
38
+ def improved_clipvision_embedding_forward(self, pixel_values):
39
+ pixel_values = pixel_values * 0.5 + 0.5
40
+ pixel_values = preprocess(pixel_values)
41
+ batch_size = pixel_values.shape[0]
42
+ target_dtype = self.patch_embedding.weight.dtype
43
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
44
+ B, C, H, W = patch_embeds.shape
45
+ patch_embeds = einops.rearrange(patch_embeds, 'B C H W -> B (H W) C')
46
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
47
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
48
+ embeddings = embeddings + arbitrary_positional_encoding(self.position_embedding, H, W)
49
+ return embeddings
50
+
51
+
52
+ class ImprovedCLIPVisionModelWithProjection(CLIPVisionModelWithProjection, metaclass=ABCMeta):
53
+ def __init__(self, config):
54
+ super().__init__(config)
55
+ self.vision_model.embeddings.forward = types.MethodType(
56
+ improved_clipvision_embedding_forward,
57
+ self.vision_model.embeddings
58
+ )
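A minimal sketch of what preprocess does with a non-square input (illustrative sizes, not from the repo): the shorter side is mapped to 16 patches of 14 px and the longer side keeps the aspect ratio.

import torch
from diffusers_vdm.improved_clip_vision import preprocess

image01 = torch.rand(1, 3, 480, 832)   # [0, 1] range; the patched forward maps [-1, 1] inputs here first
clip_input = preprocess(image01)
print(clip_input.shape)                # torch.Size([1, 3, 224, 392])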
diffusers_vdm/pipeline.py ADDED
@@ -0,0 +1,188 @@
1
+ import os
2
+ import torch
3
+ import einops
4
+
5
+ from diffusers import DiffusionPipeline
6
+ from transformers import CLIPTextModel, CLIPTokenizer
7
+ from huggingface_hub import snapshot_download
8
+ from diffusers_vdm.vae import VideoAutoencoderKL
9
+ from diffusers_vdm.projection import Resampler
10
+ from diffusers_vdm.unet import UNet3DModel
11
+ from diffusers_vdm.improved_clip_vision import ImprovedCLIPVisionModelWithProjection
12
+ from diffusers_vdm.dynamic_tsnr_sampler import SamplerDynamicTSNR
13
+
14
+
15
+ class LatentVideoDiffusionPipeline(DiffusionPipeline):
16
+ def __init__(self, tokenizer, text_encoder, image_encoder, vae, image_projection, unet, fp16=True, eval=True):
17
+ super().__init__()
18
+
19
+ self.loading_components = dict(
20
+ vae=vae,
21
+ text_encoder=text_encoder,
22
+ tokenizer=tokenizer,
23
+ unet=unet,
24
+ image_encoder=image_encoder,
25
+ image_projection=image_projection
26
+ )
27
+
28
+ for k, v in self.loading_components.items():
29
+ setattr(self, k, v)
30
+
31
+ if fp16:
32
+ self.vae.half()
33
+ self.text_encoder.half()
34
+ self.unet.half()
35
+ self.image_encoder.half()
36
+ self.image_projection.half()
37
+
38
+ self.vae.requires_grad_(False)
39
+ self.text_encoder.requires_grad_(False)
40
+ self.image_encoder.requires_grad_(False)
41
+
42
+ self.vae.eval()
43
+ self.text_encoder.eval()
44
+ self.image_encoder.eval()
45
+
46
+ if eval:
47
+ self.unet.eval()
48
+ self.image_projection.eval()
49
+ else:
50
+ self.unet.train()
51
+ self.image_projection.train()
52
+
53
+ def to(self, *args, **kwargs):
54
+ for k, v in self.loading_components.items():
55
+ if hasattr(v, 'to'):
56
+ v.to(*args, **kwargs)
57
+ return self
58
+
59
+ def save_pretrained(self, save_directory, **kwargs):
60
+ for k, v in self.loading_components.items():
61
+ folder = os.path.join(save_directory, k)
62
+ os.makedirs(folder, exist_ok=True)
63
+ v.save_pretrained(folder)
64
+ return
65
+
66
+ @classmethod
67
+ def from_pretrained(cls, repo_id, fp16=True, eval=True, token=None):
68
+ local_folder = snapshot_download(repo_id=repo_id, token=token)
69
+ return cls(
70
+ tokenizer=CLIPTokenizer.from_pretrained(os.path.join(local_folder, "tokenizer")),
71
+ text_encoder=CLIPTextModel.from_pretrained(os.path.join(local_folder, "text_encoder")),
72
+ image_encoder=ImprovedCLIPVisionModelWithProjection.from_pretrained(os.path.join(local_folder, "image_encoder")),
73
+ vae=VideoAutoencoderKL.from_pretrained(os.path.join(local_folder, "vae")),
74
+ image_projection=Resampler.from_pretrained(os.path.join(local_folder, "image_projection")),
75
+ unet=UNet3DModel.from_pretrained(os.path.join(local_folder, "unet")),
76
+ fp16=fp16,
77
+ eval=eval
78
+ )
79
+
80
+ @torch.inference_mode()
81
+ def encode_cropped_prompt_77tokens(self, prompt: str):
82
+ cond_ids = self.tokenizer(prompt,
83
+ padding="max_length",
84
+ max_length=self.tokenizer.model_max_length,
85
+ truncation=True,
86
+ return_tensors="pt").input_ids.to(self.text_encoder.device)
87
+ cond = self.text_encoder(cond_ids, attention_mask=None).last_hidden_state
88
+ return cond
89
+
90
+ @torch.inference_mode()
91
+ def encode_clip_vision(self, frames):
92
+ b, c, t, h, w = frames.shape
93
+ frames = einops.rearrange(frames, 'b c t h w -> (b t) c h w')
94
+ clipvision_embed = self.image_encoder(frames).last_hidden_state
95
+ clipvision_embed = einops.rearrange(clipvision_embed, '(b t) d c -> b t d c', t=t)
96
+ return clipvision_embed
97
+
98
+ @torch.inference_mode()
99
+ def encode_latents(self, videos, return_hidden_states=True):
100
+ b, c, t, h, w = videos.shape
101
+ x = einops.rearrange(videos, 'b c t h w -> (b t) c h w')
102
+ encoder_posterior, hidden_states = self.vae.encode(x, return_hidden_states=return_hidden_states)
103
+ z = encoder_posterior.mode() * self.vae.scale_factor
104
+ z = einops.rearrange(z, '(b t) c h w -> b c t h w', b=b, t=t)
105
+
106
+ if not return_hidden_states:
107
+ return z
108
+
109
+ hidden_states = [einops.rearrange(h, '(b t) c h w -> b c t h w', b=b) for h in hidden_states]
110
+ hidden_states = [h[:, :, [0, -1], :, :] for h in hidden_states] # only need first and last
111
+
112
+ return z, hidden_states
113
+
114
+ @torch.inference_mode()
115
+ def decode_latents(self, latents, hidden_states):
116
+ B, C, T, H, W = latents.shape
117
+ latents = einops.rearrange(latents, 'b c t h w -> (b t) c h w')
118
+ latents = latents.to(device=self.vae.device, dtype=self.vae.dtype) / self.vae.scale_factor
119
+ pixels = self.vae.decode(latents, ref_context=hidden_states, timesteps=T)
120
+ pixels = einops.rearrange(pixels, '(b t) c h w -> b c t h w', b=B, t=T)
121
+ return pixels
122
+
123
+ @torch.inference_mode()
124
+ def __call__(
125
+ self,
126
+ batch_size: int = 1,
127
+ steps: int = 50,
128
+ guidance_scale: float = 5.0,
129
+ positive_text_cond = None,
130
+ negative_text_cond = None,
131
+ positive_image_cond = None,
132
+ negative_image_cond = None,
133
+ concat_cond = None,
134
+ fs = 3,
135
+ progress_tqdm = None,
136
+ ):
137
+ unet_is_training = self.unet.training
138
+
139
+ if unet_is_training:
140
+ self.unet.eval()
141
+
142
+ device = self.unet.device
143
+ dtype = self.unet.dtype
144
+ dynamic_tsnr_model = SamplerDynamicTSNR(self.unet)
145
+
146
+ # Batch
147
+
148
+ concat_cond = concat_cond.repeat(batch_size, 1, 1, 1, 1).to(device=device, dtype=dtype) # b, c, t, h, w
149
+ positive_text_cond = positive_text_cond.repeat(batch_size, 1, 1).to(concat_cond) # b, f, c
150
+ negative_text_cond = negative_text_cond.repeat(batch_size, 1, 1).to(concat_cond) # b, f, c
151
+ positive_image_cond = positive_image_cond.repeat(batch_size, 1, 1, 1).to(concat_cond) # b, t, l, c
152
+ negative_image_cond = negative_image_cond.repeat(batch_size, 1, 1, 1).to(concat_cond)
153
+
154
+ if isinstance(fs, torch.Tensor):
155
+ fs = fs.repeat(batch_size, ).to(dtype=torch.long, device=device) # b
156
+ else:
157
+ fs = torch.tensor([fs] * batch_size, dtype=torch.long, device=device) # b
158
+
159
+ # Initial latents
160
+
161
+ latent_shape = concat_cond.shape
162
+
163
+ # Feeds
164
+
165
+ sampler_kwargs = dict(
166
+ cfg_scale=guidance_scale,
167
+ positive=dict(
168
+ context_text=positive_text_cond,
169
+ context_img=positive_image_cond,
170
+ fs=fs,
171
+ concat_cond=concat_cond
172
+ ),
173
+ negative=dict(
174
+ context_text=negative_text_cond,
175
+ context_img=negative_image_cond,
176
+ fs=fs,
177
+ concat_cond=concat_cond
178
+ )
179
+ )
180
+
181
+ # Sample
182
+
183
+ results = dynamic_tsnr_model(latent_shape, steps, extra_args=sampler_kwargs, progress_tqdm=progress_tqdm)
184
+
185
+ if unet_is_training:
186
+ self.unet.train()
187
+
188
+ return results
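A hedged usage sketch for the pipeline (not part of the commit). The repo id below is a placeholder, not an actual checkpoint name, and loading also requires the dependencies used above (transformers, diffusers, xformers).

import torch
from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline

pipe = LatentVideoDiffusionPipeline.from_pretrained('some-user/some-video-model')   # placeholder repo id
pipe = pipe.to('cuda')

positive = pipe.encode_cropped_prompt_77tokens('a cat walking on grass')
negative = pipe.encode_cropped_prompt_77tokens('')
print(positive.shape)   # (1, 77, text encoder hidden size)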
diffusers_vdm/projection.py ADDED
@@ -0,0 +1,160 @@
1
+ # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
2
+ # and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py
3
+ # and https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/resampler.py
4
+
5
+
6
+ import math
7
+ import torch
8
+ import einops
9
+ import torch.nn as nn
10
+
11
+ from huggingface_hub import PyTorchModelHubMixin
12
+
13
+
14
+ class ImageProjModel(nn.Module):
15
+ """Projection Model"""
16
+ def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
17
+ super().__init__()
18
+ self.cross_attention_dim = cross_attention_dim
19
+ self.clip_extra_context_tokens = clip_extra_context_tokens
20
+ self.proj = nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
21
+ self.norm = nn.LayerNorm(cross_attention_dim)
22
+
23
+ def forward(self, image_embeds):
24
+ #embeds = image_embeds
25
+ embeds = image_embeds.type(list(self.proj.parameters())[0].dtype)
26
+ clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
27
+ clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
28
+ return clip_extra_context_tokens
29
+
30
+
31
+ # FFN
32
+ def FeedForward(dim, mult=4):
33
+ inner_dim = int(dim * mult)
34
+ return nn.Sequential(
35
+ nn.LayerNorm(dim),
36
+ nn.Linear(dim, inner_dim, bias=False),
37
+ nn.GELU(),
38
+ nn.Linear(inner_dim, dim, bias=False),
39
+ )
40
+
41
+
42
+ def reshape_tensor(x, heads):
43
+ bs, length, width = x.shape
44
+ #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
45
+ x = x.view(bs, length, heads, -1)
46
+ # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
47
+ x = x.transpose(1, 2)
48
+ # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
49
+ x = x.reshape(bs, heads, length, -1)
50
+ return x
51
+
52
+
53
+ class PerceiverAttention(nn.Module):
54
+ def __init__(self, *, dim, dim_head=64, heads=8):
55
+ super().__init__()
56
+ self.scale = dim_head**-0.5
57
+ self.dim_head = dim_head
58
+ self.heads = heads
59
+ inner_dim = dim_head * heads
60
+
61
+ self.norm1 = nn.LayerNorm(dim)
62
+ self.norm2 = nn.LayerNorm(dim)
63
+
64
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
65
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
66
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
67
+
68
+
69
+ def forward(self, x, latents):
70
+ """
71
+ Args:
72
+ x (torch.Tensor): image features
73
+ shape (b, n1, D)
74
+ latent (torch.Tensor): latent features
75
+ shape (b, n2, D)
76
+ """
77
+ x = self.norm1(x)
78
+ latents = self.norm2(latents)
79
+
80
+ b, l, _ = latents.shape
81
+
82
+ q = self.to_q(latents)
83
+ kv_input = torch.cat((x, latents), dim=-2)
84
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
85
+
86
+ q = reshape_tensor(q, self.heads)
87
+ k = reshape_tensor(k, self.heads)
88
+ v = reshape_tensor(v, self.heads)
89
+
90
+ # attention
91
+ scale = 1 / math.sqrt(math.sqrt(self.dim_head))
92
+ weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
93
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
94
+ out = weight @ v
95
+
96
+ out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
97
+
98
+ return self.to_out(out)
99
+
100
+
101
+ class Resampler(nn.Module, PyTorchModelHubMixin):
102
+ def __init__(
103
+ self,
104
+ dim=1024,
105
+ depth=8,
106
+ dim_head=64,
107
+ heads=16,
108
+ num_queries=8,
109
+ embedding_dim=768,
110
+ output_dim=1024,
111
+ ff_mult=4,
112
+ video_length=16,
113
+ input_frames_length=2,
114
+ ):
115
+ super().__init__()
116
+ self.num_queries = num_queries
117
+ self.video_length = video_length
118
+
119
+ self.latents = nn.Parameter(torch.randn(1, num_queries * video_length, dim) / dim**0.5)
120
+ self.input_pos = nn.Parameter(torch.zeros(1, input_frames_length, 1, embedding_dim))
121
+
122
+ self.proj_in = nn.Linear(embedding_dim, dim)
123
+ self.proj_out = nn.Linear(dim, output_dim)
124
+ self.norm_out = nn.LayerNorm(output_dim)
125
+
126
+ self.layers = nn.ModuleList([])
127
+ for _ in range(depth):
128
+ self.layers.append(
129
+ nn.ModuleList(
130
+ [
131
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
132
+ FeedForward(dim=dim, mult=ff_mult),
133
+ ]
134
+ )
135
+ )
136
+
137
+ def forward(self, x):
138
+ latents = self.latents.repeat(x.size(0), 1, 1)
139
+
140
+ x = x + self.input_pos
141
+ x = einops.rearrange(x, 'b ti d c -> b (ti d) c')
142
+ x = self.proj_in(x)
143
+
144
+ for attn, ff in self.layers:
145
+ latents = attn(x, latents) + latents
146
+ latents = ff(latents) + latents
147
+
148
+ latents = self.proj_out(latents)
149
+ latents = self.norm_out(latents)
150
+
151
+ latents = einops.rearrange(latents, 'b (to l) c -> b to l c', to=self.video_length)
152
+ return latents
153
+
154
+ @property
155
+ def device(self):
156
+ return next(self.parameters()).device
157
+
158
+ @property
159
+ def dtype(self):
160
+ return next(self.parameters()).dtype
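A shape-only sketch of the Resampler (illustrative, untrained weights; the constructor arguments below are assumptions, not the values used by any released checkpoint).

import torch
from diffusers_vdm.projection import Resampler

resampler = Resampler(dim=1024, embedding_dim=1024, output_dim=1024,
                      num_queries=16, video_length=16, input_frames_length=2)
clip_tokens = torch.randn(1, 2, 257, 1024)   # b, input frames, CLIP tokens per frame, embedding_dim
out = resampler(clip_tokens)
print(out.shape)                             # torch.Size([1, 16, 16, 1024]): b, video frames, queries, output_dim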
diffusers_vdm/unet.py ADDED
@@ -0,0 +1,650 @@
1
+ # https://github.com/AILab-CVC/VideoCrafter
2
+ # https://github.com/Doubiiu/DynamiCrafter
3
+ # https://github.com/ToonCrafter/ToonCrafter
4
+ # Then edited by lllyasviel
5
+
6
+ from functools import partial
7
+ from abc import abstractmethod
8
+ import torch
9
+ import math
10
+ import torch.nn as nn
11
+ from einops import rearrange, repeat
12
+ import torch.nn.functional as F
13
+ from diffusers_vdm.basics import checkpoint
14
+ from diffusers_vdm.basics import (
15
+ zero_module,
16
+ conv_nd,
17
+ linear,
18
+ avg_pool_nd,
19
+ normalization
20
+ )
21
+ from diffusers_vdm.attention import SpatialTransformer, TemporalTransformer
22
+ from huggingface_hub import PyTorchModelHubMixin
23
+
24
+
25
+ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
26
+ """
27
+ Create sinusoidal timestep embeddings.
28
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
29
+ These may be fractional.
30
+ :param dim: the dimension of the output.
31
+ :param max_period: controls the minimum frequency of the embeddings.
32
+ :return: an [N x dim] Tensor of positional embeddings.
33
+ """
34
+ if not repeat_only:
35
+ half = dim // 2
36
+ freqs = torch.exp(
37
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
38
+ ).to(device=timesteps.device)
39
+ args = timesteps[:, None].float() * freqs[None]
40
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
41
+ if dim % 2:
42
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
43
+ else:
44
+ embedding = repeat(timesteps, 'b -> b d', d=dim)
45
+ return embedding
46
+
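A quick check of the sinusoidal embedding above (illustrative, not part of the commit; dim=320 is just an example and importing the module assumes the package's dependencies are installed).

import torch
from diffusers_vdm.unet import timestep_embedding

emb = timestep_embedding(torch.tensor([0, 250, 999]), dim=320)
print(emb.shape)   # torch.Size([3, 320]); cosines in the first half, sines in the second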
47
+
48
+ class TimestepBlock(nn.Module):
49
+ """
50
+ Any module where forward() takes timestep embeddings as a second argument.
51
+ """
52
+
53
+ @abstractmethod
54
+ def forward(self, x, emb):
55
+ """
56
+ Apply the module to `x` given `emb` timestep embeddings.
57
+ """
58
+
59
+
60
+ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
61
+ """
62
+ A sequential module that passes timestep embeddings to the children that
63
+ support it as an extra input.
64
+ """
65
+
66
+ def forward(self, x, emb, context=None, batch_size=None):
67
+ for layer in self:
68
+ if isinstance(layer, TimestepBlock):
69
+ x = layer(x, emb, batch_size=batch_size)
70
+ elif isinstance(layer, SpatialTransformer):
71
+ x = layer(x, context)
72
+ elif isinstance(layer, TemporalTransformer):
73
+ x = rearrange(x, '(b f) c h w -> b c f h w', b=batch_size)
74
+ x = layer(x, context)
75
+ x = rearrange(x, 'b c f h w -> (b f) c h w')
76
+ else:
77
+ x = layer(x)
78
+ return x
79
+
80
+
81
+ class Downsample(nn.Module):
82
+ """
83
+ A downsampling layer with an optional convolution.
84
+ :param channels: channels in the inputs and outputs.
85
+ :param use_conv: a bool determining if a convolution is applied.
86
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
87
+ downsampling occurs in the inner-two dimensions.
88
+ """
89
+
90
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
91
+ super().__init__()
92
+ self.channels = channels
93
+ self.out_channels = out_channels or channels
94
+ self.use_conv = use_conv
95
+ self.dims = dims
96
+ stride = 2 if dims != 3 else (1, 2, 2)
97
+ if use_conv:
98
+ self.op = conv_nd(
99
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
100
+ )
101
+ else:
102
+ assert self.channels == self.out_channels
103
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
104
+
105
+ def forward(self, x):
106
+ assert x.shape[1] == self.channels
107
+ return self.op(x)
108
+
109
+
110
+ class Upsample(nn.Module):
111
+ """
112
+ An upsampling layer with an optional convolution.
113
+ :param channels: channels in the inputs and outputs.
114
+ :param use_conv: a bool determining if a convolution is applied.
115
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
116
+ upsampling occurs in the inner-two dimensions.
117
+ """
118
+
119
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
120
+ super().__init__()
121
+ self.channels = channels
122
+ self.out_channels = out_channels or channels
123
+ self.use_conv = use_conv
124
+ self.dims = dims
125
+ if use_conv:
126
+ self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
127
+
128
+ def forward(self, x):
129
+ assert x.shape[1] == self.channels
130
+ if self.dims == 3:
131
+ x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode='nearest')
132
+ else:
133
+ x = F.interpolate(x, scale_factor=2, mode='nearest')
134
+ if self.use_conv:
135
+ x = self.conv(x)
136
+ return x
137
+
138
+
139
+ class ResBlock(TimestepBlock):
140
+ """
141
+ A residual block that can optionally change the number of channels.
142
+ :param channels: the number of input channels.
143
+ :param emb_channels: the number of timestep embedding channels.
144
+ :param dropout: the rate of dropout.
145
+ :param out_channels: if specified, the number of out channels.
146
+ :param use_conv: if True and out_channels is specified, use a spatial
147
+ convolution instead of a smaller 1x1 convolution to change the
148
+ channels in the skip connection.
149
+ :param dims: determines if the signal is 1D, 2D, or 3D.
150
+ :param up: if True, use this block for upsampling.
151
+ :param down: if True, use this block for downsampling.
152
+ :param use_temporal_conv: if True, use the temporal convolution.
153
+ :param use_image_dataset: if True, the temporal parameters will not be optimized.
154
+ """
155
+
156
+ def __init__(
157
+ self,
158
+ channels,
159
+ emb_channels,
160
+ dropout,
161
+ out_channels=None,
162
+ use_scale_shift_norm=False,
163
+ dims=2,
164
+ use_checkpoint=False,
165
+ use_conv=False,
166
+ up=False,
167
+ down=False,
168
+ use_temporal_conv=False,
169
+ tempspatial_aware=False
170
+ ):
171
+ super().__init__()
172
+ self.channels = channels
173
+ self.emb_channels = emb_channels
174
+ self.dropout = dropout
175
+ self.out_channels = out_channels or channels
176
+ self.use_conv = use_conv
177
+ self.use_checkpoint = use_checkpoint
178
+ self.use_scale_shift_norm = use_scale_shift_norm
179
+ self.use_temporal_conv = use_temporal_conv
180
+
181
+ self.in_layers = nn.Sequential(
182
+ normalization(channels),
183
+ nn.SiLU(),
184
+ conv_nd(dims, channels, self.out_channels, 3, padding=1),
185
+ )
186
+
187
+ self.updown = up or down
188
+
189
+ if up:
190
+ self.h_upd = Upsample(channels, False, dims)
191
+ self.x_upd = Upsample(channels, False, dims)
192
+ elif down:
193
+ self.h_upd = Downsample(channels, False, dims)
194
+ self.x_upd = Downsample(channels, False, dims)
195
+ else:
196
+ self.h_upd = self.x_upd = nn.Identity()
197
+
198
+ self.emb_layers = nn.Sequential(
199
+ nn.SiLU(),
200
+ nn.Linear(
201
+ emb_channels,
202
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
203
+ ),
204
+ )
205
+ self.out_layers = nn.Sequential(
206
+ normalization(self.out_channels),
207
+ nn.SiLU(),
208
+ nn.Dropout(p=dropout),
209
+ zero_module(nn.Conv2d(self.out_channels, self.out_channels, 3, padding=1)),
210
+ )
211
+
212
+ if self.out_channels == channels:
213
+ self.skip_connection = nn.Identity()
214
+ elif use_conv:
215
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
216
+ else:
217
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
218
+
219
+ if self.use_temporal_conv:
220
+ self.temopral_conv = TemporalConvBlock(
221
+ self.out_channels,
222
+ self.out_channels,
223
+ dropout=0.1,
224
+ spatial_aware=tempspatial_aware
225
+ )
226
+
227
+ def forward(self, x, emb, batch_size=None):
228
+ """
229
+ Apply the block to a Tensor, conditioned on a timestep embedding.
230
+ :param x: an [N x C x ...] Tensor of features.
231
+ :param emb: an [N x emb_channels] Tensor of timestep embeddings.
232
+ :return: an [N x C x ...] Tensor of outputs.
233
+ """
234
+ input_tuple = (x, emb)
235
+ if batch_size:
236
+ forward_batchsize = partial(self._forward, batch_size=batch_size)
237
+ return checkpoint(forward_batchsize, input_tuple, self.parameters(), self.use_checkpoint)
238
+ return checkpoint(self._forward, input_tuple, self.parameters(), self.use_checkpoint)
239
+
240
+ def _forward(self, x, emb, batch_size=None):
241
+ if self.updown:
242
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
243
+ h = in_rest(x)
244
+ h = self.h_upd(h)
245
+ x = self.x_upd(x)
246
+ h = in_conv(h)
247
+ else:
248
+ h = self.in_layers(x)
249
+ emb_out = self.emb_layers(emb).type(h.dtype)
250
+ while len(emb_out.shape) < len(h.shape):
251
+ emb_out = emb_out[..., None]
252
+ if self.use_scale_shift_norm:
253
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
254
+ scale, shift = torch.chunk(emb_out, 2, dim=1)
255
+ h = out_norm(h) * (1 + scale) + shift
256
+ h = out_rest(h)
257
+ else:
258
+ h = h + emb_out
259
+ h = self.out_layers(h)
260
+ h = self.skip_connection(x) + h
261
+
262
+ if self.use_temporal_conv and batch_size:
263
+ h = rearrange(h, '(b t) c h w -> b c t h w', b=batch_size)
264
+ h = self.temopral_conv(h)
265
+ h = rearrange(h, 'b c t h w -> (b t) c h w')
266
+ return h
267
+
268
+
269
+ class TemporalConvBlock(nn.Module):
270
+ """
271
+ Adapted from modelscope: https://github.com/modelscope/modelscope/blob/master/modelscope/models/multi_modal/video_synthesis/unet_sd.py
272
+ """
273
+
274
+ def __init__(self, in_channels, out_channels=None, dropout=0.0, spatial_aware=False):
275
+ super(TemporalConvBlock, self).__init__()
276
+ if out_channels is None:
277
+ out_channels = in_channels
278
+ self.in_channels = in_channels
279
+ self.out_channels = out_channels
280
+ th_kernel_shape = (3, 1, 1) if not spatial_aware else (3, 3, 1)
281
+ th_padding_shape = (1, 0, 0) if not spatial_aware else (1, 1, 0)
282
+ tw_kernel_shape = (3, 1, 1) if not spatial_aware else (3, 1, 3)
283
+ tw_padding_shape = (1, 0, 0) if not spatial_aware else (1, 0, 1)
284
+
285
+ # conv layers
286
+ self.conv1 = nn.Sequential(
287
+ nn.GroupNorm(32, in_channels), nn.SiLU(),
288
+ nn.Conv3d(in_channels, out_channels, th_kernel_shape, padding=th_padding_shape))
289
+ self.conv2 = nn.Sequential(
290
+ nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
291
+ nn.Conv3d(out_channels, in_channels, tw_kernel_shape, padding=tw_padding_shape))
292
+ self.conv3 = nn.Sequential(
293
+ nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
294
+ nn.Conv3d(out_channels, in_channels, th_kernel_shape, padding=th_padding_shape))
295
+ self.conv4 = nn.Sequential(
296
+ nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
297
+ nn.Conv3d(out_channels, in_channels, tw_kernel_shape, padding=tw_padding_shape))
298
+
299
+ # zero out the last layer params, so the conv block is identity
300
+ nn.init.zeros_(self.conv4[-1].weight)
301
+ nn.init.zeros_(self.conv4[-1].bias)
302
+
303
+ def forward(self, x):
304
+ identity = x
305
+ x = self.conv1(x)
306
+ x = self.conv2(x)
307
+ x = self.conv3(x)
308
+ x = self.conv4(x)
309
+
310
+ return identity + x
311
+
312
+
313
+ class UNet3DModel(nn.Module, PyTorchModelHubMixin):
314
+ """
315
+ The full UNet model with attention and timestep embedding.
316
+ :param in_channels: channels in the input Tensor.
317
+ :param model_channels: base channel count for the model.
318
+ :param out_channels: channels in the output Tensor.
319
+ :param num_res_blocks: number of residual blocks per downsample.
320
+ :param attention_resolutions: a collection of downsample rates at which
321
+ attention will take place. May be a set, list, or tuple.
322
+ For example, if this contains 4, then at 4x downsampling, attention
323
+ will be used.
324
+ :param dropout: the dropout probability.
325
+ :param channel_mult: channel multiplier for each level of the UNet.
326
+ :param conv_resample: if True, use learned convolutions for upsampling and
327
+ downsampling.
328
+ :param dims: determines if the signal is 1D, 2D, or 3D.
329
+ :param num_classes: if specified (as an int), then this model will be
330
+ class-conditional with `num_classes` classes.
331
+ :param use_checkpoint: use gradient checkpointing to reduce memory usage.
332
+ :param num_heads: the number of attention heads in each attention layer.
333
+ :param num_head_channels: if specified, ignore num_heads and instead use
334
+ a fixed channel width per attention head.
335
+ :param num_heads_upsample: works with num_heads to set a different number
336
+ of heads for upsampling. Deprecated.
337
+ :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
338
+ :param resblock_updown: use residual blocks for up/downsampling.
339
+ :param use_new_attention_order: use a different attention pattern for potentially
340
+ increased efficiency.
341
+ """
342
+
343
+ def __init__(self,
344
+ in_channels,
345
+ model_channels,
346
+ out_channels,
347
+ num_res_blocks,
348
+ attention_resolutions,
349
+ dropout=0.0,
350
+ channel_mult=(1, 2, 4, 8),
351
+ conv_resample=True,
352
+ dims=2,
353
+ context_dim=None,
354
+ use_scale_shift_norm=False,
355
+ resblock_updown=False,
356
+ num_heads=-1,
357
+ num_head_channels=-1,
358
+ transformer_depth=1,
359
+ use_linear=False,
360
+ temporal_conv=False,
361
+ tempspatial_aware=False,
362
+ temporal_attention=True,
363
+ use_relative_position=True,
364
+ use_causal_attention=False,
365
+ temporal_length=None,
366
+ addition_attention=False,
367
+ temporal_selfatt_only=True,
368
+ image_cross_attention=False,
369
+ image_cross_attention_scale_learnable=False,
370
+ default_fs=4,
371
+ fs_condition=False,
372
+ ):
373
+ super(UNet3DModel, self).__init__()
374
+ if num_heads == -1:
375
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
376
+ if num_head_channels == -1:
377
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
378
+
379
+ self.in_channels = in_channels
380
+ self.model_channels = model_channels
381
+ self.out_channels = out_channels
382
+ self.num_res_blocks = num_res_blocks
383
+ self.attention_resolutions = attention_resolutions
384
+ self.dropout = dropout
385
+ self.channel_mult = channel_mult
386
+ self.conv_resample = conv_resample
387
+ self.temporal_attention = temporal_attention
388
+ time_embed_dim = model_channels * 4
389
+ self.use_checkpoint = use_checkpoint = False # moved to self.enable_gradient_checkpointing()
390
+ temporal_self_att_only = True
391
+ self.addition_attention = addition_attention
392
+ self.temporal_length = temporal_length
393
+ self.image_cross_attention = image_cross_attention
394
+ self.image_cross_attention_scale_learnable = image_cross_attention_scale_learnable
395
+ self.default_fs = default_fs
396
+ self.fs_condition = fs_condition
397
+
398
+ ## Time embedding blocks
399
+ self.time_embed = nn.Sequential(
400
+ linear(model_channels, time_embed_dim),
401
+ nn.SiLU(),
402
+ linear(time_embed_dim, time_embed_dim),
403
+ )
404
+ if fs_condition:
405
+ self.fps_embedding = nn.Sequential(
406
+ linear(model_channels, time_embed_dim),
407
+ nn.SiLU(),
408
+ linear(time_embed_dim, time_embed_dim),
409
+ )
410
+ nn.init.zeros_(self.fps_embedding[-1].weight)
411
+ nn.init.zeros_(self.fps_embedding[-1].bias)
412
+ ## Input Block
413
+ self.input_blocks = nn.ModuleList(
414
+ [
415
+ TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1))
416
+ ]
417
+ )
418
+ if self.addition_attention:
419
+ self.init_attn = TimestepEmbedSequential(
420
+ TemporalTransformer(
421
+ model_channels,
422
+ n_heads=8,
423
+ d_head=num_head_channels,
424
+ depth=transformer_depth,
425
+ context_dim=context_dim,
426
+ use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only,
427
+ causal_attention=False, relative_position=use_relative_position,
428
+ temporal_length=temporal_length))
429
+
430
+ input_block_chans = [model_channels]
431
+ ch = model_channels
432
+ ds = 1
433
+ for level, mult in enumerate(channel_mult):
434
+ for _ in range(num_res_blocks):
435
+ layers = [
436
+ ResBlock(ch, time_embed_dim, dropout,
437
+ out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
438
+ use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
439
+ use_temporal_conv=temporal_conv
440
+ )
441
+ ]
442
+ ch = mult * model_channels
443
+ if ds in attention_resolutions:
444
+ if num_head_channels == -1:
445
+ dim_head = ch // num_heads
446
+ else:
447
+ num_heads = ch // num_head_channels
448
+ dim_head = num_head_channels
449
+ layers.append(
450
+ SpatialTransformer(ch, num_heads, dim_head,
451
+ depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
452
+ use_checkpoint=use_checkpoint, disable_self_attn=False,
453
+ video_length=temporal_length,
454
+ image_cross_attention=self.image_cross_attention,
455
+ image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable,
456
+ )
457
+ )
458
+ if self.temporal_attention:
459
+ layers.append(
460
+ TemporalTransformer(ch, num_heads, dim_head,
461
+ depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
462
+ use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
463
+ causal_attention=use_causal_attention,
464
+ relative_position=use_relative_position,
465
+ temporal_length=temporal_length
466
+ )
467
+ )
468
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
469
+ input_block_chans.append(ch)
470
+ if level != len(channel_mult) - 1:
471
+ out_ch = ch
472
+ self.input_blocks.append(
473
+ TimestepEmbedSequential(
474
+ ResBlock(ch, time_embed_dim, dropout,
475
+ out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
476
+ use_scale_shift_norm=use_scale_shift_norm,
477
+ down=True
478
+ )
479
+ if resblock_updown
480
+ else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
481
+ )
482
+ )
483
+ ch = out_ch
484
+ input_block_chans.append(ch)
485
+ ds *= 2
486
+
487
+ if num_head_channels == -1:
488
+ dim_head = ch // num_heads
489
+ else:
490
+ num_heads = ch // num_head_channels
491
+ dim_head = num_head_channels
492
+ layers = [
493
+ ResBlock(ch, time_embed_dim, dropout,
494
+ dims=dims, use_checkpoint=use_checkpoint,
495
+ use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
496
+ use_temporal_conv=temporal_conv
497
+ ),
498
+ SpatialTransformer(ch, num_heads, dim_head,
499
+ depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
500
+ use_checkpoint=use_checkpoint, disable_self_attn=False, video_length=temporal_length,
501
+ image_cross_attention=self.image_cross_attention,
502
+ image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable
503
+ )
504
+ ]
505
+ if self.temporal_attention:
506
+ layers.append(
507
+ TemporalTransformer(ch, num_heads, dim_head,
508
+ depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
509
+ use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
510
+ causal_attention=use_causal_attention, relative_position=use_relative_position,
511
+ temporal_length=temporal_length
512
+ )
513
+ )
514
+ layers.append(
515
+ ResBlock(ch, time_embed_dim, dropout,
516
+ dims=dims, use_checkpoint=use_checkpoint,
517
+ use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
518
+ use_temporal_conv=temporal_conv
519
+ )
520
+ )
521
+
522
+ ## Middle Block
523
+ self.middle_block = TimestepEmbedSequential(*layers)
524
+
525
+ ## Output Block
526
+ self.output_blocks = nn.ModuleList([])
527
+ for level, mult in list(enumerate(channel_mult))[::-1]:
528
+ for i in range(num_res_blocks + 1):
529
+ ich = input_block_chans.pop()
530
+ layers = [
531
+ ResBlock(ch + ich, time_embed_dim, dropout,
532
+ out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
533
+ use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
534
+ use_temporal_conv=temporal_conv
535
+ )
536
+ ]
537
+ ch = model_channels * mult
538
+ if ds in attention_resolutions:
539
+ if num_head_channels == -1:
540
+ dim_head = ch // num_heads
541
+ else:
542
+ num_heads = ch // num_head_channels
543
+ dim_head = num_head_channels
544
+ layers.append(
545
+ SpatialTransformer(ch, num_heads, dim_head,
546
+ depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
547
+ use_checkpoint=use_checkpoint, disable_self_attn=False,
548
+ video_length=temporal_length,
549
+ image_cross_attention=self.image_cross_attention,
550
+ image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable
551
+ )
552
+ )
553
+ if self.temporal_attention:
554
+ layers.append(
555
+ TemporalTransformer(ch, num_heads, dim_head,
556
+ depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
557
+ use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
558
+ causal_attention=use_causal_attention,
559
+ relative_position=use_relative_position,
560
+ temporal_length=temporal_length
561
+ )
562
+ )
563
+ if level and i == num_res_blocks:
564
+ out_ch = ch
565
+ layers.append(
566
+ ResBlock(ch, time_embed_dim, dropout,
567
+ out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
568
+ use_scale_shift_norm=use_scale_shift_norm,
569
+ up=True
570
+ )
571
+ if resblock_updown
572
+ else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
573
+ )
574
+ ds //= 2
575
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
576
+
577
+ self.out = nn.Sequential(
578
+ normalization(ch),
579
+ nn.SiLU(),
580
+ zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
581
+ )
582
+
583
+ @property
584
+ def device(self):
585
+ return next(self.parameters()).device
586
+
587
+ @property
588
+ def dtype(self):
589
+ return next(self.parameters()).dtype
590
+
591
+ def forward(self, x, timesteps, context_text=None, context_img=None, concat_cond=None, fs=None, **kwargs):
592
+ b, _, t, _, _ = x.shape
593
+
594
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).type(x.dtype)
595
+ emb = self.time_embed(t_emb)
596
+
597
+ context_text = context_text.repeat_interleave(repeats=t, dim=0)
598
+ context_img = rearrange(context_img, 'b t l c -> (b t) l c')
599
+
600
+ context = (context_text, context_img)
601
+
602
+ emb = emb.repeat_interleave(repeats=t, dim=0)
603
+
604
+ if concat_cond is not None:
605
+ x = torch.cat([x, concat_cond], dim=1)
606
+
607
+ ## always in shape (b t) c h w, except for temporal layer
608
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
609
+
610
+ ## combine emb
611
+ if self.fs_condition:
612
+ if fs is None:
613
+ fs = torch.tensor(
614
+ [self.default_fs] * b, dtype=torch.long, device=x.device)
615
+ fs_emb = timestep_embedding(fs, self.model_channels, repeat_only=False).type(x.dtype)
616
+
617
+ fs_embed = self.fps_embedding(fs_emb)
618
+ fs_embed = fs_embed.repeat_interleave(repeats=t, dim=0)
619
+ emb = emb + fs_embed
620
+
621
+ h = x
622
+ hs = []
623
+ for id, module in enumerate(self.input_blocks):
624
+ h = module(h, emb, context=context, batch_size=b)
625
+ if id == 0 and self.addition_attention:
626
+ h = self.init_attn(h, emb, context=context, batch_size=b)
627
+ hs.append(h)
628
+
629
+ h = self.middle_block(h, emb, context=context, batch_size=b)
630
+
631
+ for module in self.output_blocks:
632
+ h = torch.cat([h, hs.pop()], dim=1)
633
+ h = module(h, emb, context=context, batch_size=b)
634
+ h = h.type(x.dtype)
635
+ y = self.out(h)
636
+
637
+ y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
638
+ return y
639
+
640
+ def enable_gradient_checkpointing(self, enable=True, verbose=False):
641
+ for k, v in self.named_modules():
642
+ if hasattr(v, 'checkpoint'):
643
+ v.checkpoint = enable
644
+ if verbose:
645
+ print(f'{k}.checkpoint = {enable}')
646
+ if hasattr(v, 'use_checkpoint'):
647
+ v.use_checkpoint = enable
648
+ if verbose:
649
+ print(f'{k}.use_checkpoint = {enable}')
650
+ return
diffusers_vdm/utils.py ADDED
@@ -0,0 +1,43 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import einops
5
+ import torchvision
6
+
7
+
8
+ def resize_and_center_crop(image, target_width, target_height, interpolation=cv2.INTER_AREA):
9
+ original_height, original_width = image.shape[:2]
10
+ k = max(target_height / original_height, target_width / original_width)
11
+ new_width = int(round(original_width * k))
12
+ new_height = int(round(original_height * k))
13
+ resized_image = cv2.resize(image, (new_width, new_height), interpolation=interpolation)
14
+ x_start = (new_width - target_width) // 2
15
+ y_start = (new_height - target_height) // 2
16
+ cropped_image = resized_image[y_start:y_start + target_height, x_start:x_start + target_width]
17
+ return cropped_image
18
+
19
+
20
+ def save_bcthw_as_mp4(x, output_filename, fps=10):
21
+ b, c, t, h, w = x.shape
22
+
23
+ per_row = b
24
+ for p in [6, 5, 4, 3, 2]:
25
+ if b % p == 0:
26
+ per_row = p
27
+ break
28
+
29
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
30
+ x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
31
+ x = x.detach().cpu().to(torch.uint8)
32
+ x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
33
+ torchvision.io.write_video(output_filename, x, fps=fps, video_codec='h264', options={'crf': '1'})
34
+ return x
35
+
36
+
37
+ def save_bcthw_as_png(x, output_filename):
38
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
39
+ x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
40
+ x = x.detach().cpu().to(torch.uint8)
41
+ x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
42
+ torchvision.io.write_png(x, output_filename)
43
+ return output_filename
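A usage sketch (illustrative values, not part of the commit; writing h264 via torchvision.io.write_video requires PyAV/ffmpeg to be available).

import torch
from diffusers_vdm.utils import save_bcthw_as_mp4

video = torch.rand(1, 3, 16, 64, 64) * 2.0 - 1.0   # b c t h w in [-1, 1], matching the clamp above
save_bcthw_as_mp4(video, './results/demo.mp4', fps=10)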
diffusers_vdm/vae.py ADDED
@@ -0,0 +1,826 @@
1
+ # video VAE with components from several repos
2
+ # collected by lvmin
3
+
4
+
5
+ import torch
6
+ import xformers.ops
7
+ import torch.nn as nn
8
+
9
+ from einops import rearrange, repeat
10
+ from diffusers_vdm.basics import default, exists, zero_module, conv_nd, linear, normalization
11
+ from diffusers_vdm.unet import Upsample, Downsample
12
+ from huggingface_hub import PyTorchModelHubMixin
13
+
14
+
15
+ def chunked_attention(q, k, v, batch_chunk=0):
16
+ # if batch_chunk > 0 and not torch.is_grad_enabled():
17
+ # batch_size = q.size(0)
18
+ # chunks = [slice(i, i + batch_chunk) for i in range(0, batch_size, batch_chunk)]
19
+ #
20
+ # out_chunks = []
21
+ # for chunk in chunks:
22
+ # q_chunk = q[chunk]
23
+ # k_chunk = k[chunk]
24
+ # v_chunk = v[chunk]
25
+ #
26
+ # out_chunk = torch.nn.functional.scaled_dot_product_attention(
27
+ # q_chunk, k_chunk, v_chunk, attn_mask=None
28
+ # )
29
+ # out_chunks.append(out_chunk)
30
+ #
31
+ # out = torch.cat(out_chunks, dim=0)
32
+ # else:
33
+ # out = torch.nn.functional.scaled_dot_product_attention(
34
+ # q, k, v, attn_mask=None
35
+ # )
36
+ out = xformers.ops.memory_efficient_attention(q, k, v)
37
+ return out
38
+
39
+
40
+ def nonlinearity(x):
41
+ return x * torch.sigmoid(x)
42
+
43
+
44
+ def GroupNorm(in_channels, num_groups=32):
45
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
46
+
47
+
48
+ class DiagonalGaussianDistribution:
49
+ def __init__(self, parameters, deterministic=False):
50
+ self.parameters = parameters
51
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
52
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
53
+ self.deterministic = deterministic
54
+ self.std = torch.exp(0.5 * self.logvar)
55
+ self.var = torch.exp(self.logvar)
56
+ if self.deterministic:
57
+ self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
58
+
59
+ def sample(self, noise=None):
60
+ if noise is None:
61
+ noise = torch.randn(self.mean.shape)
62
+
63
+ x = self.mean + self.std * noise.to(device=self.parameters.device)
64
+ return x
65
+
66
+ def mode(self):
67
+ return self.mean
68
+
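A minimal sketch of the posterior object (illustrative shapes, not part of the commit; importing this module assumes xformers is installed).

import torch
from diffusers_vdm.vae import DiagonalGaussianDistribution

params = torch.randn(1, 8, 32, 32)        # mean and log-variance stacked along dim 1 (2 * z_channels)
posterior = DiagonalGaussianDistribution(params)
print(posterior.mode().shape)             # torch.Size([1, 4, 32, 32]), the deterministic latent used by the pipeline
print(posterior.sample().shape)           # torch.Size([1, 4, 32, 32]), a reparameterized sample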
69
+
70
+ class EncoderDownSampleBlock(nn.Module):
71
+ def __init__(self, in_channels, with_conv):
72
+ super().__init__()
73
+ self.with_conv = with_conv
74
+ self.in_channels = in_channels
75
+ if self.with_conv:
76
+ self.conv = torch.nn.Conv2d(in_channels,
77
+ in_channels,
78
+ kernel_size=3,
79
+ stride=2,
80
+ padding=0)
81
+
82
+ def forward(self, x):
83
+ if self.with_conv:
84
+ pad = (0, 1, 0, 1)
85
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
86
+ x = self.conv(x)
87
+ else:
88
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
89
+ return x
90
+
91
+
92
+ class ResnetBlock(nn.Module):
93
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
94
+ dropout, temb_channels=512):
95
+ super().__init__()
96
+ self.in_channels = in_channels
97
+ out_channels = in_channels if out_channels is None else out_channels
98
+ self.out_channels = out_channels
99
+ self.use_conv_shortcut = conv_shortcut
100
+
101
+ self.norm1 = GroupNorm(in_channels)
102
+ self.conv1 = torch.nn.Conv2d(in_channels,
103
+ out_channels,
104
+ kernel_size=3,
105
+ stride=1,
106
+ padding=1)
107
+ if temb_channels > 0:
108
+ self.temb_proj = torch.nn.Linear(temb_channels,
109
+ out_channels)
110
+ self.norm2 = GroupNorm(out_channels)
111
+ self.dropout = torch.nn.Dropout(dropout)
112
+ self.conv2 = torch.nn.Conv2d(out_channels,
113
+ out_channels,
114
+ kernel_size=3,
115
+ stride=1,
116
+ padding=1)
117
+ if self.in_channels != self.out_channels:
118
+ if self.use_conv_shortcut:
119
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
120
+ out_channels,
121
+ kernel_size=3,
122
+ stride=1,
123
+ padding=1)
124
+ else:
125
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
126
+ out_channels,
127
+ kernel_size=1,
128
+ stride=1,
129
+ padding=0)
130
+
131
+ def forward(self, x, temb):
132
+ h = x
133
+ h = self.norm1(h)
134
+ h = nonlinearity(h)
135
+ h = self.conv1(h)
136
+
137
+ if temb is not None:
138
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
139
+
140
+ h = self.norm2(h)
141
+ h = nonlinearity(h)
142
+ h = self.dropout(h)
143
+ h = self.conv2(h)
144
+
145
+ if self.in_channels != self.out_channels:
146
+ if self.use_conv_shortcut:
147
+ x = self.conv_shortcut(x)
148
+ else:
149
+ x = self.nin_shortcut(x)
150
+
151
+ return x + h
152
+
153
+
154
+ class Encoder(nn.Module):
155
+ def __init__(self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks,
156
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
157
+ resolution, z_channels, double_z=True, **kwargs):
158
+ super().__init__()
159
+ self.ch = ch
160
+ self.temb_ch = 0
161
+ self.num_resolutions = len(ch_mult)
162
+ self.num_res_blocks = num_res_blocks
163
+ self.resolution = resolution
164
+ self.in_channels = in_channels
165
+
166
+ # downsampling
167
+ self.conv_in = torch.nn.Conv2d(in_channels,
168
+ self.ch,
169
+ kernel_size=3,
170
+ stride=1,
171
+ padding=1)
172
+
173
+ curr_res = resolution
174
+ in_ch_mult = (1,) + tuple(ch_mult)
175
+ self.in_ch_mult = in_ch_mult
176
+ self.down = nn.ModuleList()
177
+ for i_level in range(self.num_resolutions):
178
+ block = nn.ModuleList()
179
+ attn = nn.ModuleList()
180
+ block_in = ch * in_ch_mult[i_level]
181
+ block_out = ch * ch_mult[i_level]
182
+ for i_block in range(self.num_res_blocks):
183
+ block.append(ResnetBlock(in_channels=block_in,
184
+ out_channels=block_out,
185
+ temb_channels=self.temb_ch,
186
+ dropout=dropout))
187
+ block_in = block_out
188
+ if curr_res in attn_resolutions:
189
+ attn.append(Attention(block_in))
190
+ down = nn.Module()
191
+ down.block = block
192
+ down.attn = attn
193
+ if i_level != self.num_resolutions - 1:
194
+ down.downsample = EncoderDownSampleBlock(block_in, resamp_with_conv)
195
+ curr_res = curr_res // 2
196
+ self.down.append(down)
197
+
198
+ # middle
199
+ self.mid = nn.Module()
200
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
201
+ out_channels=block_in,
202
+ temb_channels=self.temb_ch,
203
+ dropout=dropout)
204
+ self.mid.attn_1 = Attention(block_in)
205
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
206
+ out_channels=block_in,
207
+ temb_channels=self.temb_ch,
208
+ dropout=dropout)
209
+
210
+ # end
211
+ self.norm_out = GroupNorm(block_in)
212
+ self.conv_out = torch.nn.Conv2d(block_in,
213
+ 2 * z_channels if double_z else z_channels,
214
+ kernel_size=3,
215
+ stride=1,
216
+ padding=1)
217
+
218
+ def forward(self, x, return_hidden_states=False):
219
+ # timestep embedding
220
+ temb = None
221
+
222
+ # print(f'encoder-input={x.shape}')
223
+ # downsampling
224
+ hs = [self.conv_in(x)]
225
+
226
+ ## if we return hidden states for decoder usage, we will store them in a list
227
+ if return_hidden_states:
228
+ hidden_states = []
229
+ # print(f'encoder-conv in feat={hs[0].shape}')
230
+ for i_level in range(self.num_resolutions):
231
+ for i_block in range(self.num_res_blocks):
232
+ h = self.down[i_level].block[i_block](hs[-1], temb)
233
+ # print(f'encoder-down feat={h.shape}')
234
+ if len(self.down[i_level].attn) > 0:
235
+ h = self.down[i_level].attn[i_block](h)
236
+ hs.append(h)
237
+ if return_hidden_states:
238
+ hidden_states.append(h)
239
+ if i_level != self.num_resolutions - 1:
240
+ # print(f'encoder-downsample (input)={hs[-1].shape}')
241
+ hs.append(self.down[i_level].downsample(hs[-1]))
242
+ # print(f'encoder-downsample (output)={hs[-1].shape}')
243
+ if return_hidden_states:
244
+ hidden_states.append(hs[0])
245
+ # middle
246
+ h = hs[-1]
247
+ h = self.mid.block_1(h, temb)
248
+ # print(f'encoder-mid1 feat={h.shape}')
249
+ h = self.mid.attn_1(h)
250
+ h = self.mid.block_2(h, temb)
251
+ # print(f'encoder-mid2 feat={h.shape}')
252
+
253
+ # end
254
+ h = self.norm_out(h)
255
+ h = nonlinearity(h)
256
+ h = self.conv_out(h)
257
+ # print(f'end feat={h.shape}')
258
+ if return_hidden_states:
259
+ return h, hidden_states
260
+ else:
261
+ return h
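A rough shape sketch of the encoder with hidden-state passthrough, using a small illustrative config (a CUDA device is effectively required because the attention layers above go through xformers):

enc = Encoder(ch=64, out_ch=3, ch_mult=(1, 2, 4), num_res_blocks=1,
              attn_resolutions=[], in_channels=3, resolution=64, z_channels=4).cuda()
x = torch.randn(1, 3, 64, 64, device='cuda')
z, hidden = enc(x, return_hidden_states=True)
print(z.shape)                            # (1, 8, 16, 16): double_z doubles z_channels
print([tuple(h.shape) for h in hidden])   # per-level features, finest first, with the conv_in feature appended last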
262
+
263
+
264
+ class ConvCombiner(nn.Module):
265
+ def __init__(self, ch):
266
+ super().__init__()
267
+ self.conv = nn.Conv2d(ch, ch, 1, padding=0)
268
+
269
+ nn.init.zeros_(self.conv.weight)
270
+ nn.init.zeros_(self.conv.bias)
271
+
272
+ def forward(self, x, context):
273
+ ## x: b c h w, context: b c 2 h w
274
+ b, c, l, h, w = context.shape
275
+ bt, c, h, w = x.shape
276
+ context = rearrange(context, "b c l h w -> (b l) c h w")
277
+ context = self.conv(context)
278
+ context = rearrange(context, "(b l) c h w -> b c l h w", l=l)
279
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=bt // b)
280
+ x[:, :, 0] = x[:, :, 0] + context[:, :, 0]
281
+ x[:, :, -1] = x[:, :, -1] + context[:, :, -1]
282
+ x = rearrange(x, "b c t h w -> (b t) c h w")
283
+ return x
284
+
285
+
286
+ class AttentionCombiner(nn.Module):
287
+ def __init__(
288
+ self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs
289
+ ):
290
+ super().__init__()
291
+
292
+ inner_dim = dim_head * heads
293
+ context_dim = default(context_dim, query_dim)
294
+
295
+ self.heads = heads
296
+ self.dim_head = dim_head
297
+
298
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
299
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
300
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
301
+
302
+ self.to_out = nn.Sequential(
303
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
304
+ )
305
+ self.attention_op = None
306
+
307
+ self.norm = GroupNorm(query_dim)
308
+ nn.init.zeros_(self.to_out[0].weight)
309
+ nn.init.zeros_(self.to_out[0].bias)
310
+
311
+ def forward(
312
+ self,
313
+ x,
314
+ context=None,
315
+ mask=None,
316
+ ):
317
+ bt, c, h, w = x.shape
318
+ h_ = self.norm(x)
319
+ h_ = rearrange(h_, "b c h w -> b (h w) c")
320
+ q = self.to_q(h_)
321
+
322
+ b, c, l, h, w = context.shape
323
+ context = rearrange(context, "b c l h w -> (b l) (h w) c")
324
+ k = self.to_k(context)
325
+ v = self.to_v(context)
326
+
327
+ t = bt // b
328
+ k = repeat(k, "(b l) d c -> (b t) (l d) c", l=l, t=t)
329
+ v = repeat(v, "(b l) d c -> (b t) (l d) c", l=l, t=t)
330
+
331
+ b, _, _ = q.shape
332
+ q, k, v = map(
333
+ lambda t: t.unsqueeze(3)
334
+ .reshape(b, t.shape[1], self.heads, self.dim_head)
335
+ .permute(0, 2, 1, 3)
336
+ .reshape(b * self.heads, t.shape[1], self.dim_head)
337
+ .contiguous(),
338
+ (q, k, v),
339
+ )
340
+
341
+ out = chunked_attention(
342
+ q, k, v, batch_chunk=1
343
+ )
344
+
345
+ if exists(mask):
346
+ raise NotImplementedError
347
+
348
+ out = (
349
+ out.unsqueeze(0)
350
+ .reshape(b, self.heads, out.shape[1], self.dim_head)
351
+ .permute(0, 2, 1, 3)
352
+ .reshape(b, out.shape[1], self.heads * self.dim_head)
353
+ )
354
+ out = self.to_out(out)
355
+ out = rearrange(out, "bt (h w) c -> bt c h w", h=h, w=w, c=c)
356
+ return x + out
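The manual unsqueeze/reshape/permute sequence above is the usual multi-head split and merge; an equivalent einops formulation (illustrative only, `rearrange` is already imported in this file) would be:

def split_heads(t, heads):
    # (b, n, heads * d) -> (b * heads, n, d)
    return rearrange(t, "b n (h d) -> (b h) n d", h=heads)

def merge_heads(t, heads):
    # (b * heads, n, d) -> (b, n, heads * d)
    return rearrange(t, "(b h) n d -> b n (h d)", h=heads)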
357
+
358
+
359
+ class Attention(nn.Module):
360
+ def __init__(self, in_channels):
361
+ super().__init__()
362
+ self.in_channels = in_channels
363
+
364
+ self.norm = GroupNorm(in_channels)
365
+ self.q = torch.nn.Conv2d(
366
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
367
+ )
368
+ self.k = torch.nn.Conv2d(
369
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
370
+ )
371
+ self.v = torch.nn.Conv2d(
372
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
373
+ )
374
+ self.proj_out = torch.nn.Conv2d(
375
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
376
+ )
377
+
378
+ def attention(self, h_: torch.Tensor) -> torch.Tensor:
379
+ h_ = self.norm(h_)
380
+ q = self.q(h_)
381
+ k = self.k(h_)
382
+ v = self.v(h_)
383
+
384
+ # compute attention
385
+ B, C, H, W = q.shape
386
+ q, k, v = map(lambda x: rearrange(x, "b c h w -> b (h w) c"), (q, k, v))
387
+
388
+ q, k, v = map(
389
+ lambda t: t.unsqueeze(3)
390
+ .reshape(B, t.shape[1], 1, C)
391
+ .permute(0, 2, 1, 3)
392
+ .reshape(B * 1, t.shape[1], C)
393
+ .contiguous(),
394
+ (q, k, v),
395
+ )
396
+
397
+ out = chunked_attention(
398
+ q, k, v, batch_chunk=1
399
+ )
400
+
401
+ out = (
402
+ out.unsqueeze(0)
403
+ .reshape(B, 1, out.shape[1], C)
404
+ .permute(0, 2, 1, 3)
405
+ .reshape(B, out.shape[1], C)
406
+ )
407
+ return rearrange(out, "b (h w) c -> b c h w", b=B, h=H, w=W, c=C)
408
+
409
+ def forward(self, x, **kwargs):
410
+ h_ = x
411
+ h_ = self.attention(h_)
412
+ h_ = self.proj_out(h_)
413
+ return x + h_
414
+
415
+
416
+ class VideoDecoder(nn.Module):
417
+ def __init__(
418
+ self,
419
+ *,
420
+ ch,
421
+ out_ch,
422
+ ch_mult=(1, 2, 4, 8),
423
+ num_res_blocks,
424
+ attn_resolutions,
425
+ dropout=0.0,
426
+ resamp_with_conv=True,
427
+ in_channels,
428
+ resolution,
429
+ z_channels,
430
+ give_pre_end=False,
431
+ tanh_out=False,
432
+ use_linear_attn=False,
433
+ attn_level=[2, 3],
434
+ video_kernel_size=[3, 1, 1],
435
+ alpha: float = 0.0,
436
+ merge_strategy: str = "learned",
437
+ **kwargs,
438
+ ):
439
+ super().__init__()
440
+ self.video_kernel_size = video_kernel_size
441
+ self.alpha = alpha
442
+ self.merge_strategy = merge_strategy
443
+ self.ch = ch
444
+ self.temb_ch = 0
445
+ self.num_resolutions = len(ch_mult)
446
+ self.num_res_blocks = num_res_blocks
447
+ self.resolution = resolution
448
+ self.in_channels = in_channels
449
+ self.give_pre_end = give_pre_end
450
+ self.tanh_out = tanh_out
451
+ self.attn_level = attn_level
452
+ # compute in_ch_mult, block_in and curr_res at lowest res
453
+ in_ch_mult = (1,) + tuple(ch_mult)
454
+ block_in = ch * ch_mult[self.num_resolutions - 1]
455
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
456
+ self.z_shape = (1, z_channels, curr_res, curr_res)
457
+
458
+ # z to block_in
459
+ self.conv_in = torch.nn.Conv2d(
460
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
461
+ )
462
+
463
+ # middle
464
+ self.mid = nn.Module()
465
+ self.mid.block_1 = VideoResBlock(
466
+ in_channels=block_in,
467
+ out_channels=block_in,
468
+ temb_channels=self.temb_ch,
469
+ dropout=dropout,
470
+ video_kernel_size=self.video_kernel_size,
471
+ alpha=self.alpha,
472
+ merge_strategy=self.merge_strategy,
473
+ )
474
+ self.mid.attn_1 = Attention(block_in)
475
+ self.mid.block_2 = VideoResBlock(
476
+ in_channels=block_in,
477
+ out_channels=block_in,
478
+ temb_channels=self.temb_ch,
479
+ dropout=dropout,
480
+ video_kernel_size=self.video_kernel_size,
481
+ alpha=self.alpha,
482
+ merge_strategy=self.merge_strategy,
483
+ )
484
+
485
+ # upsampling
486
+ self.up = nn.ModuleList()
487
+ self.attn_refinement = nn.ModuleList()
488
+ for i_level in reversed(range(self.num_resolutions)):
489
+ block = nn.ModuleList()
490
+ attn = nn.ModuleList()
491
+ block_out = ch * ch_mult[i_level]
492
+ for i_block in range(self.num_res_blocks + 1):
493
+ block.append(
494
+ VideoResBlock(
495
+ in_channels=block_in,
496
+ out_channels=block_out,
497
+ temb_channels=self.temb_ch,
498
+ dropout=dropout,
499
+ video_kernel_size=self.video_kernel_size,
500
+ alpha=self.alpha,
501
+ merge_strategy=self.merge_strategy,
502
+ )
503
+ )
504
+ block_in = block_out
505
+ if curr_res in attn_resolutions:
506
+ attn.append(Attention(block_in))
507
+ up = nn.Module()
508
+ up.block = block
509
+ up.attn = attn
510
+ if i_level != 0:
511
+ up.upsample = Upsample(block_in, resamp_with_conv)
512
+ curr_res = curr_res * 2
513
+ self.up.insert(0, up) # prepend to get consistent order
514
+
515
+ if i_level in self.attn_level:
516
+ self.attn_refinement.insert(0, AttentionCombiner(block_in))
517
+ else:
518
+ self.attn_refinement.insert(0, ConvCombiner(block_in))
519
+ # end
520
+ self.norm_out = GroupNorm(block_in)
521
+ self.attn_refinement.append(ConvCombiner(block_in))
522
+ self.conv_out = DecoderConv3D(
523
+ block_in, out_ch, kernel_size=3, stride=1, padding=1, video_kernel_size=self.video_kernel_size
524
+ )
525
+
526
+ def forward(self, z, ref_context=None, **kwargs):
527
+ ## ref_context: b c 2 h w, where the 2 frames are the starting and ending reference frames
528
+ # assert z.shape[1:] == self.z_shape[1:]
529
+ self.last_z_shape = z.shape
530
+ # timestep embedding
531
+ temb = None
532
+
533
+ # z to block_in
534
+ h = self.conv_in(z)
535
+
536
+ # middle
537
+ h = self.mid.block_1(h, temb, **kwargs)
538
+ h = self.mid.attn_1(h, **kwargs)
539
+ h = self.mid.block_2(h, temb, **kwargs)
540
+
541
+ # upsampling
542
+ for i_level in reversed(range(self.num_resolutions)):
543
+ for i_block in range(self.num_res_blocks + 1):
544
+ h = self.up[i_level].block[i_block](h, temb, **kwargs)
545
+ if len(self.up[i_level].attn) > 0:
546
+ h = self.up[i_level].attn[i_block](h, **kwargs)
547
+ if ref_context:
548
+ h = self.attn_refinement[i_level](x=h, context=ref_context[i_level])
549
+ if i_level != 0:
550
+ h = self.up[i_level].upsample(h)
551
+
552
+ # end
553
+ if self.give_pre_end:
554
+ return h
555
+
556
+ h = self.norm_out(h)
557
+ h = nonlinearity(h)
558
+ if ref_context:
559
+ # print(h.shape, ref_context[i_level].shape) #torch.Size([8, 128, 256, 256]) torch.Size([1, 128, 2, 256, 256])
560
+ h = self.attn_refinement[-1](x=h, context=ref_context[-1])
561
+ h = self.conv_out(h, **kwargs)
562
+ if self.tanh_out:
563
+ h = torch.tanh(h)
564
+ return h
565
+
566
+
567
+ class TimeStackBlock(torch.nn.Module):
568
+ def __init__(
569
+ self,
570
+ channels: int,
571
+ emb_channels: int,
572
+ dropout: float,
573
+ out_channels: int = None,
574
+ use_conv: bool = False,
575
+ use_scale_shift_norm: bool = False,
576
+ dims: int = 2,
577
+ use_checkpoint: bool = False,
578
+ up: bool = False,
579
+ down: bool = False,
580
+ kernel_size: int = 3,
581
+ exchange_temb_dims: bool = False,
582
+ skip_t_emb: bool = False,
583
+ ):
584
+ super().__init__()
585
+ self.channels = channels
586
+ self.emb_channels = emb_channels
587
+ self.dropout = dropout
588
+ self.out_channels = out_channels or channels
589
+ self.use_conv = use_conv
590
+ self.use_checkpoint = use_checkpoint
591
+ self.use_scale_shift_norm = use_scale_shift_norm
592
+ self.exchange_temb_dims = exchange_temb_dims
593
+
594
+ if isinstance(kernel_size, list):
595
+ padding = [k // 2 for k in kernel_size]
596
+ else:
597
+ padding = kernel_size // 2
598
+
599
+ self.in_layers = nn.Sequential(
600
+ normalization(channels),
601
+ nn.SiLU(),
602
+ conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding),
603
+ )
604
+
605
+ self.updown = up or down
606
+
607
+ if up:
608
+ self.h_upd = Upsample(channels, False, dims)
609
+ self.x_upd = Upsample(channels, False, dims)
610
+ elif down:
611
+ self.h_upd = Downsample(channels, False, dims)
612
+ self.x_upd = Downsample(channels, False, dims)
613
+ else:
614
+ self.h_upd = self.x_upd = nn.Identity()
615
+
616
+ self.skip_t_emb = skip_t_emb
617
+ self.emb_out_channels = (
618
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels
619
+ )
620
+ if self.skip_t_emb:
621
+ # print(f"Skipping timestep embedding in {self.__class__.__name__}")
622
+ assert not self.use_scale_shift_norm
623
+ self.emb_layers = None
624
+ self.exchange_temb_dims = False
625
+ else:
626
+ self.emb_layers = nn.Sequential(
627
+ nn.SiLU(),
628
+ linear(
629
+ emb_channels,
630
+ self.emb_out_channels,
631
+ ),
632
+ )
633
+
634
+ self.out_layers = nn.Sequential(
635
+ normalization(self.out_channels),
636
+ nn.SiLU(),
637
+ nn.Dropout(p=dropout),
638
+ zero_module(
639
+ conv_nd(
640
+ dims,
641
+ self.out_channels,
642
+ self.out_channels,
643
+ kernel_size,
644
+ padding=padding,
645
+ )
646
+ ),
647
+ )
648
+
649
+ if self.out_channels == channels:
650
+ self.skip_connection = nn.Identity()
651
+ elif use_conv:
652
+ self.skip_connection = conv_nd(
653
+ dims, channels, self.out_channels, kernel_size, padding=padding
654
+ )
655
+ else:
656
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
657
+
658
+ def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
659
+ if self.updown:
660
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
661
+ h = in_rest(x)
662
+ h = self.h_upd(h)
663
+ x = self.x_upd(x)
664
+ h = in_conv(h)
665
+ else:
666
+ h = self.in_layers(x)
667
+
668
+ if self.skip_t_emb:
669
+ emb_out = torch.zeros_like(h)
670
+ else:
671
+ emb_out = self.emb_layers(emb).type(h.dtype)
672
+ while len(emb_out.shape) < len(h.shape):
673
+ emb_out = emb_out[..., None]
674
+ if self.use_scale_shift_norm:
675
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
676
+ scale, shift = torch.chunk(emb_out, 2, dim=1)
677
+ h = out_norm(h) * (1 + scale) + shift
678
+ h = out_rest(h)
679
+ else:
680
+ if self.exchange_temb_dims:
681
+ emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
682
+ h = h + emb_out
683
+ h = self.out_layers(h)
684
+ return self.skip_connection(x) + h
685
+
686
+
687
+ class VideoResBlock(ResnetBlock):
688
+ def __init__(
689
+ self,
690
+ out_channels,
691
+ *args,
692
+ dropout=0.0,
693
+ video_kernel_size=3,
694
+ alpha=0.0,
695
+ merge_strategy="learned",
696
+ **kwargs,
697
+ ):
698
+ super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)
699
+ if video_kernel_size is None:
700
+ video_kernel_size = [3, 1, 1]
701
+ self.time_stack = TimeStackBlock(
702
+ channels=out_channels,
703
+ emb_channels=0,
704
+ dropout=dropout,
705
+ dims=3,
706
+ use_scale_shift_norm=False,
707
+ use_conv=False,
708
+ up=False,
709
+ down=False,
710
+ kernel_size=video_kernel_size,
711
+ use_checkpoint=True,
712
+ skip_t_emb=True,
713
+ )
714
+
715
+ self.merge_strategy = merge_strategy
716
+ if self.merge_strategy == "fixed":
717
+ self.register_buffer("mix_factor", torch.Tensor([alpha]))
718
+ elif self.merge_strategy == "learned":
719
+ self.register_parameter(
720
+ "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
721
+ )
722
+ else:
723
+ raise ValueError(f"unknown merge strategy {self.merge_strategy}")
724
+
725
+ def get_alpha(self, bs):
726
+ if self.merge_strategy == "fixed":
727
+ return self.mix_factor
728
+ elif self.merge_strategy == "learned":
729
+ return torch.sigmoid(self.mix_factor)
730
+ else:
731
+ raise NotImplementedError()
732
+
733
+ def forward(self, x, temb, skip_video=False, timesteps=None):
734
+ assert isinstance(timesteps, int)
735
+
736
+ b, c, h, w = x.shape
737
+
738
+ x = super().forward(x, temb)
739
+
740
+ if not skip_video:
741
+ x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
742
+
743
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
744
+
745
+ x = self.time_stack(x, temb)
746
+
747
+ alpha = self.get_alpha(bs=b // timesteps)
748
+ x = alpha * x + (1.0 - alpha) * x_mix
749
+
750
+ x = rearrange(x, "b c t h w -> (b t) c h w")
751
+ return x
752
+
753
+
754
+ class DecoderConv3D(torch.nn.Conv2d):
755
+ def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
756
+ super().__init__(in_channels, out_channels, *args, **kwargs)
757
+ if isinstance(video_kernel_size, list):
758
+ padding = [int(k // 2) for k in video_kernel_size]
759
+ else:
760
+ padding = int(video_kernel_size // 2)
761
+
762
+ self.time_mix_conv = torch.nn.Conv3d(
763
+ in_channels=out_channels,
764
+ out_channels=out_channels,
765
+ kernel_size=video_kernel_size,
766
+ padding=padding,
767
+ )
768
+
769
+ def forward(self, input, timesteps, skip_video=False):
770
+ x = super().forward(input)
771
+ if skip_video:
772
+ return x
773
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
774
+ x = self.time_mix_conv(x)
775
+ return rearrange(x, "b c t h w -> (b t) c h w")
776
+
777
+
778
+ class VideoAutoencoderKL(torch.nn.Module, PyTorchModelHubMixin):
779
+ def __init__(self,
780
+ double_z=True,
781
+ z_channels=4,
782
+ resolution=256,
783
+ in_channels=3,
784
+ out_ch=3,
785
+ ch=128,
786
+ ch_mult=[],
787
+ num_res_blocks=2,
788
+ attn_resolutions=[],
789
+ dropout=0.0,
790
+ ):
791
+ super().__init__()
792
+ self.encoder = Encoder(double_z=double_z, z_channels=z_channels, resolution=resolution, in_channels=in_channels,
793
+ out_ch=out_ch, ch=ch, ch_mult=ch_mult, num_res_blocks=num_res_blocks,
794
+ attn_resolutions=attn_resolutions, dropout=dropout)
795
+ self.decoder = VideoDecoder(double_z=double_z, z_channels=z_channels, resolution=resolution,
796
+ in_channels=in_channels, out_ch=out_ch, ch=ch, ch_mult=ch_mult,
797
+ num_res_blocks=num_res_blocks, attn_resolutions=attn_resolutions, dropout=dropout)
798
+ self.quant_conv = torch.nn.Conv2d(2 * z_channels, 2 * z_channels, 1)
799
+ self.post_quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
800
+ self.scale_factor = 0.18215
801
+
802
+ def encode(self, x, return_hidden_states=False, **kwargs):
803
+ if return_hidden_states:
804
+ h, hidden = self.encoder(x, return_hidden_states)
805
+ moments = self.quant_conv(h)
806
+ posterior = DiagonalGaussianDistribution(moments)
807
+ return posterior, hidden
808
+ else:
809
+ h = self.encoder(x)
810
+ moments = self.quant_conv(h)
811
+ posterior = DiagonalGaussianDistribution(moments)
812
+ return posterior, None
813
+
814
+ def decode(self, z, **kwargs):
815
+ if len(kwargs) == 0:
816
+ z = self.post_quant_conv(z)
817
+ dec = self.decoder(z, **kwargs)
818
+ return dec
819
+
820
+ @property
821
+ def device(self):
822
+ return next(self.parameters()).device
823
+
824
+ @property
825
+ def dtype(self):
826
+ return next(self.parameters()).dtype
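A hedged usage sketch of the autoencoder above; the config and variable names are illustrative and the real call sites live in `diffusers_vdm.pipeline`. As written, `decode` applies `post_quant_conv` only when called with no keyword arguments, while the video decoder itself needs a `timesteps` keyword, so the caller presumably handles that projection when passing kwargs. A CUDA device is effectively required since the attention layers call xformers.

vae = VideoAutoencoderKL(ch_mult=[1, 2, 4, 4], attn_resolutions=[]).cuda()   # illustrative config
frames = torch.randn(16, 3, 64, 64, device='cuda')   # (batch * time, 3, H, W), here batch=1, time=16
posterior, hidden = vae.encode(frames, return_hidden_states=True)
z = posterior.sample()                                # (16, 4, 8, 8): three downsamples for a 4-level ch_mult
recon = vae.decode(z, timesteps=16, ref_context=None) # kwargs present -> post_quant_conv is skipped above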
gradio_app.py ADDED
@@ -0,0 +1,321 @@
1
+ import os
2
+
3
+ os.environ['HF_HOME'] = os.path.join(os.path.dirname(__file__), 'hf_download')
4
+ result_dir = os.path.join('./', 'results')
5
+ os.makedirs(result_dir, exist_ok=True)
6
+
7
+
8
+ import functools
9
+ import os
10
+ import random
11
+ import gradio as gr
12
+ import numpy as np
13
+ import torch
14
+ import wd14tagger
15
+ import memory_management
16
+ import uuid
17
+
18
+ from PIL import Image
19
+ from diffusers_helper.code_cond import unet_add_coded_conds
20
+ from diffusers_helper.cat_cond import unet_add_concat_conds
21
+ from diffusers_helper.k_diffusion import KDiffusionSampler
22
+ from diffusers import AutoencoderKL, UNet2DConditionModel
23
+ from diffusers.models.attention_processor import AttnProcessor2_0
24
+ from transformers import CLIPTextModel, CLIPTokenizer
25
+ from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline
26
+ from diffusers_vdm.utils import resize_and_center_crop, save_bcthw_as_mp4
27
+
28
+
29
+ class ModifiedUNet(UNet2DConditionModel):
30
+ @classmethod
31
+ def from_config(cls, *args, **kwargs):
32
+ m = super().from_config(*args, **kwargs)
33
+ unet_add_concat_conds(unet=m, new_channels=4)
34
+ unet_add_coded_conds(unet=m, added_number_count=1)
35
+ return m
36
+
37
+
38
+ model_name = 'lllyasviel/paints_undo_single_frame'
39
+ tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
40
+ text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16)
41
+ vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16) # bfloat16 vae
42
+ unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16)
43
+
44
+ unet.set_attn_processor(AttnProcessor2_0())
45
+ vae.set_attn_processor(AttnProcessor2_0())
46
+
47
+ video_pipe = LatentVideoDiffusionPipeline.from_pretrained(
48
+ 'lllyasviel/paints_undo_multi_frame',
49
+ fp16=True
50
+ )
51
+
52
+ memory_management.unload_all_models([
53
+ video_pipe.unet, video_pipe.vae, video_pipe.text_encoder, video_pipe.image_projection, video_pipe.image_encoder,
54
+ unet, vae, text_encoder
55
+ ])
56
+
57
+ k_sampler = KDiffusionSampler(
58
+ unet=unet,
59
+ timesteps=1000,
60
+ linear_start=0.00085,
61
+ linear_end=0.020,
62
+ linear=True
63
+ )
64
+
65
+
66
+ def find_best_bucket(h, w, options):
67
+ min_metric = float('inf')
68
+ best_bucket = None
69
+ for (bucket_h, bucket_w) in options:
70
+ metric = abs(h * bucket_w - w * bucket_h)
71
+ if metric <= min_metric:
72
+ min_metric = metric
73
+ best_bucket = (bucket_h, bucket_w)
74
+ return best_bucket
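`find_best_bucket` picks the resolution bucket whose aspect ratio best matches the input by minimizing |h * bucket_w - w * bucket_h|; ties go to the later bucket. For example:

# a portrait input of height 1280 and width 720 maps to the tallest bucket
print(find_best_bucket(1280, 720, options=[(320, 512), (384, 448), (448, 384), (512, 320)]))
# -> (512, 320)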
75
+
76
+
77
+ @torch.inference_mode()
78
+ def encode_cropped_prompt_77tokens(txt: str):
79
+ memory_management.load_models_to_gpu(text_encoder)
80
+ cond_ids = tokenizer(txt,
81
+ padding="max_length",
82
+ max_length=tokenizer.model_max_length,
83
+ truncation=True,
84
+ return_tensors="pt").input_ids.to(device=text_encoder.device)
85
+ text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
86
+ return text_cond
87
+
88
+
89
+ @torch.inference_mode()
90
+ def pytorch2numpy(imgs):
91
+ results = []
92
+ for x in imgs:
93
+ y = x.movedim(0, -1)
94
+ y = y * 127.5 + 127.5
95
+ y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
96
+ results.append(y)
97
+ return results
98
+
99
+
100
+ @torch.inference_mode()
101
+ def numpy2pytorch(imgs):
102
+ h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
103
+ h = h.movedim(-1, 1)
104
+ return h
105
+
106
+
107
+ def resize_without_crop(image, target_width, target_height):
108
+ pil_image = Image.fromarray(image)
109
+ resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
110
+ return np.array(resized_image)
111
+
112
+
113
+ @torch.inference_mode()
114
+ def interrogator_process(x):
115
+ return wd14tagger.default_interrogator(x)
116
+
117
+
118
+ @torch.inference_mode()
119
+ def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg,
120
+ progress=gr.Progress()):
121
+ rng = torch.Generator(device=memory_management.gpu).manual_seed(int(seed))
122
+
123
+ memory_management.load_models_to_gpu(vae)
124
+ fg = resize_and_center_crop(input_fg, image_width, image_height)
125
+ concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
126
+ concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
127
+
128
+ memory_management.load_models_to_gpu(text_encoder)
129
+ conds = encode_cropped_prompt_77tokens(prompt)
130
+ unconds = encode_cropped_prompt_77tokens(n_prompt)
131
+
132
+ memory_management.load_models_to_gpu(unet)
133
+ fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
134
+ initial_latents = torch.zeros_like(concat_conds)
135
+ concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
136
+ latents = k_sampler(
137
+ initial_latent=initial_latents,
138
+ strength=1.0,
139
+ num_inference_steps=steps,
140
+ guidance_scale=cfg,
141
+ batch_size=len(input_undo_steps),
142
+ generator=rng,
143
+ prompt_embeds=conds,
144
+ negative_prompt_embeds=unconds,
145
+ cross_attention_kwargs={'concat_conds': concat_conds, 'coded_conds': fs},
146
+ same_noise_in_batch=True,
147
+ progress_tqdm=functools.partial(progress.tqdm, desc='Generating Key Frames')
148
+ ).to(vae.dtype) / vae.config.scaling_factor
149
+
150
+ memory_management.load_models_to_gpu(vae)
151
+ pixels = vae.decode(latents).sample
152
+ pixels = pytorch2numpy(pixels)
153
+ pixels = [fg] + pixels + [np.zeros_like(fg) + 255]
154
+
155
+ return pixels
156
+
157
+
158
+ @torch.inference_mode()
159
+ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=7.5, fs=3, progress_tqdm=None):
160
+ random.seed(seed)
161
+ np.random.seed(seed)
162
+ torch.manual_seed(seed)
163
+ torch.cuda.manual_seed_all(seed)
164
+
165
+ frames = 16
166
+
167
+ target_height, target_width = find_best_bucket(
168
+ image_1.shape[0], image_1.shape[1],
169
+ options=[(320, 512), (384, 448), (448, 384), (512, 320)]
170
+ )
171
+
172
+ image_1 = resize_and_center_crop(image_1, target_width=target_width, target_height=target_height)
173
+ image_2 = resize_and_center_crop(image_2, target_width=target_width, target_height=target_height)
174
+ input_frames = numpy2pytorch([image_1, image_2])
175
+ input_frames = input_frames.unsqueeze(0).movedim(1, 2)
176
+
177
+ memory_management.load_models_to_gpu(video_pipe.text_encoder)
178
+ positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
179
+ negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")
180
+
181
+ memory_management.load_models_to_gpu([video_pipe.image_projection, video_pipe.image_encoder])
182
+ input_frames = input_frames.to(device=video_pipe.image_encoder.device, dtype=video_pipe.image_encoder.dtype)
183
+ positive_image_cond = video_pipe.encode_clip_vision(input_frames)
184
+ positive_image_cond = video_pipe.image_projection(positive_image_cond)
185
+ negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
186
+ negative_image_cond = video_pipe.image_projection(negative_image_cond)
187
+
188
+ memory_management.load_models_to_gpu([video_pipe.vae])
189
+ input_frames = input_frames.to(device=video_pipe.vae.device, dtype=video_pipe.vae.dtype)
190
+ input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
191
+ first_frame = input_frame_latents[:, :, 0]
192
+ last_frame = input_frame_latents[:, :, 1]
193
+ concat_cond = torch.stack([first_frame] + [torch.zeros_like(first_frame)] * (frames - 2) + [last_frame], dim=2)
194
+
195
+ memory_management.load_models_to_gpu([video_pipe.unet])
196
+ latents = video_pipe(
197
+ batch_size=1,
198
+ steps=int(steps),
199
+ guidance_scale=cfg_scale,
200
+ positive_text_cond=positive_text_cond,
201
+ negative_text_cond=negative_text_cond,
202
+ positive_image_cond=positive_image_cond,
203
+ negative_image_cond=negative_image_cond,
204
+ concat_cond=concat_cond,
205
+ fs=fs,
206
+ progress_tqdm=progress_tqdm
207
+ )
208
+
209
+ memory_management.load_models_to_gpu([video_pipe.vae])
210
+ video = video_pipe.decode_latents(latents, vae_hidden_states)
211
+ return video, image_1, image_2
212
+
213
+
214
+ @torch.inference_mode()
215
+ def process_video(keyframes, prompt, steps, cfg, fps, seed, progress=gr.Progress()):
216
+ result_frames = []
217
+ cropped_images = []
218
+
219
+ for i, (im1, im2) in enumerate(zip(keyframes[:-1], keyframes[1:])):
220
+ im1 = np.array(Image.open(im1[0]))
221
+ im2 = np.array(Image.open(im2[0]))
222
+ frames, im1, im2 = process_video_inner(
223
+ im1, im2, prompt, seed=seed + i, steps=steps, cfg_scale=cfg, fs=3,
224
+ progress_tqdm=functools.partial(progress.tqdm, desc=f'Generating Videos ({i + 1}/{len(keyframes) - 1})')
225
+ )
226
+ result_frames.append(frames[:, :, :-1, :, :])
227
+ cropped_images.append([im1, im2])
228
+
229
+ video = torch.cat(result_frames, dim=2)
230
+ video = torch.flip(video, dims=[2])
231
+
232
+ uuid_name = str(uuid.uuid4())
233
+ output_filename = os.path.join(result_dir, uuid_name + '.mp4')
234
+ Image.fromarray(cropped_images[0][0]).save(os.path.join(result_dir, uuid_name + '.png'))
235
+ video = save_bcthw_as_mp4(video, output_filename, fps=fps)
236
+ video = [x.cpu().numpy() for x in video]
237
+ return output_filename, video
238
+
239
+
240
+ block = gr.Blocks().queue()
241
+ with block:
242
+ gr.Markdown('# Paints-Undo')
243
+
244
+ with gr.Accordion(label='Step 1: Upload Image and Generate Prompt', open=True):
245
+ with gr.Row():
246
+ with gr.Column():
247
+ input_fg = gr.Image(sources=['upload'], type="numpy", label="Image", height=512)
248
+ with gr.Column():
249
+ prompt_gen_button = gr.Button(value="Generate Prompt", interactive=False)
250
+ prompt = gr.Textbox(label="Output Prompt", interactive=True)
251
+
252
+ with gr.Accordion(label='Step 2: Generate Key Frames', open=True):
253
+ with gr.Row():
254
+ with gr.Column():
255
+ input_undo_steps = gr.Dropdown(label="Operation Steps", value=[400, 600, 800, 900, 950, 999],
256
+ choices=list(range(1000)), multiselect=True)
257
+ seed = gr.Slider(label='Stage 1 Seed', minimum=0, maximum=50000, step=1, value=12345)
258
+ image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
259
+ image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)
260
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=50, step=1)
261
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=3.0, step=0.01)
262
+ n_prompt = gr.Textbox(label="Negative Prompt",
263
+ value='lowres, bad anatomy, bad hands, cropped, worst quality')
264
+
265
+ with gr.Column():
266
+ key_gen_button = gr.Button(value="Generate Key Frames", interactive=False)
267
+ result_gallery = gr.Gallery(height=512, object_fit='contain', label='Outputs', columns=4)
268
+
269
+ with gr.Accordion(label='Step 3: Generate All Videos', open=True):
270
+ with gr.Row():
271
+ with gr.Column():
272
+ i2v_input_text = gr.Text(label='Prompts', value='1girl, masterpiece, best quality')
273
+ i2v_seed = gr.Slider(label='Stage 2 Seed', minimum=0, maximum=50000, step=1, value=123)
274
+ i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5,
275
+ elem_id="i2v_cfg_scale")
276
+ i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps",
277
+ label="Sampling steps", value=50)
278
+ i2v_fps = gr.Slider(minimum=1, maximum=30, step=1, elem_id="i2v_motion", label="FPS", value=4)
279
+ with gr.Column():
280
+ i2v_end_btn = gr.Button("Generate Video", interactive=False)
281
+ i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True,
282
+ show_share_button=True, height=512)
283
+ with gr.Row():
284
+ i2v_output_images = gr.Gallery(height=512, label="Output Frames", object_fit="contain", columns=8)
285
+
286
+ input_fg.change(lambda: ["", gr.update(interactive=True), gr.update(interactive=False), gr.update(interactive=False)],
287
+ outputs=[prompt, prompt_gen_button, key_gen_button, i2v_end_btn])
288
+
289
+ prompt_gen_button.click(
290
+ fn=interrogator_process,
291
+ inputs=[input_fg],
292
+ outputs=[prompt]
293
+ ).then(lambda: [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=False)],
294
+ outputs=[prompt_gen_button, key_gen_button, i2v_end_btn])
295
+
296
+ key_gen_button.click(
297
+ fn=process,
298
+ inputs=[input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg],
299
+ outputs=[result_gallery]
300
+ ).then(lambda: [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)],
301
+ outputs=[prompt_gen_button, key_gen_button, i2v_end_btn])
302
+
303
+ i2v_end_btn.click(
304
+ inputs=[result_gallery, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_fps, i2v_seed],
305
+ outputs=[i2v_output_video, i2v_output_images],
306
+ fn=process_video
307
+ )
308
+
309
+ dbs = [
310
+ ['./imgs/1.jpg', 12345, 123],
311
+ ['./imgs/2.jpg', 37000, 12345],
312
+ ['./imgs/3.jpg', 3000, 3000],
313
+ ]
314
+
315
+ gr.Examples(
316
+ examples=dbs,
317
+ inputs=[input_fg, seed, i2v_seed],
318
+ examples_per_page=1024
319
+ )
320
+
321
+ block.queue().launch(server_name='0.0.0.0')
imgs/1.jpg ADDED
imgs/2.jpg ADDED
imgs/3.jpg ADDED
memory_management.py ADDED
@@ -0,0 +1,67 @@
1
+ import torch
2
+ from contextlib import contextmanager
3
+
4
+
5
+ high_vram = False
6
+ gpu = torch.device('cuda')
7
+ cpu = torch.device('cpu')
8
+
9
+ torch.zeros((1, 1)).to(gpu, torch.float32)
10
+ torch.cuda.empty_cache()
11
+
12
+ models_in_gpu = []
13
+
14
+
15
+ @contextmanager
16
+ def movable_bnb_model(m):
17
+ if hasattr(m, 'quantization_method'):
18
+ m.quantization_method_backup = m.quantization_method
19
+ del m.quantization_method
20
+ try:
21
+ yield None
22
+ finally:
23
+ if hasattr(m, 'quantization_method_backup'):
24
+ m.quantization_method = m.quantization_method_backup
25
+ del m.quantization_method_backup
26
+ return
27
+
28
+
29
+ def load_models_to_gpu(models):
30
+ global models_in_gpu
31
+
32
+ if not isinstance(models, (tuple, list)):
33
+ models = [models]
34
+
35
+ models_to_remain = [m for m in set(models) if m in models_in_gpu]
36
+ models_to_load = [m for m in set(models) if m not in models_in_gpu]
37
+ models_to_unload = [m for m in set(models_in_gpu) if m not in models_to_remain]
38
+
39
+ if not high_vram:
40
+ for m in models_to_unload:
41
+ with movable_bnb_model(m):
42
+ m.to(cpu)
43
+ print('Unload to CPU:', m.__class__.__name__)
44
+ models_in_gpu = models_to_remain
45
+
46
+ for m in models_to_load:
47
+ with movable_bnb_model(m):
48
+ m.to(gpu)
49
+ print('Load to GPU:', m.__class__.__name__)
50
+
51
+ models_in_gpu = list(set(models_in_gpu + models))
52
+ torch.cuda.empty_cache()
53
+ return
54
+
55
+
56
+ def unload_all_models(extra_models=None):
57
+ global models_in_gpu
58
+
59
+ if extra_models is None:
60
+ extra_models = []
61
+
62
+ if not isinstance(extra_models, (tuple, list)):
63
+ extra_models = [extra_models]
64
+
65
+ models_in_gpu = list(set(models_in_gpu + extra_models))
66
+
67
+ return load_models_to_gpu([])
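A minimal sketch of the swap pattern this module implements (toy modules used for illustration): with `high_vram=False`, loading a new set of models moves anything not in that set back to the CPU.

import torch.nn as nn

a, b = nn.Linear(4, 4), nn.Linear(4, 4)
load_models_to_gpu(a)          # a -> GPU
load_models_to_gpu(b)          # a is offloaded to CPU, b -> GPU
load_models_to_gpu([a, b])     # both on GPU
unload_all_models()            # everything back to CPU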
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ diffusers==0.28.0
2
+ transformers==4.41.1
3
+ gradio==4.31.5
4
+ bitsandbytes==0.43.1
5
+ accelerate==0.30.1
6
+ protobuf==3.20
7
+ opencv-python
8
+ tensorboardX
9
+ safetensors
10
+ pillow
11
+ einops
12
+ torch
13
+ peft
14
+ xformers
15
+ onnxruntime
16
+ av
wd14tagger.py ADDED
@@ -0,0 +1,105 @@
1
+ # https://huggingface.co/spaces/SmilingWolf/wd-v1-4-tags
2
+
3
+
4
+ import os
5
+ import csv
6
+ import numpy as np
7
+ import onnxruntime as ort
8
+
9
+ from PIL import Image
10
+ from onnxruntime import InferenceSession
11
+ from torch.hub import download_url_to_file
12
+
13
+
14
+ global_model = None
15
+ global_csv = None
16
+
17
+
18
+ def download_model(url, local_path):
19
+ if os.path.exists(local_path):
20
+ return local_path
21
+
22
+ temp_path = local_path + '.tmp'
23
+ download_url_to_file(url=url, dst=temp_path)
24
+ os.rename(temp_path, local_path)
25
+ return local_path
26
+
27
+
28
+ def default_interrogator(image, threshold=0.35, character_threshold=0.85, exclude_tags=""):
29
+ global global_model, global_csv
30
+
31
+ model_name = "wd-v1-4-moat-tagger-v2"
32
+
33
+ model_onnx_filename = download_model(
34
+ url=f'https://huggingface.co/lllyasviel/misc/resolve/main/{model_name}.onnx',
35
+ local_path=f'./{model_name}.onnx',
36
+ )
37
+
38
+ model_csv_filename = download_model(
39
+ url=f'https://huggingface.co/lllyasviel/misc/resolve/main/{model_name}.csv',
40
+ local_path=f'./{model_name}.csv',
41
+ )
42
+
43
+ if global_model is not None:
44
+ model = global_model
45
+ else:
46
+ # assert 'CUDAExecutionProvider' in ort.get_available_providers(), 'CUDA Install Failed!'
47
+ # model = InferenceSession(model_onnx_filename, providers=['CUDAExecutionProvider'])
48
+ model = InferenceSession(model_onnx_filename, providers=['CPUExecutionProvider'])
49
+ global_model = model
50
+
51
+ input = model.get_inputs()[0]
52
+ height = input.shape[1]
53
+
54
+ if isinstance(image, str):
55
+ image = Image.open(image) # RGB
56
+ elif isinstance(image, np.ndarray):
57
+ image = Image.fromarray(image)
58
+ else:
59
+ image = image
60
+
61
+ ratio = float(height) / max(image.size)
62
+ new_size = tuple([int(x*ratio) for x in image.size])
63
+ image = image.resize(new_size, Image.LANCZOS)
64
+ square = Image.new("RGB", (height, height), (255, 255, 255))
65
+ square.paste(image, ((height-new_size[0])//2, (height-new_size[1])//2))
66
+
67
+ image = np.array(square).astype(np.float32)
68
+ image = image[:, :, ::-1] # RGB -> BGR
69
+ image = np.expand_dims(image, 0)
70
+
71
+ if global_csv is not None:
72
+ csv_lines = global_csv
73
+ else:
74
+ csv_lines = []
75
+ with open(model_csv_filename) as f:
76
+ reader = csv.reader(f)
77
+ next(reader)
78
+ for row in reader:
79
+ csv_lines.append(row)
80
+ global_csv = csv_lines
81
+
82
+ tags = []
83
+ general_index = None
84
+ character_index = None
85
+ for line_num, row in enumerate(csv_lines):
86
+ if general_index is None and row[2] == "0":
87
+ general_index = line_num
88
+ elif character_index is None and row[2] == "4":
89
+ character_index = line_num
90
+ tags.append(row[1])
91
+
92
+ label_name = model.get_outputs()[0].name
93
+ probs = model.run([label_name], {input.name: image})[0]
94
+
95
+ result = list(zip(tags, probs[0]))
96
+
97
+ general = [item for item in result[general_index:character_index] if item[1] > threshold]
98
+ character = [item for item in result[character_index:] if item[1] > character_threshold]
99
+
100
+ all = character + general
101
+ remove = [s.strip() for s in exclude_tags.lower().split(",")]
102
+ all = [tag for tag in all if tag[0] not in remove]
103
+
104
+ res = ", ".join((item[0].replace("(", "\\(").replace(")", "\\)") for item in all)).replace('_', ' ')
105
+ return res
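Typical usage, as wired up in `gradio_app.py`'s `interrogator_process`: pass an RGB numpy array or an image path and get back a comma-separated tag string, with character tags listed before general tags and underscores replaced by spaces.

# illustrative call; the example output is hypothetical
tags = default_interrogator('./imgs/1.jpg', threshold=0.35, exclude_tags="simple background")
print(tags)   # e.g. "1girl, solo, long hair, ..."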