diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..b1f5e1192e45608a613c85c7b9b384ea47843dcc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +* text=auto eol=lf +Cargo.lock linguist-generated=false diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..52f14570e752f391177918d550765e894e8c53ca --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,13 @@ + + +* Closes #ISSUE_NUMBER diff --git a/.github/workflows/labels.yml b/.github/workflows/labels.yml new file mode 100644 index 0000000000000000000000000000000000000000..d1f4cf5660622bf54a0894aba057b59eb687ea8e --- /dev/null +++ b/.github/workflows/labels.yml @@ -0,0 +1,27 @@ +# Copied from https://github.com/rerun-io/rerun_template + +# https://github.com/marketplace/actions/require-labels +# Check for existence of labels +# See all our labels at https://github.com/rerun-io/rerun/issues/labels + +name: PR Labels + +on: + pull_request: + types: + - opened + - synchronize + - reopened + - labeled + - unlabeled + +jobs: + label: + runs-on: ubuntu-latest + steps: + - name: Check for a "do-not-merge" label + uses: mheap/github-action-required-labels@v3 + with: + mode: exactly + count: 0 + labels: "do-not-merge" diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml new file mode 100644 index 0000000000000000000000000000000000000000..8bb984c0bbae9fbf6476d4038b734a41351b7c3c --- /dev/null +++ b/.github/workflows/links.yml @@ -0,0 +1,29 @@ +# Copied from https://github.com/rerun-io/rerun_template +on: [push, pull_request] + +name: Link checker + +jobs: + link-checker: + name: Check links + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Restore link checker cache + uses: actions/cache@v3 + with: + path: .lycheecache + key: cache-lychee-${{ github.sha }} + restore-keys: cache-lychee- + + # Check 
https://github.com/lycheeverse/lychee on how to run locally. + - name: Link Checker + id: lychee + uses: lycheeverse/lychee-action@v1.9.0 + with: + fail: true + lycheeVersion: "0.14.3" + # When given a directory, lychee checks only markdown, html and text files, everything else we have to glob in manually. + args: | + --base . --cache --max-cache-age 1d . "**/*.rs" "**/*.toml" "**/*.hpp" "**/*.cpp" "**/CMakeLists.txt" "**/*.py" "**/*.yml" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 0000000000000000000000000000000000000000..f1e3bafb802235e2f359b270da4355f9aea9e616 --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,21 @@ +# Copied from https://github.com/rerun-io/rerun_template +# Disabled since this contains a lot of non-conforming code from the original repository +on: [] + +name: Python + +jobs: + python-check: + name: Python + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: prefix-dev/setup-pixi@v0.5.2 + with: + pixi-version: v0.19.0 + cache: true + + - run: pixi run py-fmt-check + + - run: pixi run py-lint diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml new file mode 100644 index 0000000000000000000000000000000000000000..3055f873e48968bc50b801406f2996f9b8ecccb3 --- /dev/null +++ b/.github/workflows/typos.yml @@ -0,0 +1,19 @@ +# Copied from https://github.com/rerun-io/rerun_template + +# https://github.com/crate-ci/typos +# Add exceptions to `.typos.toml` +# install and run locally: cargo install typos-cli && typos + +name: Spell Check +on: [pull_request] + +jobs: + run: + name: Spell Check + runs-on: ubuntu-latest + steps: + - name: Checkout Actions Repository + uses: actions/checkout@v4 + + - name: Check spelling of entire workspace + uses: crate-ci/typos@master diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ca13e4940bd077b82a04a8ae2f1efabc3397158a --- /dev/null +++ b/.gitignore @@ -0,0 +1,23
@@ +# Mac stuff: +.DS_Store + +# C++ build directory +build + +# Rust compile target directories: +target +target_ra +target_wasm + +# https://github.com/lycheeverse/lychee +.lycheecache + +# Pixi environment +.pixi + +# Python stuff: +__pycache__ +.mypy_cache +.ruff_cache +venv +.python-version diff --git a/.mypy.ini b/.mypy.ini new file mode 100644 index 0000000000000000000000000000000000000000..4980320bac80f0870c6899a92715fd9dcfa132ac --- /dev/null +++ b/.mypy.ini @@ -0,0 +1,11 @@ +[mypy] +files = . +exclude = build +namespace_packages = True +show_error_codes = True +strict = True +enable_error_code = redundant-expr, truthy-bool, ignore-without-code +; plugins = numpy.typing.mypy_plugin +ignore_missing_imports = True +no_implicit_reexport = False +disallow_untyped_calls = False diff --git a/.typos.toml b/.typos.toml new file mode 100644 index 0000000000000000000000000000000000000000..aeeedafe182faa915127a01547c4f73b51359e5b --- /dev/null +++ b/.typos.toml @@ -0,0 +1,6 @@ +# https://github.com/crate-ci/typos +# install: cargo install typos-cli +# run: typos + +[default.extend-words] +teh = "teh" # part of @teh-cmc diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000000000000000000000000000000000000..f15bc7d665a48ea9d9a21dc316117388bbbbb7c1 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,26 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=827846 + // for the documentation about the extensions.json format + "recommendations": [ + "charliermarsh.ruff", + "gaborv.flatbuffers", + "github.vscode-github-actions", + "josetr.cmake-language-support-vscode", + "ms-python.mypy-type-checker", + "ms-python.python", + "ms-vscode.cmake-tools", + "ms-vscode.cpptools-extension-pack", + "ms-vsliveshare.vsliveshare", + "polymeilex.wgsl", + "rust-lang.rust-analyzer", + "serayuzgur.crates", + "streetsidesoftware.code-spell-checker", + "tamasfe.even-better-toml", + "vadimcn.vscode-lldb", + "wayou.vscode-todo-highlight", + 
"webfreak.debug", + "xaver.clang-format", // C++ formatter + "zxh404.vscode-proto3", + "esbenp.prettier-vscode" + ] +} diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000000000000000000000000000000000..18dac88142792f18df793abeebc6d33e19d00d0f --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,56 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + // Python + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + }, + // Rust: + { + "name": "Debug 'PROJ_NAME'", + "type": "lldb", + "request": "launch", + "cargo": { + "args": [ + "build" + ], + "filter": { + "name": "PROJ_NAME", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}", + "env": { + "RUST_LOG": "debug" + } + }, + { + "name": "Launch Rust tests", + "type": "lldb", + "request": "launch", + "cargo": { + "args": [ + "test", + "--no-run", + "--lib", + "--all-features" + ], + "filter": { + "kind": "lib" + } + }, + "cwd": "${workspaceFolder}", + "env": { + "RUST_LOG": "debug" + } + }, + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000000000000000000000000000000..afce5d7cb66a2867fd9b6cb5da5323b6b6d4d9d3 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,52 @@ +{ + "editor.formatOnSave": true, + "editor.semanticTokenColorCustomizations": { + "rules": { + "*.unsafe:rust": "#eb5046" + } + }, + "files.autoGuessEncoding": true, + "files.insertFinalNewline": true, + "files.trimTrailingWhitespace": true, + // don't share a cargo lock with rust-analyzer. 
+ // see https://github.com/rerun-io/rerun/pull/519 for rationale + "rust-analyzer.check.overrideCommand": [ + "cargo", + "clippy", + "--target-dir=target_ra", + "--workspace", + "--message-format=json", + "--all-targets", + "--all-features" + ], + "rust-analyzer.cargo.buildScripts.overrideCommand": [ + "cargo", + "check", + "--quiet", + "--target-dir=target_ra", + "--workspace", + "--message-format=json", + "--all-targets", + "--all-features", + ], + // Our build scripts are generating code. + // Having Rust Analyzer do this while doing other builds can lead to catastrophic failures. + // INCLUDING attempts to publish a new release! + "rust-analyzer.cargo.buildScripts.enable": false, + "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", // Use cmake-tools to grab configs. + "C_Cpp.autoAddFileAssociations": false, + "cmake.buildDirectory": "${workspaceRoot}/build/debug", + "cmake.generator": "Ninja", // Use Ninja, just like we do in our just/pixi command. + "rust-analyzer.showUnlinkedFileNotification": false, + "ruff.format.args": [ + "--config=pyproject.toml" + ], + "ruff.lint.args": [ + "--config=pyproject.toml" + ], + "prettier.requireConfig": true, + "prettier.configPath": ".prettierrc.toml", + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff" + }, +} diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..1fbf07ca34c3ac0b88fdfde2f045f678769d52f6 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,132 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. 
+ +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. 
+Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +opensource@rerun.io. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. 
No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000000000000000000000000000000000000..11069edd79019f7dafbe3138841cf289209270dd --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000000000000000000000000000000000000..3f6d1ede3c9d2ff700f95d5de076fc1b73f25112 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2024 Rerun Technologies AB + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ec24480c42be6c49ced2472c6f2871e7b9131e6 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +--- +title: Vista +emoji: 🚗 +colorFrom: blue +colorTo: green +sdk: gradio +sdk_version: 4.36.1 +app_file: app.py +pinned: false +license: apache-2.0 +--- + +# Vista: A Generalizable Driving World Model with High Fidelity and Versatile Controllability + +https://github.com/rerun-io/hf-example-vista/assets/9785832/0b9a01ca-90a2-4b36-98fc-a7a7b378fd54 + +[Shenyuan Gao](https://github.com/Little-Podi), [Jiazhi Yang](https://scholar.google.com/citations?user=Ju7nGX8AAAAJ&hl=en), [Li Chen](https://scholar.google.com/citations?user=ulZxvY0AAAAJ&hl=en), [Kashyap Chitta](https://kashyap7x.github.io/), [Yihang Qiu](https://scholar.google.com/citations?user=qgRUOdIAAAAJ&hl=en), [Andreas Geiger](https://www.cvlibs.net/), [Jun Zhang](https://eejzhang.people.ust.hk/), [Hongyang Li](https://lihongyang.info/) + +This is a demo of the [Vista model](https://github.com/OpenDriveLab/Vista), a driving world model that can be used to simulate a variety of driving scenarios. This demo uses [Rerun](https://rerun.io/)'s custom [gradio component](https://www.gradio.app/custom-components/gallery?id=radames%2Fgradio_rerun) to livestream the model's output and show intermediate results. 
+ +[📜technical report](https://arxiv.org/abs/2405.17398), [🎬video demos](https://vista-demo.github.io/), [🤗model weights](https://huggingface.co/OpenDriveLab/Vista) + +Please refer to the [original repository](https://github.com/OpenDriveLab/Vista) for the original code base and README. + +You can try the example on Rerun's HuggingFace space [here](https://huggingface.co/spaces/rerun/Vista). + +## Run the example locally +To run this example locally use the following command (you need a GPU with at least 20GB of memory, tested with an RTX 4090): +```bash +pixi run example +``` + +You can specify the first image, the number of generated segments, and the number of diffusion steps per segment: +```bash +pixi run example --img-path "example_images/streetview.jpg" --num-segments 10 --num-steps 100 +``` + +To see all other options, use the following: +```bash +pixi run example --help +``` diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..111f7b3422cf6598484af6ecb480fc5e27a56ea6 --- /dev/null +++ b/app.py @@ -0,0 +1,127 @@ +"""Gradio interface for Vista model.""" +from __future__ import annotations + +import glob +import os +import queue +import threading + +import gradio as gr +import gradio_rerun +import rerun as rr +import spaces + +import vista + + +@spaces.GPU(duration=400) +@rr.thread_local_stream("Vista") +def generate_gradio( + first_frame_file_name: str, + n_rounds: float=3, + n_steps: float=10, + height=576, + width=1024, + n_frames=25, + cfg_scale=2.5, + cond_aug=0.0, +): + global model + + n_rounds = int(n_rounds) + n_steps = int(n_steps) + + # Use a queue to log immediately from internals + log_queue = queue.SimpleQueue() + + stream = rr.binary_stream() + + blueprint = vista.generate_blueprint(n_rounds) + rr.send_blueprint(blueprint) + yield stream.read() + + handle = threading.Thread( + target=vista.run_sampling, + args=[ + log_queue, + first_frame_file_name, + height, + width, + n_rounds, + n_frames, +
n_steps, + cfg_scale, + cond_aug, + model, + ], + ) + handle.start() + while True: + msg = log_queue.get() + if msg == "done": + break + else: + entity_path, entity, times = msg + rr.reset_time() + for timeline, time in times: + if isinstance(time, int): + rr.set_time_sequence(timeline, time) + else: + rr.set_time_seconds(timeline, time) + rr.log(entity_path, entity) + yield stream.read() + handle.join() + + +model = vista.create_model() + +with gr.Blocks(css="style.css") as demo: + gr.Markdown( + """ + # Vista: A Generalizable Driving World Model with High Fidelity and Versatile Controllability + + [Shenyuan Gao](https://github.com/Little-Podi), [Jiazhi Yang](https://scholar.google.com/citations?user=Ju7nGX8AAAAJ&hl=en), [Li Chen](https://scholar.google.com/citations?user=ulZxvY0AAAAJ&hl=en), [Kashyap Chitta](https://kashyap7x.github.io/), [Yihang Qiu](https://scholar.google.com/citations?user=qgRUOdIAAAAJ&hl=en), [Andreas Geiger](https://www.cvlibs.net/), [Jun Zhang](https://eejzhang.people.ust.hk/), [Hongyang Li](https://lihongyang.info/) + + This is a demo of the [Vista model](https://github.com/OpenDriveLab/Vista), a driving world model that can be used to simulate a variety of driving scenarios. This demo uses [Rerun](https://rerun.io/)'s custom [gradio component](https://www.gradio.app/custom-components/gallery?id=radames%2Fgradio_rerun) to livestream the model's output and show intermediate results. + + [📜technical report](https://arxiv.org/abs/2405.17398), [🎬video demos](https://vista-demo.github.io/), [🤗model weights](https://huggingface.co/OpenDriveLab/Vista) + + Note that the GPU time is limited to 400 seconds per run. If you need more time, you can run the model locally or on your own server. 
+ """ + ) + first_frame = gr.Image(sources="upload", type="filepath") + example_dir_path = os.path.join(os.path.dirname(__file__), "example_images") + example_file_paths = sorted(glob.glob(os.path.join(example_dir_path, "*.*"))) + example_gallery = gr.Examples( + examples=example_file_paths, + inputs=first_frame, + cache_examples=False, + ) + + btn = gr.Button("Generate video") + num_rounds = gr.Slider( + label="Segments", + info="Number of 25 frame segments to generate. Higher values lead to longer videos. Try to keep the product of segments and steps below 30 to avoid running out of time.", + minimum=1, + maximum=5, + value=2, + step=1 + ) + num_steps = gr.Slider( + label="Diffusion Steps", + info="Number of diffusion steps per segment. Higher values lead to more detailed videos. Try to keep the product of segments and steps below 30 to avoid running out of time.", + minimum=1, + maximum=50, + value=15, + step=1 + ) + + with gr.Row(): + viewer = gradio_rerun.Rerun(streaming=True) + btn.click( + generate_gradio, + inputs=[first_frame, num_rounds, num_steps], + outputs=[viewer], + ) + +demo.launch() diff --git a/example_images/nus-0.jpg b/example_images/nus-0.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bebe2b2d7b05539b0f8a677b8fdd0e63fa4a73bf Binary files /dev/null and b/example_images/nus-0.jpg differ diff --git a/example_images/nus-1.jpg b/example_images/nus-1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..89a0126443ed7c78d05bb11611c57cfcc636cdad Binary files /dev/null and b/example_images/nus-1.jpg differ diff --git a/example_images/nus-2.jpg b/example_images/nus-2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5123fabb2f954bc80222036e66408126f32ad142 Binary files /dev/null and b/example_images/nus-2.jpg differ diff --git a/example_images/nus-3.jpg b/example_images/nus-3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c05af4d04d617505f88f5fd9477af25f5075bf31 
Binary files /dev/null and b/example_images/nus-3.jpg differ diff --git a/example_images/nus-4.jpg b/example_images/nus-4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..098ebe9ea97854a93d4cc6af4f56f6994cc4f16e Binary files /dev/null and b/example_images/nus-4.jpg differ diff --git a/example_images/streetview.jpg b/example_images/streetview.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6d97d916860b6379e1bb95330e4fd70a3138e631 Binary files /dev/null and b/example_images/streetview.jpg differ diff --git a/lychee.toml b/lychee.toml new file mode 100644 index 0000000000000000000000000000000000000000..212d70e3b1ade7f502a49608c87e02432e897e77 --- /dev/null +++ b/lychee.toml @@ -0,0 +1,82 @@ +# Copied from https://github.com/rerun-io/rerun_template + +################################################################################ +# Config for the link checker lychee. +# +# Download & learn more at: +# https://github.com/lycheeverse/lychee +# +# Example config: +# https://github.com/lycheeverse/lychee/blob/master/lychee.example.toml +# +# Run `lychee . --dump` to list all found links that are being checked. +# +# Note that by default lychee will only check markdown and html files, +# to check any other files you have to point to them explicitly, e.g.: +# `lychee **/*.rs` +# To make things worse, `exclude_path` is ignored for these globs, +# so local runs with lots of gitignored files will be slow. +# (https://github.com/lycheeverse/lychee/issues/1405) +# +# This unfortunately doesn't list anything for non-glob checks. +################################################################################ + +# Maximum number of concurrent link checks. +# Workaround for "too many open files" error on MacOS, see https://github.com/lycheeverse/lychee/issues/1248 +max_concurrency = 32 + +# Check links inside `<code>` and `<pre>` blocks as well as Markdown code blocks.
+include_verbatim = true
+
+# Proceed for server connections considered insecure (invalid TLS).
+insecure = true
+
+# Exclude these filesystem paths from getting checked.
+exclude_path = [
+  # Unfortunately lychee doesn't yet read .gitignore https://github.com/lycheeverse/lychee/issues/1331
+  # The following entries are there because of that:
+  ".git",
+  "__pycache__",
+  "_deps/",
+  ".pixi",
+  "build",
+  "target_ra",
+  "target_wasm",
+  "target",
+  "venv",
+]
+
+# Exclude URLs and mail addresses from checking (supports regex).
+exclude = [
+  # Skip speculative links
+  '.*?speculative-link',
+
+  # Strings with replacements.
+  '/__VIEWER_VERSION__/', # Replacement variable __VIEWER_VERSION__.
+  '/\$',                  # Replacement variable $.
+  '/GIT_HASH/',           # Replacement variable GIT_HASH.
+  '\{\}',                 # Ignore links with string interpolation.
+  '\$relpath\^',          # Relative paths as used by rerun_cpp's doc header.
+  '%7B.+%7D',             # Ignore strings that look like ready-to-use links but contain replacement strings. The URL escaping is for '{.+}' (this seems to be needed for html embedded urls since lychee assumes they use this encoding).
+  '%7B%7D',               # Ignore links with string interpolation, escaped variant.
+
+  # Local links that require further setup.
+  'http://127.0.0.1',
+  'http://localhost',
+  'recording:/',      # rrd recording link.
+  'ws:/',
+  're_viewer.js',     # Build artifact that html is linking to.
+
+  # Api endpoints.
+  'https://fonts.googleapis.com/', # Font API entrypoint, not a link.
+  'https://fonts.gstatic.com/',    # Font API entrypoint, not a link.
+  'https://tel.rerun.io/',         # Analytics endpoint.
+
+  # Avoid rate limiting.
+  'https://crates.io/crates/.*',                  # Avoid crates.io rate-limiting
+  'https://github.com/rerun-io/rerun/commit/\.*', # Ignore links to our own commits (typically in changelog).
+  'https://github.com/rerun-io/rerun/pull/\.*',   # Ignore links to our own pull requests (typically in changelog).
+
+  # Used in rerun_template repo until the user search-replaces `new_repo_name`
+  'https://github.com/rerun-io/new_repo_name',
+]
diff --git a/main.py b/main.py
new file mode 100755
index 0000000000000000000000000000000000000000..0526a0c3571e2868e8fa7cc0db73a8c17212a249
--- /dev/null
+++ b/main.py
@@ -0,0 +1,87 @@
+"""Command line interface for generating videos from the model."""
+from __future__ import annotations
+
+import argparse
+import queue
+import threading
+
+import rerun as rr
+
+import vista
+
+
+def generate_local(
+    first_frame_file_name: str,
+    height=576,
+    width=1024,
+    n_rounds=4,
+    n_frames=25,
+    n_steps=10,
+    cfg_scale=2.5,
+    cond_aug=0.0,
+):
+    # Use a queue to log immediately from internals
+    log_queue = queue.SimpleQueue()
+
+    handle = threading.Thread(
+        target=vista.run_sampling,
+        args=[
+            log_queue,
+            first_frame_file_name,
+            height,
+            width,
+            n_rounds,
+            n_frames,
+            n_steps,
+            cfg_scale,
+            cond_aug,
+        ],
+    )
+    handle.start()
+    while True:
+        msg = log_queue.get()
+        if msg == "done":
+            break
+        else:
+            entity_path, entity, times = msg
+            rr.reset_time()
+            for timeline, time in times:
+                if isinstance(time, int):
+                    rr.set_time_sequence(timeline, time)
+                else:
+                    rr.set_time_seconds(timeline, time)
+            rr.log(entity_path, entity)
+    handle.join()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate video conditioned on a single image using the Vista model."
+    )
+    parser.add_argument(
+        "--img-path",
+        type=str,
+        help="Path to image used as input for Canny edge detector.",
+        default="./example_images/nus-0.jpg",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        help="Number of diffusion steps per image. Recommended range: 10-50. Higher values result in more detailed images and less blurry results.",
+        default=20,
+    )
+    parser.add_argument(
+        "--num-segments",
+        type=int,
+        help="Number of segments to generate. Each segment consists of 25 frames.",
+        default=3,
+    )
+    rr.script_add_args(parser)
+    args = parser.parse_args()
+    rr.script_setup(
+        args,
+        "rerun_example_vista",
+        default_blueprint=vista.generate_blueprint(args.num_segments),
+    )
+
+    generate_local(args.img_path, n_steps=args.num_steps, n_rounds=args.num_segments)
diff --git a/pixi.lock b/pixi.lock
new file mode 100644
index 0000000000000000000000000000000000000000..55d8af28f805d67a06178ec30b58f4979efe09e8
--- /dev/null
+++ b/pixi.lock
@@ -0,0 +1,2130 @@
+version: 5
+environments:
+  default:
+    channels:
+    - url: https://conda.anaconda.org/conda-forge/
+    packages:
+      linux-64:
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hc6cd4ac_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/mypy-1.8.0-py310h2372a71_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-6.0.0-py310hc51659f_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.3.7-py310h3d77a66_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/types-requests-2.31.0.20240406-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.11.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/typos-1.20.9-he8a937b_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2
+      linux-aarch64:
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-python-1.1.0-py310hbb3657e_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h31becfc_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ca-certificates-2024.2.2-hcefe29a_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.40-h2d8c526_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.4.2-h3557bc0_5.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-13.2.0-hf8544c7_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-13.2.0-hf8544c7_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libnsl-2.0.1-h31becfc_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.45.2-h194ca79_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-ng-13.2.0-h9a76618_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.38.1-hb4cce97_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcrypt-4.4.36-h31becfc_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.2.13-h31becfc_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mypy-1.8.0-py310hb299538_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.4.20240210-h0425590_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.2.1-h31becfc_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/psutil-6.0.0-py310hb52b2da_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.10.14-hbbe8eec_0_cpython.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python_abi-3.10-4_cp310.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.2-h8fc344f_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ruff-0.3.7-py310hf6424b7_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-h194ca79_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/types-requests-2.31.0.20240406-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.11.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/typos-1.20.9-h1d8f897_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xz-5.2.6-h9cdd2b7_0.tar.bz2
+      osx-64:
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/brotli-python-1.1.0-py310h9e9d8ca_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h10d778d_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.2.2-h8857fd0_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/libcxx-16.0.6-hd57cbcb_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.45.2-h92b6c6a_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.2.13-h8a1eda9_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/mypy-1.8.0-py310hb372a2b_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.4.20240210-h73e2aa4_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.2.1-hd75f5a5_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/psutil-6.0.0-py310h936d840_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/python-3.10.14-h00d2728_0_cpython.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/python_abi-3.10-4_cp310.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/ruff-0.3.7-py310hdac29b7_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/types-requests-2.31.0.20240406-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.11.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/typos-1.20.9-h11a7dfb_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2
+      osx-arm64:
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.1.0-py310h1253130_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h93a5062_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.2.2-hf0a4a13_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libcxx-16.0.6-h4653b0c_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.45.2-h091b4b1_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.2.13-h53f4e23_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/mypy-1.8.0-py310hd125d64_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.4.20240210-h078ce10_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.2.1-h0d3ecfb_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/psutil-6.0.0-py310ha6dd24b_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.10.14-h2469fbe_0_cpython.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python_abi-3.10-4_cp310.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ruff-0.3.7-py310h81561d7_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/types-requests-2.31.0.20240406-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.11.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/typos-1.20.9-h5ef7bb8_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2
+      win-64:
+      - conda: https://conda.anaconda.org/conda-forge/win-64/brotli-python-1.1.0-py310h00ffb61_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-hcfcfb64_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.2.2-h56e8100_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.45.2-hcfcfb64_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.2.13-hcfcfb64_5.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gmp-6.1.0-2.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/msys2-conda-epoch-20160418-1.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/mypy-1.8.0-py310h8d17308_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.2.1-hcfcfb64_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/psutil-6.0.0-py310ha8f682b_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh0701188_6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.10.14-h4de0772_0_cpython.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/python_abi-3.10-4_cp310.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/ruff-0.3.7-py310h298983d_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/types-requests-2.31.0.20240406-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.11.0-pyha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/typos-1.20.9-h7f3b576_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_0.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.1-pyhd8ed1ab_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-hcf57466_18.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.38.33130-h82b7239_18.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.38.33130-hcb4865c_18.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/win_inet_pton-1.1.0-pyhd8ed1ab_6.tar.bz2
+      - conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2
+packages:
+- kind: conda
+  name: _libgcc_mutex
+  version: '0.1'
+  build: conda_forge
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
+  sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726
+  md5: d7c89558ba9fa0495403155b64376d81
+  license: None
+  size: 2562
+  timestamp: 1578324546067
+- kind: conda
+  name: _openmp_mutex
+  version: '4.5'
+  build: 2_gnu
+  build_number: 16
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2
+  sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22
+  md5: 73aaf86a425cc6e73fcf236a5a46396d
+  depends:
+  - _libgcc_mutex 0.1 conda_forge
+  - libgomp >=7.5.0
+  constrains:
+  - openmp_impl 9999
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 23621
+  timestamp: 1650670423406
+- kind: conda
+  name: _openmp_mutex
+  version: '4.5'
+  build: 2_gnu
+  build_number: 16
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2
+  sha256: 3702bef2f0a4d38bd8288bbe54aace623602a1343c2cfbefd3fa188e015bebf0
+  md5: 6168d71addc746e8f2b8d57dfd2edcea
+  depends:
+  - libgomp >=7.5.0
+  constrains:
+  - openmp_impl 9999
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 23712
+  timestamp: 1650670790230
+- kind: conda
+  name: brotli-python
+  version: 1.1.0
+  build: py310h00ffb61_1
+  build_number: 1
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/brotli-python-1.1.0-py310h00ffb61_1.conda
+  sha256: 8de77cf62a653dd6ffe19927b92c421f5fa73c078d7799181f5211a1bac2883b
+  md5: 42bfbc1d41cbe2696a3c9d8b0342324f
+  depends:
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  constrains:
+  - libbrotlicommon 1.1.0 hcfcfb64_1
+  license: MIT
+  license_family: MIT
+  size: 321672
+  timestamp: 1695990897641
+- kind: conda
+  name: brotli-python
+  version: 1.1.0
+  build: py310h1253130_1
+  build_number: 1
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.1.0-py310h1253130_1.conda
+  sha256: dab21e18c0275bfd93a09b751096998485677ed17c2e2d08298bc5b43c10bee1
+  md5: 26fab7f65a80fff9f402ec3b7860b88a
+  depends:
+  - libcxx >=15.0.7
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  constrains:
+  - libbrotlicommon 1.1.0 hb547adb_1
+  license: MIT
+  license_family: MIT
+  size: 344275
+  timestamp: 1695990848681
+- kind: conda
+  name: brotli-python
+  version: 1.1.0
+  build: py310h9e9d8ca_1
+  build_number: 1
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/brotli-python-1.1.0-py310h9e9d8ca_1.conda
+  sha256: 57d66ca3e072b889c94cfaf56eb7e1794d3b1b3179bd475a4edef50a03359354
+  md5: 2362e323293e7699cf1e621d502f86d6
+  depends:
+  - libcxx >=15.0.7
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  constrains:
+  - libbrotlicommon 1.1.0 h0dc2134_1
+  license: MIT
+  license_family: MIT
+  size: 367037
+  timestamp: 1695990378635
+- kind: conda
+  name: brotli-python
+  version: 1.1.0
+  build: py310hbb3657e_1
+  build_number: 1
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-python-1.1.0-py310hbb3657e_1.conda
+  sha256: 192f2537ca30953653375abd851f1ccf8b849988e3195e0189aa47384a3a88d9
+  md5: 5ed52d1d3c480022fe67ae00d1cab792
+  depends:
+  - libgcc-ng >=12
+  - libstdcxx-ng >=12
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  constrains:
+  - libbrotlicommon 1.1.0 h31becfc_1
+  license: MIT
+  license_family: MIT
+  size: 355646
+  timestamp: 1695990521531
+- kind: conda
+  name: brotli-python
+  version: 1.1.0
+  build: py310hc6cd4ac_1
+  build_number: 1
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hc6cd4ac_1.conda
+  sha256: e22268d81905338570786921b3def88e55f9ed6d0ccdd17d9fbae31a02fbef69
+  md5: 1f95722c94f00b69af69a066c7433714
+  depends:
+  - libgcc-ng >=12
+  - libstdcxx-ng >=12
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  constrains:
+  - libbrotlicommon 1.1.0 hd590300_1
+  license: MIT
+  license_family: MIT
+  size: 349397
+  timestamp: 1695990295884
+- kind: conda
+  name: bzip2
+  version: 1.0.8
+  build: h10d778d_5
+  build_number: 5
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h10d778d_5.conda
+  sha256: 61fb2b488928a54d9472113e1280b468a309561caa54f33825a3593da390b242
+  md5: 6097a6ca9ada32699b5fc4312dd6ef18
+  license: bzip2-1.0.6
+  license_family: BSD
+  size: 127885
+  timestamp: 1699280178474
+- kind: conda
+  name: bzip2
+  version: 1.0.8
+  build: h31becfc_5
+  build_number: 5
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h31becfc_5.conda
+  sha256: b9f170990625cb1eeefaca02e091dc009a64264b077166d8ed7aeb7a09e923b0
+  md5: a64e35f01e0b7a2a152eca87d33b9c87
+  depends:
+  - libgcc-ng >=12
+  license: bzip2-1.0.6
+  license_family: BSD
+  size: 189668
+  timestamp: 1699280060686
+- kind: conda
+  name: bzip2
+  version: 1.0.8
+  build: h93a5062_5
+  build_number: 5
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h93a5062_5.conda
+  sha256: bfa84296a638bea78a8bb29abc493ee95f2a0218775642474a840411b950fe5f
+  md5: 1bbc659ca658bfd49a481b5ef7a0f40f
+  license: bzip2-1.0.6
+  license_family: BSD
+  size: 122325
+  timestamp: 1699280294368
+- kind: conda
+  name: bzip2
+  version: 1.0.8
+  build: hcfcfb64_5
+  build_number: 5
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-hcfcfb64_5.conda
+  sha256: ae5f47a5c86fd6db822931255dcf017eb12f60c77f07dc782ccb477f7808aab2
+  md5: 26eb8ca6ea332b675e11704cce84a3be
+  depends:
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  license: bzip2-1.0.6
+  license_family: BSD
+  size: 124580
+  timestamp: 1699280668742
+- kind: conda
+  name: bzip2
+  version: 1.0.8
+  build: hd590300_5
+  build_number: 5
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda
+  sha256: 242c0c324507ee172c0e0dd2045814e746bb303d1eb78870d182ceb0abc726a8
+  md5: 69b8b6202a07720f448be700e300ccf4
+  depends:
+  - libgcc-ng >=12
+  license: bzip2-1.0.6
+  license_family: BSD
+  size: 254228
+  timestamp: 1699279927352
+- kind: conda
+  name: ca-certificates
+  version: 2024.2.2
+  build: h56e8100_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2024.2.2-h56e8100_0.conda
+  sha256: 4d587088ecccd393fec3420b64f1af4ee1a0e6897a45cfd5ef38055322cea5d0
+  md5: 63da060240ab8087b60d1357051ea7d6
+  license: ISC
+  size: 155886
+  timestamp: 1706843918052
+- kind: conda
+  name: ca-certificates
+  version: 2024.2.2
+  build: h8857fd0_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.2.2-h8857fd0_0.conda
+  sha256: 54a794aedbb4796afeabdf54287b06b1d27f7b13b3814520925f4c2c80f58ca9
+  md5: f2eacee8c33c43692f1ccfd33d0f50b1
+  license: ISC
+  size: 155665
+  timestamp: 1706843838227
+- kind: conda
+  name: ca-certificates
+  version: 2024.2.2
+  build: hbcca054_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda
+  sha256: 91d81bfecdbb142c15066df70cc952590ae8991670198f92c66b62019b251aeb
+  md5: 2f4327a1cbe7f022401b236e915a5fef
+  license: ISC
+  size: 155432
+  timestamp: 1706843687645
+- kind: conda
+  name: ca-certificates
+  version: 2024.2.2
+  build: hcefe29a_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/ca-certificates-2024.2.2-hcefe29a_0.conda
+  sha256: 0f6b34d835e26e5fa97cca4985dc46f0aba551a3a23f07c6f13cca2542b8c642
+  md5: 57c226edb90c4e973b9b7503537dd339
+  license: ISC
+  size: 155738
+  timestamp: 1706845723412
+- kind: conda
+  name: ca-certificates
+  version: 2024.2.2
+  build: hf0a4a13_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/ca-certificates-2024.2.2-hf0a4a13_0.conda
+  sha256: 49bc3439816ac72d0c0e0f144b8cc870fdcc4adec2e861407ec818d8116b2204
+  md5: fb416a1795f18dcc5a038bc2dc54edf9
+  license: ISC
+  size: 155725
+  timestamp: 1706844034242
+- kind: conda
+  name: ld_impl_linux-64
+  version: '2.40'
+  build: h41732ed_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda
+  sha256: f6cc89d887555912d6c61b295d398cff9ec982a3417d38025c45d5dd9b9e79cd
+  md5: 7aca3059a1729aa76c597603f10b0dd3
+  constrains:
+  - binutils_impl_linux-64 2.40
+  license: GPL-3.0-only
+  license_family: GPL
+  size: 704696
+  timestamp: 1674833944779
+- kind: conda
+  name: ld_impl_linux-aarch64
+  version: '2.40'
+  build: h2d8c526_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.40-h2d8c526_0.conda
+  sha256: 1ba06e8645094b340b4aee23603a6abb1b0383788180e65f3de34e655c5f577c
+  md5: 16246d69e945d0b1969a6099e7c5d457
+  constrains:
+  - binutils_impl_linux-aarch64 2.40
+  license: GPL-3.0-only
+  license_family: GPL
+  size: 738776
+  timestamp: 1674833843183
+- kind: conda
+  name: libcxx
+  version: 16.0.6
+  build: h4653b0c_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/libcxx-16.0.6-h4653b0c_0.conda
+  sha256: 11d3fb51c14832d9e4f6d84080a375dec21ea8a3a381a1910e67ff9cedc20355
+  md5: 9d7d724faf0413bf1dbc5a85935700c8
+  license: Apache-2.0 WITH LLVM-exception
+  license_family: Apache
+  size: 1160232
+  timestamp: 1686896993785
+- kind: conda
+  name: libcxx
+  version: 16.0.6
+  build: hd57cbcb_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/libcxx-16.0.6-hd57cbcb_0.conda
+  sha256: 9063271847cf05f3a6cc6cae3e7f0ced032ab5f3a3c9d3f943f876f39c5c2549
+  md5: 7d6972792161077908b62971802f289a
+  license: Apache-2.0 WITH LLVM-exception
+  license_family: Apache
+  size: 1142172
+  timestamp: 1686896907750
+- kind: conda
+  name: libffi
+  version: 3.4.2
+  build: h0d85af4_5
+  build_number: 5
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2
+  sha256: 7a2d27a936ceee6942ea4d397f9c7d136f12549d86f7617e8b6bad51e01a941f
+  md5: ccb34fb14960ad8b125962d3d79b31a9
+  license: MIT
+  license_family: MIT
+  size: 51348
+  timestamp: 1636488394370
+- kind: conda
+  name: libffi
+  version: 3.4.2
+  build: h3422bc3_5
+  build_number: 5
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/libffi-3.4.2-h3422bc3_5.tar.bz2
+  sha256: 41b3d13efb775e340e4dba549ab5c029611ea6918703096b2eaa9c015c0750ca
+  md5: 086914b672be056eb70fd4285b6783b6
+  license: MIT
+  license_family: MIT
+  size: 39020
+  timestamp: 1636488587153
+- kind: conda
+  name: libffi
+  version: 3.4.2
+  build: h3557bc0_5
+  build_number: 5
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.4.2-h3557bc0_5.tar.bz2
+  sha256: 7e9258a102480757fe3faeb225a3ca04dffd10fecd2a958c65cdb4cdf75f2c3c
+  md5: dddd85f4d52121fab0a8b099c5e06501
+  depends:
+  - libgcc-ng >=9.4.0
+  license: MIT
+  license_family: MIT
+  size: 59450
+  timestamp: 1636488255090
+- kind: conda
+  name: libffi
+  version: 3.4.2
+  build: h7f98852_5
+  build_number: 5
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2
+  sha256: ab6e9856c21709b7b517e940ae7028ae0737546122f83c2aa5d692860c3b149e
+  md5: d645c6d2ac96843a2bfaccd2d62b3ac3
+  depends:
+  - libgcc-ng >=9.4.0
+  license: MIT
+  license_family: MIT
+  size: 58292
+  timestamp: 1636488182923
+- kind: conda
+  name: libffi
+  version: 3.4.2
+  build: h8ffe710_5
+  build_number: 5
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.2-h8ffe710_5.tar.bz2
+  sha256: 1951ab740f80660e9bc07d2ed3aefb874d78c107264fd810f24a1a6211d4b1a5
+  md5: 2c96d1b6915b408893f9472569dee135
+  depends:
+  - vc >=14.1,<15.0a0
+  - vs2015_runtime >=14.16.27012
+  license: MIT
+  license_family: MIT
+  size: 42063
+  timestamp: 1636489106777
+- kind: conda
+  name: libgcc-ng
+  version: 13.2.0
+  build: h807b86a_5
+  build_number: 5
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda
+  sha256: d32f78bfaac282cfe5205f46d558704ad737b8dbf71f9227788a5ca80facaba4
+  md5: d4ff227c46917d3b4565302a2bbb276b
+  depends:
+  - _libgcc_mutex 0.1 conda_forge
+  - _openmp_mutex >=4.5
+  constrains:
+  - libgomp 13.2.0 h807b86a_5
+  license: GPL-3.0-only WITH GCC-exception-3.1
+  license_family: GPL
+  size: 770506
+  timestamp: 1706819192021
+- kind: conda
+  name: libgcc-ng
+  version: 13.2.0
+  build: hf8544c7_5
+  build_number: 5
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-13.2.0-hf8544c7_5.conda
+  sha256: 869e44e1cf329198f5bea56c146207ed639b24b6281187159435b9499ecb3959
+  md5: dee934e640275d9e74e7bbd455f25162
+  depends:
+  - _openmp_mutex >=4.5
+  constrains:
+  - libgomp 13.2.0 hf8544c7_5
+  license: GPL-3.0-only WITH GCC-exception-3.1
+  license_family: GPL
+  size: 456795
+  timestamp: 1706820691781
+- kind: conda
+  name: libgomp
+  version: 13.2.0
+  build: h807b86a_5
+  build_number: 5
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda
+  sha256: 0d3d4b1b0134283ea02d58e8eb5accf3655464cf7159abf098cc694002f8d34e
+  md5: d211c42b9ce49aee3734fdc828731689
+  depends:
+  - _libgcc_mutex 0.1 conda_forge
+  license: GPL-3.0-only WITH GCC-exception-3.1
+  license_family: GPL
+  size: 419751
+  timestamp: 1706819107383
+- kind: conda
+  name: libgomp
+  version: 13.2.0
+  build: hf8544c7_5
+  build_number: 5
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-13.2.0-hf8544c7_5.conda
+  sha256: a98d4f242a351feb7983a28e7d6a0ca51da764c6233ea3dfc776975a3aba8a01
+  md5: 379be2f115ffb73860e4e260dd2170b7
+  license: GPL-3.0-only WITH GCC-exception-3.1
+  license_family: GPL
+  size: 423091
+  timestamp: 1706820564165
+- kind: conda
+  name: libnsl
+  version: 2.0.1
+  build: h31becfc_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libnsl-2.0.1-h31becfc_0.conda
+  sha256: fd18c2b75d7411096428d36a70b36b1a17e31f7b8956b6905d145792d49e97f8
+  md5: c14f32510f694e3185704d89967ec422
+  depends:
+  - libgcc-ng >=12
+  license: LGPL-2.1-only
+  license_family: GPL
+  size: 34501
+  timestamp: 1697358973269
+- kind: conda
+  name: libnsl
+  version: 2.0.1
+  build: hd590300_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda
+  sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6
+  md5: 30fd6e37fe21f86f4bd26d6ee73eeec7
+  depends:
+  - libgcc-ng >=12
+  license: LGPL-2.1-only
+  license_family: GPL
+  size: 33408
+  timestamp: 1697359010159
+- kind: conda
+  name: libsqlite
+  version: 3.45.2
+  build: h091b4b1_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.45.2-h091b4b1_0.conda
+  sha256: 7c234320a1a2132b9cc972aaa06bb215bb220a5b1addb0bed7a5a321c805920e
+  md5: 9d07427ee5bd9afd1e11ce14368a48d6
+  depends:
+  - libzlib >=1.2.13,<1.3.0a0
+  license: Unlicense
+  size: 825300
+  timestamp: 1710255078823
+- kind: conda
+  name: libsqlite
+  version: 3.45.2
+  build: h194ca79_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.45.2-h194ca79_0.conda
+  sha256: 0ce6de6369c04386cfc8696b1f795f425843789609ae2e04e7a1eb7deae62a8b
+  md5: bf4c96a21fbfc6a6ef6a7781a534a4e0
+  depends:
+  - libgcc-ng >=12
+  - libzlib >=1.2.13,<1.3.0a0
+  license: Unlicense
+  size: 1038462
+  timestamp: 1710253998432
+- kind: conda
+  name: libsqlite
+  version: 3.45.2
+  build: h2797004_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda
+  sha256: 8cdbeb7902729e319510a82d7c642402981818702b58812af265ef55d1315473
+  md5: 866983a220e27a80cb75e85cb30466a1
+  depends:
+  - libgcc-ng >=12
+  - libzlib >=1.2.13,<1.3.0a0
+  license: Unlicense
+  size: 857489
+  timestamp: 1710254744982
+- kind: conda
+  name: libsqlite
+  version: 3.45.2
+  build: h92b6c6a_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.45.2-h92b6c6a_0.conda
+  sha256: 320ec73a4e3dd377757a2595770b8137ec4583df4d7782472d76377cdbdc4543
+  md5: 086f56e13a96a6cfb1bf640505ae6b70
+  depends:
+  - libzlib >=1.2.13,<1.3.0a0
+  license: Unlicense
+  size: 902355
+  timestamp: 1710254991672
+- kind: conda
+  name: libsqlite
+  version: 3.45.2
+  build: hcfcfb64_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.45.2-hcfcfb64_0.conda
+  sha256: 4bb24b986550275a6d02835150d943c4c675808d05c0efc5c2a22154d007a69f
+  md5: f95359f8dc5abf7da7776ece9ef10bc5
+  depends:
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  license: Unlicense
+  size: 869606
+  timestamp: 1710255095740
+- kind: conda
+  name: libstdcxx-ng
+  version: 13.2.0
+  build: h7e041cc_5
+  build_number: 5
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda
+  sha256: a56c5b11f1e73a86e120e6141a42d9e935a99a2098491ac9e15347a1476ce777
+  md5: f6f6600d18a4047b54f803cf708b868a
+  license: GPL-3.0-only WITH GCC-exception-3.1
+  license_family: GPL
+  size: 3834139
+  timestamp: 1706819252496
+- kind: conda
+  name: libstdcxx-ng
+  version: 13.2.0
+  build: h9a76618_5
+  build_number: 5
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-ng-13.2.0-h9a76618_5.conda
+  sha256: c209f23a8a497fc87107a68b6bbc8d2089cf15fd4015b558dfdce63544379b05
+  md5: 1b79d37dce0fad96bdf3de03925f43b4
+  license: GPL-3.0-only WITH GCC-exception-3.1
+  license_family: GPL
+  size: 3752658
+  timestamp: 1706820778418
+- kind: conda
+  name: libuuid
+  version: 2.38.1
+  build: h0b41bf4_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda
+  sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18
+  md5: 40b61aab5c7ba9ff276c41cfffe6b80b
+  depends:
+  - libgcc-ng >=12
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 33601
+  timestamp: 1680112270483
+- kind: conda
+  name: libuuid
+  version: 2.38.1
+  build: hb4cce97_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.38.1-hb4cce97_0.conda
+  sha256: 616277b0c5f7616c2cdf36f6c316ea3f9aa5bb35f2d4476a349ab58b9b91675f
+  md5: 000e30b09db0b7c775b21695dff30969
+  depends:
+  - libgcc-ng >=12
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 35720
+  timestamp: 1680113474501
+- kind: conda
+  name: libxcrypt
+  version: 4.4.36
+  build: h31becfc_1
+  build_number: 1
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcrypt-4.4.36-h31becfc_1.conda
+  sha256: 6b46c397644091b8a26a3048636d10b989b1bf266d4be5e9474bf763f828f41f
+  md5: b4df5d7d4b63579d081fd3a4cf99740e
+  depends:
+  - libgcc-ng >=12
+  license: LGPL-2.1-or-later
+  size: 114269
+  timestamp: 1702724369203
+- kind: conda
+  name: libxcrypt
+  version: 4.4.36
+  build: hd590300_1
+  build_number: 1
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda
+  sha256: 6ae68e0b86423ef188196fff6207ed0c8195dd84273cb5623b85aa08033a410c
+  md5: 5aa797f8787fe7a17d1b0821485b5adc
+  depends:
+  - libgcc-ng >=12
+  license: LGPL-2.1-or-later
+  size: 100393
+  timestamp: 1702724383534
+- kind: conda
+  name: libzlib
+  version: 1.2.13
+  build: h31becfc_5
+  build_number: 5
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.2.13-h31becfc_5.conda
+  sha256: aeeefbb61e5e8227e53566d5e42dbb49e120eb99109996bf0dbfde8f180747a7
+  md5: b213aa87eea9491ef7b129179322e955
+  depends:
+  - libgcc-ng >=12
+  constrains:
+  - zlib 1.2.13 *_5
+  license: Zlib
+  license_family: Other
+  size: 67036
+  timestamp: 1686575148440
+- kind: conda
+  name: libzlib
+  version: 1.2.13
+  build: h53f4e23_5
+  build_number: 5
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.2.13-h53f4e23_5.conda
+  sha256: ab1c8aefa2d54322a63aaeeefe9cf877411851738616c4068e0dccc66b9c758a
+  md5: 1a47f5236db2e06a320ffa0392f81bd8
+  constrains:
+  - zlib 1.2.13 *_5
+  license: Zlib
+  license_family: Other
+  size: 48102
+  timestamp: 1686575426584
+- kind: conda
+  name: libzlib
+  version: 1.2.13
+  build: h8a1eda9_5
+  build_number: 5
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.2.13-h8a1eda9_5.conda
+  sha256: fc58ad7f47ffea10df1f2165369978fba0a1cc32594aad778f5eec725f334867
+  md5: 4a3ad23f6e16f99c04e166767193d700
+  constrains:
+  - zlib 1.2.13 *_5
+  license: Zlib
+  license_family: Other
+  size: 59404
+  timestamp: 1686575566695
+- kind: conda
+  name: libzlib
+  version: 1.2.13
+  build: hcfcfb64_5
+  build_number: 5
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.2.13-hcfcfb64_5.conda
+  sha256: c161822ee8130b71e08b6d282b9919c1de2c5274b29921a867bca0f7d30cad26
+  md5: 5fdb9c6a113b6b6cb5e517fd972d5f41
+  depends:
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  constrains:
+  - zlib 1.2.13 *_5
+  license: Zlib
+  license_family: Other
+  size: 55800
+  timestamp: 1686575452215
+- kind: conda
+  name: libzlib
+  version: 1.2.13
+  build: hd590300_5
+  build_number: 5
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda
+  sha256: 370c7c5893b737596fd6ca0d9190c9715d89d888b8c88537ae1ef168c25e82e4
+  md5: f36c115f1ee199da648e0597ec2047ad
+  depends:
+  - libgcc-ng >=12
+  constrains:
+  - zlib 1.2.13 *_5
+  license: Zlib
+  license_family: Other
+  size: 61588
+  timestamp: 1686575217516
+- kind: conda
+  name: m2w64-gcc-libgfortran
+  version: 5.3.0
+  build: '6'
+  build_number: 6
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2
+  sha256: 9de95a7996d5366ae0808eef2acbc63f9b11b874aa42375f55379e6715845dc6
+  md5: 066552ac6b907ec6d72c0ddab29050dc
+  depends:
+  - m2w64-gcc-libs-core
+  - msys2-conda-epoch ==20160418
+  license: GPL, LGPL, FDL, custom
+  size: 350687
+  timestamp: 1608163451316
+- kind: conda
+  name: m2w64-gcc-libs
+  version: 5.3.0
+  build: '7'
+  build_number: 7
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2
+  sha256: 3bd1ab02b7c89a5b153a17be03b36d833f1517ff2a6a77ead7c4a808b88196aa
+  md5: fe759119b8b3bfa720b8762c6fdc35de
+  depends:
+  - m2w64-gcc-libgfortran
+  - m2w64-gcc-libs-core
+  - m2w64-gmp
+  - m2w64-libwinpthread-git
+  - msys2-conda-epoch ==20160418
+  license: GPL3+, partial:GCCRLE, partial:LGPL2+
+  size: 532390
+  timestamp: 1608163512830
+- kind: conda
+  name: m2w64-gcc-libs-core
+  version: 5.3.0
+  build: '7'
+  build_number: 7
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2
+  sha256: 58afdfe859ed2e9a9b1cc06bc408720cb2c3a6a132e59d4805b090d7574f4ee0
+  md5: 4289d80fb4d272f1f3b56cfe87ac90bd
+  depends:
+  - m2w64-gmp
+  - m2w64-libwinpthread-git
+  - msys2-conda-epoch ==20160418
+  license: GPL3+, partial:GCCRLE, partial:LGPL2+
+  size: 219240
+  timestamp: 1608163481341
+- kind: conda
+  name: m2w64-gmp
+  version: 6.1.0
+  build: '2'
+  build_number: 2
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/m2w64-gmp-6.1.0-2.tar.bz2
+  sha256: 7e3cd95f554660de45f8323fca359e904e8d203efaf07a4d311e46d611481ed1
+  md5: 53a1c73e1e3d185516d7e3af177596d9
+  depends:
+  - msys2-conda-epoch ==20160418
+  license: LGPL3
+  size: 743501
+  timestamp: 1608163782057
+- kind: conda
+  name: m2w64-libwinpthread-git
+  version: 5.0.0.4634.697f757
+  build: '2'
+  build_number: 2
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2
+  sha256: f63a09b2cae7defae0480f1740015d6235f1861afa6fe2e2d3e10bd0d1314ee0
+  md5: 774130a326dee16f1ceb05cc687ee4f0
+  depends:
+  - msys2-conda-epoch ==20160418
+  license: MIT, BSD
+  size: 31928
+  timestamp: 1608166099896
+- kind: conda
+  name: msys2-conda-epoch
+  version: '20160418'
+  build: '1'
+  build_number: 1
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/msys2-conda-epoch-20160418-1.tar.bz2
+  sha256: 99358d58d778abee4dca82ad29fb58058571f19b0f86138363c260049d4ac7f1
+  md5: b0309b72560df66f71a9d5e34a5efdfa
+  size: 3227
+  timestamp: 1608166968312
+- kind: conda
+  name: mypy
+  version: 1.8.0
+  build: py310h2372a71_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/mypy-1.8.0-py310h2372a71_0.conda
+  sha256: 6c01268327db83c70c38cfc87fc13a71d09cda123ae06cd6edbbe620c2b20f33
+  md5: 3320dc32fc6bd29ab4a16cf22bc35fc2
+  depends:
+  - libgcc-ng >=12
+  - mypy_extensions >=1.0.0
+  - psutil >=4.0
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  - tomli >=1.1.0
+  - typing_extensions >=4.1.0
+  license: MIT
+  license_family: MIT
+  size: 17160046
+  timestamp: 1703185152663
+- kind: conda
+  name: mypy
+  version: 1.8.0
+  build: py310h8d17308_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/mypy-1.8.0-py310h8d17308_0.conda
+  sha256: 8ca9e638a538225b6a5a935573964fa4d743456ece6171988d4116d57a635069
+  md5: 42c9adc3e138cd581a869d46dfdb3fcd
+  depends:
+  - mypy_extensions >=1.0.0
+  - psutil >=4.0
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  - tomli >=1.1.0
+  - typing_extensions >=4.1.0
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  license: MIT
+  license_family: MIT
+  size: 9332107
+  timestamp: 1703185142866
+- kind: conda
+  name: mypy
+  version: 1.8.0
+  build: py310hb299538_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/mypy-1.8.0-py310hb299538_0.conda
+  sha256: a970f3c8d22641c7202a6b40c106a55e8b77b0fe4cca715c95ca551262cdebea
+  md5: bf64efdd960ab39f9cd99f562f7ad27f
+  depends:
+  - libgcc-ng >=12
+  - mypy_extensions >=1.0.0
+  - psutil >=4.0
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  - tomli >=1.1.0
+  - typing_extensions >=4.1.0
+  license: MIT
+  license_family: MIT
+  size: 14537904
+  timestamp: 1703185330563
+- kind: conda
+  name: mypy
+  version: 1.8.0
+  build: py310hb372a2b_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/mypy-1.8.0-py310hb372a2b_0.conda
+  sha256: 3af15d65a207840b15b3298398f70e48263a20706d2bc48bede09f9077597759
+  md5: 5e8c2a9af839d3d23be1cf7e2c955c5c
+  depends:
+  - mypy_extensions >=1.0.0
+  - psutil >=4.0
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  - tomli >=1.1.0
+  - typing_extensions >=4.1.0
+  license: MIT
+  license_family: MIT
+  size: 11303456
+  timestamp: 1703184930605
+- kind: conda
+  name: mypy
+  version: 1.8.0
+  build: py310hd125d64_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/mypy-1.8.0-py310hd125d64_0.conda
+  sha256: 630f3bccefb8b7139dcee0941fb241664130953f319885134f8cd813d4e300ce
+  md5: 0884b51650eb1e4f77022ffd68819ce6
+  depends:
+  - mypy_extensions >=1.0.0
+  - psutil >=4.0
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  - tomli >=1.1.0
+  - typing_extensions >=4.1.0
+  license: MIT
+  license_family: MIT
+  size: 8941826
+  timestamp: 1703185223331
+- kind: conda
+  name: mypy_extensions
+  version: 1.0.0
+  build: pyha770c72_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.0.0-pyha770c72_0.conda
+  sha256: f240217476e148e825420c6bc3a0c0efb08c0718b7042fae960400c02af858a3
+  md5: 4eccaeba205f0aed9ac3a9ea58568ca3
+  depends:
+  - python >=3.5
+  license: MIT
+  license_family: MIT
+  size: 10492
+  timestamp: 1675543414256
+- kind: conda
+  name: ncurses
+  version: 6.4.20240210
+  build: h0425590_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.4.20240210-h0425590_0.conda
+  sha256: 4223dc34e2bddd37bf995158ae481e00be375b287d539bc7a0532634c0fc63b7
+  md5: c1a1612ddaee95c83abfa0b2ec858626
+  depends:
+  - libgcc-ng >=12
+  license: X11 AND BSD-3-Clause
+  size: 926594
+  timestamp: 1710866633409
+- kind: conda
+  name: ncurses
+  version: 6.4.20240210
+  build: h078ce10_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.4.20240210-h078ce10_0.conda
+  sha256: 06f0905791575e2cd3aa961493c56e490b3d82ad9eb49f1c332bd338b0216911
+  md5: 616ae8691e6608527d0071e6766dcb81
+  license: X11 AND BSD-3-Clause
+  size: 820249
+  timestamp: 1710866874348
+- kind: conda
+  name: ncurses
+  version: 6.4.20240210
+  build: h59595ed_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda
+  sha256: aa0f005b6727aac6507317ed490f0904430584fa8ca722657e7f0fb94741de81
+  md5: 97da8860a0da5413c7c98a3b3838a645
+  depends:
+  - libgcc-ng >=12
+  license: X11 AND BSD-3-Clause
+  size: 895669
+  timestamp: 1710866638986
+- kind: conda
+  name: ncurses
+  version: 6.4.20240210
+  build: h73e2aa4_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.4.20240210-h73e2aa4_0.conda
+  sha256: 50b72acf08acbc4e5332807653e2ca6b26d4326e8af16fad1fd3f2ce9ea55503
+  md5: 50f28c512e9ad78589e3eab34833f762
+  license: X11 AND BSD-3-Clause
+  size: 823010
+  timestamp: 1710866856626
+- kind: conda
+  name: openssl
+  version: 3.2.1
+  build: h0d3ecfb_1
+  build_number: 1
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.2.1-h0d3ecfb_1.conda
+  sha256: 519dc941d7ab0ebf31a2878d85c2f444450e7c5f6f41c4d07252c6bb3417b78b
+  md5: eb580fb888d93d5d550c557323ac5cee
+  depends:
+  - ca-certificates
+  constrains:
+  - pyopenssl >=22.1
+  license: Apache-2.0
+  license_family: Apache
+  size: 2855250
+  timestamp: 1710793435903
+- kind: conda
+  name: openssl
+  version: 3.2.1
+  build: h31becfc_1
+  build_number: 1
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.2.1-h31becfc_1.conda
+  sha256: 055a26e99ebc12ae0cf23266a0e62e71b59b8ce8cafb1ebb87e375ef9c758d7b
+  md5: e95eb18d256edc72058e0dc9be5338a0
+  depends:
+  - ca-certificates
+  - libgcc-ng >=12
+  constrains:
+  - pyopenssl >=22.1
+  license: Apache-2.0
+  license_family: Apache
+  size: 3380844
+  timestamp: 1710793424665
+- kind: conda
+  name: openssl
+  version: 3.2.1
+  build: hcfcfb64_1
+  build_number: 1
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/openssl-3.2.1-hcfcfb64_1.conda
+  sha256: 61ce4e11c3c26ed4e4d9b7e7e2483121a1741ad0f9c8db0a91a28b6e05182ce6
+  md5: 958e0418e93e50c575bff70fbcaa12d8
+  depends:
+  - ca-certificates
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  constrains:
+  - pyopenssl >=22.1
+  license: Apache-2.0
+  license_family: Apache
+  size: 8230112
+  timestamp: 1710796158475
+- kind: conda
+  name: openssl
+  version: 3.2.1
+  build: hd590300_1
+  build_number: 1
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda
+  sha256: 2c689444ed19a603be457284cf2115ee728a3fafb7527326e96054dee7cdc1a7
+  md5: 9d731343cff6ee2e5a25c4a091bf8e2a
+  depends:
+  - ca-certificates
+  - libgcc-ng >=12
+  constrains:
+  - pyopenssl >=22.1
+  license: Apache-2.0
+  license_family: Apache
+  size: 2865379
+  timestamp: 1710793235846
+- kind: conda
+  name: openssl
+  version: 3.2.1
+  build: hd75f5a5_1
+  build_number: 1
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.2.1-hd75f5a5_1.conda
+  sha256: 7ae0ac6a1673584a8a380c2ff3d46eca48ed53bc7174c0d4eaa0dd2f247a0984
+  md5: 570a6f04802df580be529f3a72d2bbf7
+  depends:
+  - ca-certificates
+  constrains:
+  - pyopenssl >=22.1
+  license: Apache-2.0
+  license_family: Apache
+  size: 2506344
+  timestamp: 1710793930515
+- kind: conda
+  name: pip
+  version: '24.0'
+  build: pyhd8ed1ab_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda
+  sha256: b7c1c5d8f13e8cb491c4bd1d0d1896a4cf80fc47de01059ad77509112b664a4a
+  md5: f586ac1e56c8638b64f9c8122a7b8a67
+  depends:
+  - python >=3.7
+  - setuptools
+  - wheel
+  license: MIT
+  license_family: MIT
+  size: 1398245
+  timestamp: 1706960660581
+- kind: conda
+  name: psutil
+  version: 6.0.0
+  build: py310h936d840_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/psutil-6.0.0-py310h936d840_0.conda
+  sha256: c976819733772f63a1c8e704cb96bf4287c0eb477b10ba467be3adbe5974bf3a
+  md5: 2f5a3bd97ce3176794b59c160ed51fba
+  depends:
+  - __osx >=10.13
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 378205
+  timestamp: 1719274714245
+- kind: conda
+  name: psutil
+  version: 6.0.0
+  build: py310ha6dd24b_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/psutil-6.0.0-py310ha6dd24b_0.conda
+  sha256: 7477bd84734668992cda9076147c5d07ce92f59c70441757a5b289401bd8ed85
+  md5: 0e0df689b8c6ea6676b786bd78a575d1
+  depends:
+  - __osx >=11.0
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 379588
+  timestamp: 1719274858964
+- kind: conda
+  name: psutil
+  version: 6.0.0
+  build: py310ha8f682b_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/psutil-6.0.0-py310ha8f682b_0.conda
+  sha256: 9801a18aa6fadd3a6286fd89e83fe6affbcb3ca275bb2a00ab0da299d32e92ad
+  md5: 32f5673b7aa2309dda74ccd01822caca
+  depends:
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 388249
+  timestamp: 1719275165312
+- kind: conda
+  name: psutil
+  version: 6.0.0
+  build: py310hb52b2da_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/psutil-6.0.0-py310hb52b2da_0.conda
+  sha256: 8ba0f87bbcbbd9f73c9695522ffe2082f1562a356f59ead8b3d02ab17151fc52
+  md5: a75da26171bc7400ed580382b5c18196
+  depends:
+  - libgcc-ng >=12
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 372806
+  timestamp: 1719274753799
+- kind: conda
+  name: psutil
+  version: 6.0.0
+  build: py310hc51659f_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/psutil-6.0.0-py310hc51659f_0.conda
+  sha256: d23e0a2bf49a752fcc8267484c5eff3e5b267703853c11cc7b4f762412d0f7ef
+  md5: b04405826f96f4eb2f502e642d121bb5
+  depends:
+  - libgcc-ng >=12
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 371633
+  timestamp: 1719274668659
+- kind: conda
+  name: pysocks
+  version: 1.7.1
+  build: pyh0701188_6
+  build_number: 6
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh0701188_6.tar.bz2
+  sha256: b3a612bc887f3dd0fb7c4199ad8e342bd148cf69a9b74fd9468a18cf2bef07b7
+  md5: 56cd9fe388baac0e90c7149cfac95b60
+  depends:
+  - __win
+  - python >=3.8
+  - win_inet_pton
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 19348
+  timestamp: 1661605138291
+- kind: conda
+  name: pysocks
+  version: 1.7.1
+  build: pyha2e5f31_6
+  build_number: 6
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2
+  sha256: a42f826e958a8d22e65b3394f437af7332610e43ee313393d1cf143f0a2d274b
+  md5: 2a7de29fb590ca14b5243c4c812c8025
+  depends:
+  - __unix
+  - python >=3.8
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 18981
+  timestamp: 1661604969727
+- kind: conda
+  name: python
+  version: 3.10.14
+  build: h00d2728_0_cpython
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/python-3.10.14-h00d2728_0_cpython.conda
+  sha256: 00c1de2d46ede26609ef4e84a44b83be7876ba6a0215b7c83bff41a0656bf694
+  md5: 0a1cddc4382c5c171e791c70740546dd
+  depends:
+  - bzip2 >=1.0.8,<2.0a0
+  - libffi >=3.4,<4.0a0
+  - libsqlite >=3.45.2,<4.0a0
+  - libzlib >=1.2.13,<2.0.0a0
+  - ncurses >=6.4.20240210,<7.0a0
+  - openssl >=3.2.1,<4.0a0
+  - readline >=8.2,<9.0a0
+  - tk >=8.6.13,<8.7.0a0
+  - tzdata
+  - xz >=5.2.6,<6.0a0
+  constrains:
+  - python_abi 3.10.* *_cp310
+  license: Python-2.0
+  size: 11890228
+  timestamp: 1710940046031
+- kind: conda
+  name: python
+  version: 3.10.14
+  build: h2469fbe_0_cpython
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.10.14-h2469fbe_0_cpython.conda
+  sha256: 454d609fe25daedce9e886efcbfcadad103ed0362e7cb6d2bcddec90b1ecd3ee
+  md5: 4ae999c8227c6d8c7623d32d51d25ea9
+  depends:
+  - bzip2 >=1.0.8,<2.0a0
+  - libffi >=3.4,<4.0a0
+  - libsqlite >=3.45.2,<4.0a0
+  - libzlib >=1.2.13,<2.0.0a0
+  - ncurses >=6.4.20240210,<7.0a0
+  - openssl >=3.2.1,<4.0a0
+  - readline >=8.2,<9.0a0
+  - tk >=8.6.13,<8.7.0a0
+  - tzdata
+  - xz >=5.2.6,<6.0a0
+  constrains:
+  - python_abi 3.10.* *_cp310
+  license: Python-2.0
+  size: 12336005
+  timestamp: 1710939659384
+- kind: conda
+  name: python
+  version: 3.10.14
+  build: h4de0772_0_cpython
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/python-3.10.14-h4de0772_0_cpython.conda
+  sha256: 332f97d9927b65857d6d2d4d50d66dce9b37da81edb67833ae6b88ad52acbd0c
+  md5: 4a00e84f29d1eb418d84970598c444e1
+  depends:
+  - bzip2 >=1.0.8,<2.0a0
+  - libffi >=3.4,<4.0a0
+  - libsqlite >=3.45.2,<4.0a0
+  - libzlib >=1.2.13,<2.0.0a0
+  - openssl >=3.2.1,<4.0a0
+  - tk >=8.6.13,<8.7.0a0
+  - tzdata
+  - vc >=14.1,<15
+  - vc14_runtime >=14.16.27033
+  - xz >=5.2.6,<6.0a0
+  constrains:
+  - python_abi 3.10.* *_cp310
+  license: Python-2.0
+  size: 15864027
+  timestamp: 1710938888352
+- kind: conda
+  name: python
+  version: 3.10.14
+  build: hbbe8eec_0_cpython
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.10.14-hbbe8eec_0_cpython.conda
+  sha256: 992583064b95d256e1b1f03581a51e225a425894d865e35ea2bf3017444c3e84
+  md5: 8a8ee3a8c62032c554debc785a3b5aba
+  depends:
+  - bzip2 >=1.0.8,<2.0a0
+  - ld_impl_linux-aarch64 >=2.36.1
+  - libffi >=3.4,<4.0a0
+  - libgcc-ng >=12
+  - libnsl >=2.0.1,<2.1.0a0
+  - libsqlite >=3.45.2,<4.0a0
+  - libuuid >=2.38.1,<3.0a0
+  - libxcrypt >=4.4.36
+  - libzlib >=1.2.13,<2.0.0a0
+  - ncurses >=6.4.20240210,<7.0a0
+  - openssl >=3.2.1,<4.0a0
+  - readline >=8.2,<9.0a0
+  - tk >=8.6.13,<8.7.0a0
+  - tzdata
+  - xz >=5.2.6,<6.0a0
+  constrains:
+  - python_abi 3.10.* *_cp310
+  license: Python-2.0
+  size: 13116477
+  timestamp: 1710971217224
+- kind: conda
+  name: python
+  version: 3.10.14
+  build: hd12c33a_0_cpython
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda
+  sha256: 76a5d12e73542678b70a94570f7b0f7763f9a938f77f0e75d9ea615ef22aa84c
+  md5: 2b4ba962994e8bd4be9ff5b64b75aff2
+  depends:
+  - bzip2 >=1.0.8,<2.0a0
+  - ld_impl_linux-64 >=2.36.1
+  - libffi >=3.4,<4.0a0
+  - libgcc-ng >=12
+  - libnsl >=2.0.1,<2.1.0a0
+  - libsqlite >=3.45.2,<4.0a0
+  - libuuid >=2.38.1,<3.0a0
+  - libxcrypt >=4.4.36
+  - libzlib >=1.2.13,<2.0.0a0
+  - ncurses >=6.4.20240210,<7.0a0
+  - openssl >=3.2.1,<4.0a0
+  - readline >=8.2,<9.0a0
+  - tk >=8.6.13,<8.7.0a0
+  - tzdata
+  - xz >=5.2.6,<6.0a0
+  constrains:
+  - python_abi 3.10.* *_cp310
+  license: Python-2.0
+  size: 25517742
+  timestamp: 1710939725109
+- kind: conda
+  name: python_abi
+  version: '3.10'
+  build: 4_cp310
+  build_number: 4
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda
+  sha256: 456bec815bfc2b364763084d08b412fdc4c17eb9ccc66a36cb775fa7ac3cbaec
+  md5: 26322ec5d7712c3ded99dd656142b8ce
+  constrains:
+  - python 3.10.* *_cpython
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 6398
+  timestamp: 1695147363189
+- kind: conda
+  name: python_abi
+  version: '3.10'
+  build: 4_cp310
+  build_number: 4
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/python_abi-3.10-4_cp310.conda
+  sha256: 9191cc3ddf380b655c08b3436a8174ce0cc798a6dfcfa8ee80fa793d0b7165de
+  md5: b0ff2ed109650f9e90d627d3119eb442
+  constrains:
+  - python 3.10.* *_cpython
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 6436
+  timestamp: 1695147402616
+- kind: conda
+  name: python_abi
+  version: '3.10'
+  build: 4_cp310
+  build_number: 4
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/python_abi-3.10-4_cp310.conda
+  sha256: abc26b3b5a62f9c8112a2303d24b0c590d5f7fc9470521f5a520472d59c2223e
+  md5: b15c816c5a86abcc4d1458dd63aa4c65
+  constrains:
+  - python 3.10.* *_cpython
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 6484
+  timestamp: 1695147705581
+- kind: conda
+  name: python_abi
+  version: '3.10'
+  build: 4_cp310
+  build_number: 4
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/python_abi-3.10-4_cp310.conda
+  sha256: f69bac2f28082a275ef67313968b2c366d8236c3a6869b9cdf5cdb97a5821812
+  md5: 1a3d9c6bb5f0b1b22d9e9296c127e8c7
+  constrains:
+  - python 3.10.* *_cpython
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 6490
+  timestamp: 1695147522999
+- kind: conda
+  name: python_abi
+  version: '3.10'
+  build: 4_cp310
+  build_number: 4
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/python_abi-3.10-4_cp310.conda
+  sha256: 19066c462fd0e32c64503c688f77cb603beb4019b812caf855d03f2a5447960b
+  md5: b41195997c14fb7473d26637ea4c3946
+  constrains:
+  - python 3.10.* *_cpython
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 6773
+  timestamp: 1695147715814
+- kind: conda
+  name: readline
+  version: '8.2'
+  build: h8228510_1
+  build_number: 1
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda
+  sha256: 5435cf39d039387fbdc977b0a762357ea909a7694d9528ab40f005e9208744d7
+  md5: 47d31b792659ce70f470b5c82fdfb7a4
+  depends:
+  - libgcc-ng >=12
+  - ncurses >=6.3,<7.0a0
+  license: GPL-3.0-only
+  license_family: GPL
+  size: 281456
+  timestamp: 1679532220005
+- kind: conda
+  name: readline
+  version: '8.2'
+  build: h8fc344f_1
+  build_number: 1
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.2-h8fc344f_1.conda
+  sha256: 4c99f7417419734e3797d45bc355e61c26520e111893b0d7087a01a7fbfbe3dd
+  md5: 105eb1e16bf83bfb2eb380a48032b655
+  depends:
+  - libgcc-ng >=12
+  - ncurses >=6.3,<7.0a0
+  license: GPL-3.0-only
+  license_family: GPL
+  size: 294092
+  timestamp: 1679532238805
+- kind: conda
+  name: readline
+  version: '8.2'
+  build: h92ec313_1
+  build_number: 1
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda
+  sha256: a1dfa679ac3f6007362386576a704ad2d0d7a02e98f5d0b115f207a2da63e884
+  md5: 8cbb776a2f641b943d413b3e19df71f4
+  depends:
+  - ncurses >=6.3,<7.0a0
+  license: GPL-3.0-only
+  license_family: GPL
+  size: 250351
+  timestamp: 1679532511311
+- kind: conda
+  name: readline
+  version: '8.2'
+  build: h9e318b2_1
+  build_number: 1
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda
+  sha256: 41e7d30a097d9b060037f0c6a2b1d4c4ae7e942c06c943d23f9d481548478568
+  md5: f17f77f2acf4d344734bda76829ce14e
+  depends:
+  - ncurses >=6.3,<7.0a0
+  license: GPL-3.0-only
+  license_family: GPL
+  size: 255870
+  timestamp: 1679532707590
+- kind: conda
+  name: ruff
+  version: 0.3.7
+  build: py310h298983d_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/ruff-0.3.7-py310h298983d_0.conda
+  sha256: fd0766092fb869c3c591da324cb8e675e9cd01e220dc72b9f2b857e92337c01d
+  md5: 9a5e1425ea8eac4f79a275c20d859cac
+  depends:
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  license: MIT
+  license_family: MIT
+  size: 6353965
+  timestamp: 1712963138812
+- kind: conda
+  name: ruff
+  version: 0.3.7
+  build: py310h3d77a66_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.3.7-py310h3d77a66_0.conda
+  sha256: fc66dae77831b8110cd20ee257b462b4471204d310fc438d9760b769799969d9
+  md5: 55b40e33fae0b983b48d2f7aff8a8978
+  depends:
+  - libgcc-ng >=12
+  - libstdcxx-ng >=12
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  license: MIT
+  license_family: MIT
+  size: 6402959
+  timestamp: 1712962093948
+- kind: conda
+  name: ruff
+  version: 0.3.7
+  build: py310h81561d7_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/ruff-0.3.7-py310h81561d7_0.conda
+  sha256: b7c88aae99ef73ffac7886e4dbb02428230b66284fa360134985bb1741f8d569
+  md5: 4d922de656cdef20fb9fd47f0529bb95
+  depends:
+  - libcxx >=16
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  constrains:
+  - __osx >=11.0
+  license: MIT
+  license_family: MIT
+  size: 5882969
+  timestamp: 1712963596623
+- kind: conda
+  name: ruff
+  version: 0.3.7
+  build: py310hdac29b7_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/ruff-0.3.7-py310hdac29b7_0.conda
+  sha256: 1d279579eb3282973102f3c903ef21b3b033a8635d00d08ff40959871baf91dc
+  md5: ab5f005a86062035d3eaf1adb081cc26
+  depends:
+  - libcxx >=16
+  - python >=3.10,<3.11.0a0
+  - python_abi 3.10.* *_cp310
+  constrains:
+  - __osx >=10.12
+  license: MIT
+  license_family: MIT
+  size: 6192290
+  timestamp: 1712963577697
+- kind: conda
+  name: ruff
+  version: 0.3.7
+  build: py310hf6424b7_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/ruff-0.3.7-py310hf6424b7_0.conda
+  sha256: 45cf4cfcaf97598959077d93e698db15aa17a7e215c5b050738367def0227e02
+  md5: c6dcc788fee5fefe50165e9c0353d065
+  depends:
+  - libgcc-ng >=12
+  - libstdcxx-ng >=12
+  - python >=3.10,<3.11.0a0
+  - python >=3.10,<3.11.0a0 *_cpython
+  - python_abi 3.10.* *_cp310
+  license: MIT
+  license_family: MIT
+  size: 6047252
+  timestamp: 1712962189494
+- kind: conda
+  name: setuptools
+  version: 70.1.1
+  build: pyhd8ed1ab_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda
+  sha256: 34ecbc63df6052a320838335a0e594b60050c92de79254045e52095bc27dde03
+  md5: 985e9e86e1b0fc75a74a9bfab9309ef7
+  depends:
+  - python >=3.8
+  license: MIT
+  license_family: MIT
+  size: 496940
+  timestamp: 1719325175003
+- kind: conda
+  name: tk
+  version: 8.6.13
+  build: h194ca79_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-h194ca79_0.conda
+  sha256: 7fa27cc512d3a783f38bd16bbbffc008807372499d5b65d089a8e43bde9db267
+  md5: f75105e0585851f818e0009dd1dde4dc
+  depends:
+  - libgcc-ng >=12
+  - libzlib >=1.2.13,<1.3.0a0
+  license: TCL
+  license_family: BSD
+  size: 3351802
+  timestamp: 1695506242997
+- kind: conda
+  name: tk
+  version: 8.6.13
+  build: h1abcd95_1
+  build_number: 1
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda
+  sha256: 30412b2e9de4ff82d8c2a7e5d06a15f4f4fef1809a72138b6ccb53a33b26faf5
+  md5: bf830ba5afc507c6232d4ef0fb1a882d
+  depends:
+  - libzlib >=1.2.13,<1.3.0a0
+  license: TCL
+  license_family: BSD
+  size: 3270220
+  timestamp: 1699202389792
+- kind: conda
+  name: tk
+  version: 8.6.13
+  build: h5083fa2_1
+  build_number: 1
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda
+  sha256: 72457ad031b4c048e5891f3f6cb27a53cb479db68a52d965f796910e71a403a8
+  md5: b50a57ba89c32b62428b71a875291c9b
+  depends:
+  - libzlib >=1.2.13,<1.3.0a0
+  license: TCL
+  license_family: BSD
+  size: 3145523
+  timestamp: 1699202432999
+- kind: conda
+  name: tk
+  version: 8.6.13
+  build: h5226925_1
+  build_number: 1
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda
+  sha256: 2c4e914f521ccb2718946645108c9bd3fc3216ba69aea20c2c3cedbd8db32bb1
+  md5: fc048363eb8f03cd1737600a5d08aafe
+  depends:
+  - ucrt >=10.0.20348.0
+  - vc >=14.2,<15
+  - vc14_runtime >=14.29.30139
+  license: TCL
+  license_family: BSD
+  size: 3503410
+  timestamp: 1699202577803
+- kind: conda
+  name: tk
+  version: 8.6.13
+  build: noxft_h4845f30_101
+  build_number: 101
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda
+  sha256: e0569c9caa68bf476bead1bed3d79650bb080b532c64a4af7d8ca286c08dea4e
+  md5: d453b98d9c83e71da0741bb0ff4d76bc
+  depends:
+  - libgcc-ng >=12
+  - libzlib >=1.2.13,<1.3.0a0
+  license: TCL
+  license_family: BSD
+  size: 3318875
+  timestamp: 1699202167581
+- kind: conda
+  name: tomli
+  version: 2.0.1
+  build: pyhd8ed1ab_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2
+  sha256: 4cd48aba7cd026d17e86886af48d0d2ebc67ed36f87f6534f4b67138f5a5a58f
+  md5: 5844808ffab9ebdb694585b50ba02a96
+  depends:
+  - python >=3.7
+  license: MIT
+  license_family: MIT
+  size: 15940
+  timestamp: 1644342331069
+- kind: conda
+  name: types-requests
+  version: 2.31.0.20240406
+  build: pyhd8ed1ab_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/types-requests-2.31.0.20240406-pyhd8ed1ab_0.conda
+  sha256: de93470fe64b2baa5f8ef16a6edf849bb93542f301ed343d0ab4d6fd6116d742
+  md5: b4bc9b6dbc54191100b518a18be6045e
+  depends:
+  - python >=3.6
+  - urllib3 >=2
+  license: Apache-2.0 AND MIT
+  size: 26072
+  timestamp: 1712378106245
+- kind: conda
+  name: typing_extensions
+  version: 4.11.0
+  build: pyha770c72_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.11.0-pyha770c72_0.conda
+  sha256: a7e8714d14f854058e971a6ed44f18cc37cc685f98ddefb2e6b7899a0cc4d1a2
+  md5: 6ef2fc37559256cf682d8b3375e89b80
+  depends:
+  - python >=3.8
+  license: PSF-2.0
+  license_family: PSF
+  size: 37583
+  timestamp: 1712330089194
+- kind: conda
+  name: typos
+  version: 1.20.9
+  build: h11a7dfb_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/typos-1.20.9-h11a7dfb_0.conda
+  sha256: c6345dad41706c3f925a3c76308e2af390ed3ce8eb92b84ebb82a37f2f599d1e
+  md5: 9e8f7d03be6ae3eb50f14fddffc7fe51
+  constrains:
+  - __osx >=10.12
+  license: MIT
+  license_family: MIT
+  size: 3347431
+  timestamp: 1713321633495
+- kind: conda
+  name: typos
+  version: 1.20.9
+  build: h1d8f897_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/typos-1.20.9-h1d8f897_0.conda
+  sha256: d98e8e5fb65b4d09cff752534418acf3e835f702003636ef2f2165039ddbb521
+  md5: 519bac2b977d194cc65f32c72bea5d12
+  depends:
+  - libgcc-ng >=12
+  license: MIT
+  license_family: MIT
+  size: 3561452
+  timestamp: 1713321791172
+- kind: conda
+  name: typos
+  version: 1.20.9
+  build: h5ef7bb8_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/typos-1.20.9-h5ef7bb8_0.conda
+  sha256: 42e47f16457a6193658e3052c2dbabbdcb5759f79c8f83eb06eb3af91287c18e
+  md5: f22e8b9559961907af67f4ec6d612456
+  constrains:
+  - __osx >=11.0
+  license: MIT
+  license_family: MIT
+  size: 3331993
+  timestamp: 1713321682918
+- kind: conda
+  name: typos
+  version: 1.20.9
+  build: h7f3b576_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/typos-1.20.9-h7f3b576_0.conda
+  sha256: ff9db9be0d78163b9b26ab6169e84d6026c05552a51bb335bbe62e0f4680cfd9
+  md5: 9e6937e2784adfe9088d8ee3ba944a07
+  depends:
+  - m2w64-gcc-libs
+  - m2w64-gcc-libs-core
+  license: MIT
+  license_family: MIT
+  size: 2596487
+  timestamp: 1713321776083
+- kind: conda
+  name: typos
+  version: 1.20.9
+  build: he8a937b_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/typos-1.20.9-he8a937b_0.conda
+  sha256: 2fc3888119f17b675edca08e1927f7f3445ec8c5ceb7841fc5dd6f20d8373fdd
+  md5: fc85f19eb1191bb7e6bf34d4b2ec39e6
+  depends:
+  - libgcc-ng >=12
+  license: MIT
+  license_family: MIT
+  size: 3728370
+  timestamp: 1713320476655
+- kind: conda
+  name: tzdata
+  version: 2024a
+  build: h0c530f3_0
+  subdir: noarch
+  noarch: generic
+  url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda
+  sha256: 7b2b69c54ec62a243eb6fba2391b5e443421608c3ae5dbff938ad33ca8db5122
+  md5: 161081fc7cec0bfda0d86d7cb595f8d8
+  license: LicenseRef-Public-Domain
+  size: 119815
+  timestamp: 1706886945727
+- kind: conda
+  name: ucrt
+  version: 10.0.22621.0
+  build: h57928b3_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_0.tar.bz2
+  sha256: f29cdaf8712008f6b419b8b1a403923b00ab2504bfe0fb2ba8eb60e72d4f14c6
+  md5: 72608f6cd3e5898229c3ea16deb1ac43
+  constrains:
+  - vs2015_runtime >=14.29.30037
+  license: LicenseRef-Proprietary
+  license_family: PROPRIETARY
+  size: 1283972
+  timestamp: 1666630199266
+- kind: conda
+  name: urllib3
+  version: 2.2.1
+  build: pyhd8ed1ab_0
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.1-pyhd8ed1ab_0.conda
+  sha256: d4009dcc9327684d6409706ce17656afbeae690d8522d3c9bc4df57649a352cd
+  md5: 08807a87fa7af10754d46f63b368e016
+  depends:
+  - brotli-python >=1.0.9
+  - pysocks >=1.5.6,<2.0,!=1.5.7
+  - python >=3.7
+  license: MIT
+  license_family: MIT
+  size: 94669
+  timestamp: 1708239595549
+- kind: conda
+  name: vc
+  version: '14.3'
+  build: hcf57466_18
+  build_number: 18
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-hcf57466_18.conda
+  sha256: 447a8d8292a7b2107dcc18afb67f046824711a652725fc0f522c368e7a7b8318
+  md5: 20e1e652a4c740fa719002a8449994a2
+  depends:
+  - vc14_runtime >=14.38.33130
+  track_features:
+  - vc14
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 16977
+  timestamp: 1702511255313
+- kind: conda
+  name: vc14_runtime
+  version: 14.38.33130
+  build: h82b7239_18
+  build_number: 18
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.38.33130-h82b7239_18.conda
+  sha256: bf94c9af4b2e9cba88207001197e695934eadc96a5c5e4cd7597e950aae3d8ff
+  md5: 8be79fdd2725ddf7bbf8a27a4c1f79ba
+  depends:
+  - ucrt >=10.0.20348.0
+  constrains:
+  - vs2015_runtime 14.38.33130.* *_18
+  license: LicenseRef-ProprietaryMicrosoft
+  license_family: Proprietary
+  size: 749868
+  timestamp: 1702511239004
+- kind: conda
+  name: vs2015_runtime
+  version: 14.38.33130
+  build: hcb4865c_18
+  build_number: 18
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.38.33130-hcb4865c_18.conda
+  sha256: a2fec221f361d6263c117f4ea6d772b21c90a2f8edc6f3eb0eadec6bfe8843db
+  md5: 10d42885e3ed84e575b454db30f1aa93
+  depends:
+  - vc14_runtime >=14.38.33130
+  license: BSD-3-Clause
+  license_family: BSD
+  size: 16988
+  timestamp: 1702511261442
+- kind: conda
+  name: wheel
+  version: 0.43.0
+  build: pyhd8ed1ab_1
+  build_number: 1
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda
+  sha256: cb318f066afd6fd64619f14c030569faf3f53e6f50abf743b4c865e7d95b96bc
+  md5: 0b5293a157c2b5cd513dd1b03d8d3aae
+  depends:
+  - python >=3.8
+  license: MIT
+  license_family: MIT
+  size: 57963
+  timestamp: 1711546009410
+- kind: conda
+  name: win_inet_pton
+  version: 1.1.0
+  build: pyhd8ed1ab_6
+  build_number: 6
+  subdir: noarch
+  noarch: python
+  url: https://conda.anaconda.org/conda-forge/noarch/win_inet_pton-1.1.0-pyhd8ed1ab_6.tar.bz2
+  sha256: a11ae693a0645bf6c7b8a47bac030be9c0967d0b1924537b9ff7458e832c0511
+  md5: 30878ecc4bd36e8deeea1e3c151b2e0b
+  depends:
+  - __win
+  - python >=3.6
+  license: PUBLIC-DOMAIN
+  size: 8191
+  timestamp: 1667051294134
+- kind: conda
+  name: xz
+  version: 5.2.6
+  build: h166bdaf_0
+  subdir: linux-64
+  url: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2
+  sha256: 03a6d28ded42af8a347345f82f3eebdd6807a08526d47899a42d62d319609162
+  md5: 2161070d867d1b1204ea749c8eec4ef0
+  depends:
+  - libgcc-ng >=12
+  license: LGPL-2.1 and GPL-2.0
+  size: 418368
+  timestamp: 1660346797927
+- kind: conda
+  name: xz
+  version: 5.2.6
+  build: h57fd34a_0
+  subdir: osx-arm64
+  url: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2
+  sha256: 59d78af0c3e071021cfe82dc40134c19dab8cdf804324b62940f5c8cd71803ec
+  md5: 39c6b54e94014701dd157f4f576ed211
+  license: LGPL-2.1 and GPL-2.0
+  size: 235693
+  timestamp: 1660346961024
+- kind: conda
+  name: xz
+  version: 5.2.6
+  build: h775f41a_0
+  subdir: osx-64
+  url: https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2
+  sha256: eb09823f34cc2dd663c0ec4ab13f246f45dcd52e5b8c47b9864361de5204a1c8
+  md5: a72f9d4ea13d55d745ff1ed594747f10
+  license: LGPL-2.1 and GPL-2.0
+  size: 238119
+  timestamp: 1660346964847
+- kind: conda
+  name: xz
+  version: 5.2.6
+  build: h8d14728_0
+  subdir: win-64
+  url: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2
+  sha256: 54d9778f75a02723784dc63aff4126ff6e6749ba21d11a6d03c1f4775f269fe0
+  md5: 515d77642eaa3639413c6b1bc3f94219
+  depends:
+  - vc >=14.1,<15
+  - vs2015_runtime >=14.16.27033
+  license: LGPL-2.1 and GPL-2.0
+  size: 217804
+  timestamp: 1660346976440
+- kind: conda
+  name: xz
+  version: 5.2.6
+  build: h9cdd2b7_0
+  subdir: linux-aarch64
+  url: https://conda.anaconda.org/conda-forge/linux-aarch64/xz-5.2.6-h9cdd2b7_0.tar.bz2
+  sha256: 93f58a7b393adf41fa007ac8c55978765e957e90cd31877ece1e5a343cb98220
+  md5: 83baad393a31d59c20b63ba4da6592df
+  depends:
+  - libgcc-ng >=12
+  license: LGPL-2.1 and GPL-2.0
+  size: 440555
+  timestamp: 1660348056328
diff --git a/pixi.toml b/pixi.toml
new file mode 100644
index 0000000000000000000000000000000000000000..4106f0d9b2f653fe2532d7472eb8c8849db3dbd3
--- /dev/null
+++ b/pixi.toml
@@ -0,0 +1,57 @@
+# Pixi is a package management tool for developers.
+# Before running a task, pixi ensures that all listed dependencies are installed first.
+#
+# Pixi is not required for rerun, but it is a convenient way to install the
+# dependencies required for this example.
+#
+# https://prefix.dev/docs/pixi/overview
+#
+# Use `pixi task list` to list the available tasks,
+# and `pixi run TASK` to run it (e.g. `pixi run example`).
+
+[project]
+name = "rerun_vista_example"
+authors = ["rerun.io "]
+channels = ["conda-forge"]
+description = "Visualizing the Vista model with Rerun."
+homepage = "https://rerun.io"
+license = "MIT OR Apache-2.0"
+
+platforms = ["linux-64", "linux-aarch64", "osx-arm64", "osx-64", "win-64"]
+readme = "README.md"
+repository = "https://github.com/rerun-io/hf-example-vista"
+version = "0.1.0"
+
+
+[tasks]
+# ------------------------------------------------------------------------------------------
+# Python stuff:
+
+# Run first ruff fix, then ruff format, order is important see also https://twitter.com/charliermarsh/status/1717229721954799727
+py-fmt = "ruff check --fix --config pyproject.toml . && ruff format --config pyproject.toml ."
+py-fmt-check = "ruff check --config pyproject.toml . && ruff format --check --config pyproject.toml"
+py-lint = "mypy --install-types --non-interactive --no-warn-unused-ignore"
+
+# ------------------------------------------------------------------------------------------
+# General stuff:
+lint-typos = "typos"
+
+# ------------------------------------------------------------------------------------------
+install-dependencies = "pip install -r requirements.txt"
+
+[tasks.example]
+cmd = "python main.py"
+depends_on = ["install-dependencies"]
+
+
+[dependencies]
+# Python stuff:
+mypy = "1.8.0"
+ruff = "0.3.7"
+python = "3.10.*"
+pip = ">=24.0,<25" # to install dependencies from requirements.txt
+
+types-requests = ">=2.31,<3" # mypy type hint stubs for generate_changelog.py
+
+# General stuff:
+typos = ">=1.16.20"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..a5460a713149209942d093db1678910423e3a4aa
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,72 @@
+# Copied from https://github.com/rerun-io/rerun_template
+
+[tool.ruff]
+# https://beta.ruff.rs/docs/configuration/
+
+target-version = "py38"
+
+# Enable unsafe fixes to allow ruff to apply fixes that may change the behavior of the code.
+# This is needed because otherwise ruff will not be able to trim whitespaces in (codegened) docstrings.
+unsafe-fixes = true
+
+# Allow preview lints to be enabled (like `PLW1514` to force `encoding` on open).
+preview = true
+# But we only want to opt-in to certain preview rules!
+lint.explicit-preview-rules = true
+
+extend-exclude = [
+    # Automatically generated test artifacts
+    "venv/",
+    "target/",
+]
+
+lint.ignore = [
+    # These make sense to ignore in example code, but for a proper library we should not ignore these.
+    "D100", # Missing docstring in public module
+    "D101", # Missing docstring in public class
+    "D103", # Missing docstring in public function
+
+    # No blank lines allowed after function docstring.
+    "D202",
+
+    # npydocstyle: http://www.pydocstyle.org/en/stable/error_codes.html
+    # numpy convention with a few additional lints
+    "D107",
+    "D203",
+    "D212",
+    "D401",
+    "D402",
+    "D415",
+    "D416",
+
+    # Ruff can't fix this error on its own (yet)
+    # Having ruff check this causes errors that prevent the code-formatting process from completing.
+    "E501",
+
+    # allow relative imports
+    "TID252",
+
+    "UP007", # We need this, or `ruff format` will convert `Union[X, Y]` to `X | Y` which breaks on Python 3.8
+]
+
+line-length = 120
+lint.select = [
+    "D",       # pydocstyle codes https://www.pydocstyle.org/en/latest/error_codes.html
+    "E",       # pycodestyle error codes: https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
+    "F",       # Flake8 error codes https://flake8.pycqa.org/en/latest/user/error-codes.html
+    "I",       # Isort
+    "TID",     # flake8-tidy-imports
+    "W",       # pycodestyle warning codes: https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
+    "UP",      # pyupgrade (ensures idiomatic code for supported python version)
+    "PLW1514", # Force setting `encoding` for open calls. This is in order to prevent issues when opening utf8 files on windows where the default encoding may depend on the active locale. https://docs.astral.sh/ruff/rules/unspecified-encoding/
+]
+
+lint.unfixable = [
+    "PLW1514", # Automatic fix for `encoding` doesn't do what we want - it queries the locale for the preferred encoding which is exactly what we want to avoid.
+]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401", "F403"]
+
+[tool.ruff.lint.isort]
+required-imports = ["from __future__ import annotations"]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6bec263ada6a3181e2a3c79273308f9d57dca53c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,168 @@
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+async-timeout==4.0.3
+attrs==23.2.0
+black==23.7.0
+blinker==1.8.2
+braceexpand==0.1.7
+cachetools==5.3.3
+certifi==2024.6.2
+chardet==5.1.0
+charset-normalizer==3.3.2
+click==8.1.7
+clip @ git+https://github.com/openai/CLIP.git
+cmake==3.29.5.1
+contourpy==1.2.1
+cycler==0.12.1
+deepspeed
+dnspython==2.6.1
+docker-pycreds==0.4.0
+einops==0.8.0
+email_validator==2.1.1
+exceptiongroup==1.2.1
+fairscale==0.4.13
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.15.1
+fire==0.6.0
+fonttools==4.53.0
+frozenlist==1.4.1
+fsspec==2024.6.0
+ftfy==6.2.0
+gitdb==4.0.11
+GitPython==3.1.43
+gradio==4.26.0
+gradio_client==0.15.1
+gradio_rerun==0.0.3
+h11==0.14.0
+hjson==3.1.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.3
+idna==3.7
+imageio==2.31.1
+imageio-ffmpeg==0.4.8
+importlib_resources==6.4.0
+invisible-watermark==0.2.0
+jedi==0.19.1
+Jinja2==3.1.4
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+kornia==0.7.2
+kornia_rs==0.1.3
+lightning-utilities==0.11.2
+lit==18.1.7
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.0
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+natsort==8.4.0
+networkx==3.3
+ninja==1.11.1.1
+numpy==1.26.4
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-ml-py==12.555.43
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+omegaconf==2.3.0
+open-clip-torch==2.24.0
+opencv-python==4.6.0.66
+orjson==3.10.4
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+pathspec==0.12.1
+pillow==10.3.0
+platformdirs==4.2.2
+protobuf==3.20.3
+psutil==5.9.8
+pudb==2024.1
+py-cpuinfo==9.0.0
+pyarrow==16.1.0
+pydantic==2.7.4
+pydantic_core==2.18.4
+pydeck==0.9.1
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytorch-lightning==2.0.1
+pytz==2024.1
+PyWavelets==1.6.0
+PyYAML==6.0.1
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+rerun-sdk==0.16.1
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.4.8
+safetensors==0.4.3
+scipy==1.13.1
+semantic-version==2.10.0
+sentencepiece==0.2.0
+sentry-sdk==2.5.1
+setproctitle==1.3.3
+shellingham==1.5.4
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+spaces==0.28.3
+starlette==0.37.2
+streamlit==1.35.0
+sympy==1.12.1
+tensorboardX==2.6
+termcolor==2.4.0
+timm==1.0.3
+tokenizers==0.12.1
+toml==0.10.2
+tomli==2.0.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.0.1
+torchaudio==2.0.2
+torchdata==0.6.1
+torchmetrics==1.4.0.post0
+torchvision==0.15.2
+tornado==6.4.1
+tqdm==4.66.4
+transformers==4.19.1
+triton==2.0.0
+typer==0.12.3
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+urllib3==1.26.18
+urwid==2.6.14
+urwid_readline==0.14
+uvicorn==0.30.1
+uvloop==0.19.0
+wandb==0.17.1
+watchdog==4.0.1
+watchfiles==0.22.0
+wcwidth==0.2.13
+webdataset==0.2.86
+websockets==11.0.3
+xformers==0.0.22
+yarl==1.9.4
diff --git a/scripts/template_update.py b/scripts/template_update.py
new file mode 100755
index 0000000000000000000000000000000000000000..04710e09ef8f2e83343b119bc398862e112ac040
--- /dev/null
+++ b/scripts/template_update.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+# Copied from https://github.com/rerun-io/rerun_template
+
+"""
+The script has two purposes.
+
+After using `rerun_template` as a template, run this to clean out things you don't need.
+Use `scripts/template_update.py init --languages cpp,rust,python` for this.
+
+Update an existing repository with the latest changes from the template.
+Use `scripts/template_update.py update --languages cpp,rust,python` for this.
+
+In either case, make sure the list of languages matches the languages you want to support.
+You can also use `--dry-run` to see what would happen without actually changing anything.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import shutil
+import tempfile
+
+from git import Repo  # pip install GitPython
+
+OWNER = "rerun-io"
+
+# Don't overwrite these when updating existing repository from the template
+DO_NOT_OVERWRITE = {
+    "Cargo.lock",
+    "CHANGELOG.md",
+    "main.py",
+    "pixi.lock",
+    "README.md",
+    "requirements.txt",
+}
+
+# Files required by C++, but not by _both_ Python and Rust
+CPP_FILES = {
+    ".clang-format",
+    ".github/workflows/cpp.yml",
+    "CMakeLists.txt",
+    "pixi.lock",  # Pixi is only C++ & Python - For Rust we only use cargo
+    "pixi.toml",  # Pixi is only C++ & Python - For Rust we only use cargo
+    "src/",
+    "src/main.cpp",
+}
+
+# Files required by Python, but not by _both_ C++ and Rust
+PYTHON_FILES = {
+    ".github/workflows/python.yml",
+    ".mypy.ini",
+    "main.py",
+    "pixi.lock",  # Pixi is only C++ & Python - For Rust we only use cargo
+    "pixi.toml",  # Pixi is only C++ & Python - For Rust we only use cargo
+    "pyproject.toml",
+    "requirements.txt",
+}
+
+# Files required by Rust, but not by _both_ C++ and Python
+RUST_FILES = {
+    ".github/workflows/rust.yml",
+    "bacon.toml",
+    "Cargo.lock",
+    "Cargo.toml",
+    "CHANGELOG.md",  # We only keep a changelog for Rust crates at the moment
+    "clippy.toml",
+    "Cranky.toml",
+    "deny.toml",
+    "rust-toolchain",
+    "scripts/clippy_wasm/",
+    "scripts/clippy_wasm/clippy.toml",
+    "scripts/generate_changelog.py",  # We only keep a changelog for Rust crates at the moment
+    "src/",
+    "src/lib.rs",
+    "src/main.rs",
+}
+
+# Files we used to have, but which have been removed in newer versions of rerun_template
+DEAD_FILES = ["bacon.toml", "Cranky.toml"]
+
+
+def parse_languages(lang_str: str) -> set[str]:
+    languages = lang_str.split(",") if lang_str else []
+    for lang in languages:
+        assert lang in ["cpp", "python", "rust"], f"Unsupported language: {lang}"
+    return set(languages)
+
+
+def calc_deny_set(languages: set[str]) -> set[str]:
+    """The set of files to delete/ignore."""
+    files_to_delete = CPP_FILES | PYTHON_FILES | RUST_FILES
+    if "cpp" in languages:
+        files_to_delete -= CPP_FILES
+    if "python" in languages:
+        files_to_delete -= PYTHON_FILES
+    if "rust" in languages:
+        files_to_delete -= RUST_FILES
+    return files_to_delete
+
+
+def init(languages: set[str], dry_run: bool) -> None:
+    print("Removing all language-specific files not needed for languages {languages}.")
+    files_to_delete = calc_deny_set(languages)
+    delete_files_and_folder(files_to_delete, dry_run)
+
+
+def remove_file(filepath: str):
+    try:
+        os.remove(filepath)
+    except FileNotFoundError:
+        pass
+
+
+def delete_files_and_folder(paths: set[str], dry_run: bool) -> None:
+    repo_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+    for path in paths:
+        full_path = os.path.join(repo_path, path)
+        if os.path.exists(full_path):
+            if os.path.isfile(full_path):
+                print(f"Removing file {full_path}…")
+                if not dry_run:
+                    remove_file(full_path)
+            elif os.path.isdir(full_path):
+                print(f"Removing folder {full_path}…")
+                if not dry_run:
+                    shutil.rmtree(full_path)
+
+
+def update(languages: set[str], dry_run: bool) -> None:
+    for file in DEAD_FILES:
+        print(f"Removing dead file {file}…")
+        if not dry_run:
+            remove_file(file)
+
+    files_to_ignore = calc_deny_set(languages) | DO_NOT_OVERWRITE
+    repo_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        Repo.clone_from("https://github.com/rerun-io/rerun_template.git", temp_dir)
+        for root, dirs, files in os.walk(temp_dir):
+            for file in files:
+                src_path = os.path.join(root, file)
+                rel_path = os.path.relpath(src_path, temp_dir)
+
+                if rel_path.startswith(".git/"):
+                    continue
+                if rel_path.startswith("src/"):
+                    continue
+                if rel_path in files_to_ignore:
+                    continue
+
+                dest_path = os.path.join(repo_path, rel_path)
+
+                print(f"Updating {rel_path}…")
+                if not dry_run:
+                    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+                    shutil.copy2(src_path, dest_path)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Handle the Rerun template.")
+    subparsers = parser.add_subparsers(dest="command")
+
+    init_parser = subparsers.add_parser("init", help="Initialize a new checkout of the template.")
+    init_parser.add_argument(
+        "--languages", default="", nargs="?", const="", help="The languages to support (e.g. `cpp,python,rust`)."
+    )
+    init_parser.add_argument("--dry-run", action="store_true", help="Don't actually delete any files.")
+
+    update_parser = subparsers.add_parser(
+        "update", help="Update all existing Rerun repositories with the latest changes from the template"
+    )
+    update_parser.add_argument(
+        "--languages", default="", nargs="?", const="", help="The languages to support (e.g. `cpp,python,rust`)."
+    )
+    update_parser.add_argument("--dry-run", action="store_true", help="Don't actually delete any files.")
+
+    args = parser.parse_args()
+
+    if args.command == "init":
+        init(parse_languages(args.languages), args.dry_run)
+    elif args.command == "update":
+        update(parse_languages(args.languages), args.dry_run)
+    else:
+        parser.print_help()
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/style.css b/style.css
new file mode 100644
index 0000000000000000000000000000000000000000..4606737e9d7b20e348c26c99d6f95f981e959fb6
--- /dev/null
+++ b/style.css
@@ -0,0 +1,4 @@
+/* Constrain the `gradio-app` element to at most 900px wide; the automatic
+   margins are intended to center it horizontally. */
+gradio-app {
+  max-width: 900px;
+  margin: auto;
+}
diff --git a/vista/.gitignore b/vista/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b10180d1e6fe06da6aacb4199bfb2209c942e389
--- /dev/null
+++ b/vista/.gitignore
@@ -0,0 +1,168 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+outputs/
+logs/
+
+.DS_Store
+*/.DS_Store
diff --git a/vista/LICENSE b/vista/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/vista/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/vista/__init__.py b/vista/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..59b2d7965192d67bdb4bd59617e20bc591eb9417
--- /dev/null
+++ b/vista/__init__.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import rerun.blueprint as rrb
+import torch
+from transformers.utils import hub
+
+from . import sample, sample_utils
+
+
+def create_model():
+    return sample_utils.init_model(
+        {
+            "config": "./vista/configs/inference/vista.yaml",
+            "ckpt": hub.get_file_from_repo("OpenDriveLab/Vista", "vista.safetensors"),
+        }
+    )
+
+
+def generate_blueprint(n_rounds: int) -> rrb.Blueprint:
+    row1 = rrb.Horizontal(
+        *[
+            rrb.TensorView(origin=f"diffusion_{i}", name=f"Latents Segment {i+1}")
+            for i in range(n_rounds)
+        ],
+    )
+    row2 = rrb.Spatial2DView(origin="generated_image", name="Generated Video")
+
+    return rrb.Blueprint(rrb.Vertical(row1, row2), collapse_panels=True)
+
+
+def run_sampling(
+    log_queue,
+    first_frame_file_name,
+    height,
+    width,
+    n_rounds,
+    n_frames,
+    n_steps,
+    cfg_scale,
+    cond_aug,
+    model=None,
+) -> None:
+    if model is None:
+        model = create_model()
+
+    unique_keys = set([x.input_key for x in model.conditioner.embedders])
+    value_dict = sample_utils.init_embedder_options(unique_keys)
+
+    action_dict = None
+
+    first_frame = sample.load_img(first_frame_file_name, height, width, "cuda")[None]
+    repeated_frame = first_frame.expand(n_frames, -1, -1, -1)
+
+    value_dict = sample_utils.init_embedder_options(unique_keys)
+    cond_img = first_frame
+    value_dict["cond_frames_without_noise"] = cond_img
+    value_dict["cond_aug"] = cond_aug
+    value_dict["cond_frames"] = cond_img + cond_aug * torch.randn_like(cond_img)
+    if action_dict is not None:
+        for key, value in action_dict.items():
+            value_dict[key] = value
+
+    if n_rounds > 1:
+        guider = "TrianglePredictionGuider"
+    else:
+        guider = "VanillaCFG"
+    sampler = sample_utils.init_sampling(
+        guider=guider,
+        steps=n_steps,
+        cfg_scale=cfg_scale,
+        num_frames=n_frames,
+    )
+
+    uc_keys = [
+        "cond_frames",
+        "cond_frames_without_noise",
+        "command",
+        "trajectory",
+        "speed",
+        "angle",
+        "goal",
+    ]
+
+    _generated_images, _samples_z, _inputs = sample_utils.do_sample(
+        repeated_frame,
+        model,
+        sampler,
+        value_dict,
+        num_rounds=n_rounds,
+        num_frames=n_frames,
+        force_uc_zero_embeddings=uc_keys,
+        initial_cond_indices=[0],
+        log_queue=log_queue,
+    )
+
+    log_queue.put("done")
diff --git a/vista/bin_to_st.py b/vista/bin_to_st.py
new file mode 100644
index 0000000000000000000000000000000000000000..278b132abb00054c90ee2317ba3982fd72d001de
--- /dev/null
+++ b/vista/bin_to_st.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import os
+
+import torch
+from safetensors.torch import save_file
+
+ckpt = "path_to/pytorch_model.bin"
+
+vista_bin = torch.load(ckpt, map_location="cpu")  # only contains model weights
+
+for k in list(vista_bin.keys()):  # merge LoRA weights (if exist) for inference
+    if "adapter_down" in k:
+        if "q_adapter_down" in k:
+            up_k = k.replace("q_adapter_down", "q_adapter_up")
+            pretrain_k = k.replace("q_adapter_down", "to_q")
+        elif "k_adapter_down" in k:
+            up_k = k.replace("k_adapter_down", "k_adapter_up")
+            pretrain_k = k.replace("k_adapter_down", "to_k")
+        elif "v_adapter_down" in k:
+            up_k = k.replace("v_adapter_down", "v_adapter_up")
+            pretrain_k = k.replace("v_adapter_down", "to_v")
+        else:
+            up_k = k.replace("out_adapter_down", "out_adapter_up")
+            if "model_ema" in k:
+                pretrain_k = k.replace("out_adapter_down", "to_out0")
+            else:
+                pretrain_k = k.replace("out_adapter_down", "to_out.0")
+
+        lora_weights = vista_bin[up_k] @ vista_bin[k]
+        del vista_bin[k]
+        del vista_bin[up_k]
+        vista_bin[pretrain_k] = vista_bin[pretrain_k] + lora_weights
+
+for k in list(vista_bin.keys()):  # remove the prefix
+    if "_forward_module" in k and "decay" not in k and "num_updates" not in k:
+        vista_bin[k.replace("_forward_module.", "")] = vista_bin[k]
+    del vista_bin[k]
+
+for k in list(vista_bin.keys()):  # combine EMA weights
+    if "model_ema" in k:
+        orig_k = None
+        for kk in list(vista_bin.keys()):
+            if "model_ema" not in kk and k[10:] == kk[6:].replace(".", ""):
+                orig_k = kk
+        assert orig_k is not None
+        vista_bin[orig_k] = vista_bin[k]
+        del vista_bin[k]
+        print("Replace", orig_k, "with", k)
+
+vista_st = dict()
+for k in list(vista_bin.keys()):
+    vista_st[k] = vista_bin[k]
+
+os.makedirs("ckpts", exist_ok=True)
+save_file(vista_st, "ckpts/vista.safetensors")
diff --git a/vista/configs/example/nusc_train.yaml b/vista/configs/example/nusc_train.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..077c3a289f9ef7747413617de359c0e8ceab98fa
--- /dev/null
+++ b/vista/configs/example/nusc_train.yaml
@@ -0,0 +1,292 @@
+model:
+  base_learning_rate: 5.e-5
+  target: vista.vwm.models.diffusion.DiffusionEngine
+  params:
+    use_ema: True
+    input_key: img_seq
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    en_and_decode_n_samples_a_time: 1
+    num_frames: &num_frames 25
+    slow_spatial_layers: True
+    train_peft_adapters: False
+    replace_cond_frames: &replace_cond_frames True
+    fixed_cond_frames: # only used for logging images
+      - [ 0, 1, 2 ]
+
+    denoiser_config:
+      target: vista.vwm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        num_frames: *num_frames
+
+        scaling_config:
+          target: vista.vwm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: vista.vwm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [ 3, 1, 1 ]
+        add_lora: False
+        action_control: False
+
+    conditioner_config:
+      target: vista.vwm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - input_key: cond_frames_without_noise
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+            params:
+              n_cond_frames: 1
+              n_copies: 1
+              open_clip_embedding_config:
+                target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+                params:
+                  freeze: True
+
+          - input_key: fps_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: motion_bucket_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: cond_frames
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+            params:
+              disable_encoder_autocast: True
+              n_cond_frames: 1
+              n_copies: 1
+              is_ae: True
+
+              encoder_config:
+                target: vista.vwm.models.autoencoder.AutoencoderKLModeOnly
+                params:
+                  embed_dim: 4
+                  monitor: val/rec_loss
+
+                  ddconfig:
+                    attn_type: vanilla-xformers
+                    double_z: True
+                    z_channels: 4
+                    resolution: 256
+                    in_channels: 3
+                    out_ch: 3
+                    ch: 128
+                    ch_mult: [ 1, 2, 4, 4 ]
+                    num_res_blocks: 2
+                    attn_resolutions: [ ]
+                    dropout: 0.0
+
+                  loss_config:
+                    target: torch.nn.Identity
+
+          - input_key: cond_aug
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: command
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: &action_emb_dim 128
+              num_features: 1
+              add_sequence_dim: True
+
+          - input_key: trajectory
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 8
+              add_sequence_dim: True
+
+          - input_key: speed
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: angle
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: goal
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 2
+              add_sequence_dim: True
+
+    first_stage_config:
+      target: vista.vwm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+
+        regularizer_config:
+          target: vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+        encoder_config:
+          target: vista.vwm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+
+        decoder_config:
+          target: vista.vwm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+            video_kernel_size: [ 3, 1, 1 ]
+
+    scheduler_config:
+      target: vista.vwm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 1000 ]
+        cycle_lengths: [ 10000000000000 ]
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    loss_fn_config:
+      target: vista.vwm.modules.diffusionmodules.loss.StandardDiffusionLoss
+      params:
+        use_additional_loss: True
+        offset_noise_level: 0.02
+        additional_loss_weight: 0.1
+        num_frames: *num_frames
+        replace_cond_frames: *replace_cond_frames
+        cond_frames_choices:
+          - [ ]
+          - [ 0 ]
+          - [ 0, 1 ]
+          - [ 0, 1, 2 ]
+
+        sigma_sampler_config:
+          target: vista.vwm.modules.diffusionmodules.sigma_sampling.EDMSampling
+          params:
+            p_mean: 1.0
+            p_std: 1.6
+            num_frames: *num_frames
+
+        loss_weighting_config:
+          target: vista.vwm.modules.diffusionmodules.loss_weighting.VWeighting
+
+    sampler_config:
+      target: vista.vwm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 15
+
+        discretization_config:
+          target: vista.vwm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0
+
+        guider_config:
+          target: vista.vwm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            num_frames: *num_frames
+            max_scale: 3.0
+            min_scale: 1.5
+
+data:
+  target: vista.vwm.data.dataset.Sampler
+  params:
+    batch_size: 1
+    num_workers: 16
+    subsets:
+      - NuScenes
+    probs:
+      - 1
+    samples_per_epoch: 16000
+    target_height: 576
+    target_width: 1024
+    num_frames: *num_frames
+
+lightning:
+  callbacks:
+    image_logger:
+      target: train.ImageLogger
+      params:
+        num_frames: *num_frames
+        disabled: False
+        enable_autocast: False
+        batch_frequency: 100
+        increase_log_steps: True
+        log_first_step: False
+        log_images_kwargs:
+          N: *num_frames
+
+  modelcheckpoint:
+    params:
+      every_n_epochs: 1  # every_n_train_steps: 5000, set the same as image_logger batch_frequency
+
+  trainer:
+    devices: 0,1
+    benchmark: True
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
+    max_epochs: 100
+    strategy: deepspeed_stage_2
+    gradient_clip_val: 0.3
diff --git a/vista/configs/inference/vista.yaml b/vista/configs/inference/vista.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c0e0770fac303d1d5bb8af5cab927af0aebfb8ef
--- /dev/null
+++ b/vista/configs/inference/vista.yaml
@@ -0,0 +1,184 @@
+model:
+  target: vista.vwm.models.diffusion.DiffusionEngine
+  params:
+    input_key: img_seq
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    en_and_decode_n_samples_a_time: 1
+    num_frames: &num_frames 25
+
+    denoiser_config:
+      target: vista.vwm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        num_frames: *num_frames
+
+        scaling_config:
+          target: vista.vwm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: vista.vwm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768
+        num_classes: sequential
+        use_checkpoint: False
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [ 3, 1, 1 ]
+        add_lora: False
+        action_control: True
+
+    conditioner_config:
+      target: vista.vwm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - input_key: cond_frames_without_noise
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+            params:
+              n_cond_frames: 1
+              n_copies: 1
+              open_clip_embedding_config:
+                target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+                params:
+                  freeze: True
+
+          - input_key: fps_id
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: motion_bucket_id
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: cond_frames
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+            params:
+              disable_encoder_autocast: True
+              n_cond_frames: 1
+              n_copies: 1
+              is_ae: True
+
+              encoder_config:
+                target: vista.vwm.models.autoencoder.AutoencoderKLModeOnly
+                params:
+                  embed_dim: 4
+                  monitor: val/rec_loss
+
+                  ddconfig:
+                    attn_type: vanilla-xformers
+                    double_z: True
+                    z_channels: 4
+                    resolution: 256
+                    in_channels: 3
+                    out_ch: 3
+                    ch: 128
+                    ch_mult: [ 1, 2, 4, 4 ]
+                    num_res_blocks: 2
+                    attn_resolutions: [ ]
+                    dropout: 0.0
+
+                  loss_config:
+                    target: torch.nn.Identity
+
+          - input_key: cond_aug
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: command
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: &action_emb_dim 128
+              num_features: 1
+              add_sequence_dim: True
+
+          - input_key: trajectory
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 8
+              add_sequence_dim: True
+
+          - input_key: speed
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: angle
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: goal
+            is_trainable: False
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 2
+              add_sequence_dim: True
+
+    first_stage_config:
+      target: vista.vwm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+
+        regularizer_config:
+          target: vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+        encoder_config:
+          target: vista.vwm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+
+        decoder_config:
+          target: vista.vwm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+            video_kernel_size: [ 3, 1, 1 ]
diff --git a/vista/configs/training/vista_phase1.yaml b/vista/configs/training/vista_phase1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2312dd7bf0b401014697c56f8907294e717dfe83
--- /dev/null
+++ b/vista/configs/training/vista_phase1.yaml
@@ -0,0 +1,247 @@
+model:
+  base_learning_rate: 1.e-5
+  target: vista.vwm.models.diffusion.DiffusionEngine
+  params:
+    use_ema: True
+    input_key: img_seq
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    en_and_decode_n_samples_a_time: 1
+    num_frames: &num_frames 25
+    slow_spatial_layers: True
+    train_peft_adapters: False
+    replace_cond_frames: &replace_cond_frames True
+    fixed_cond_frames: # only used for logging images
+      - [ 0, 1, 2 ]
+
+    denoiser_config:
+      target: vista.vwm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        num_frames: *num_frames
+
+        scaling_config:
+          target: vista.vwm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: vista.vwm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [ 3, 1, 1 ]
+        add_lora: False
+        action_control: False
+
+    conditioner_config:
+      target: vista.vwm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - input_key: cond_frames_without_noise
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+            params:
+              n_cond_frames: 1
+              n_copies: 1
+              open_clip_embedding_config:
+                target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+                params:
+                  freeze: True
+
+          - input_key: fps_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: motion_bucket_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: cond_frames
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+            params:
+              disable_encoder_autocast: True
+              n_cond_frames: 1
+              n_copies: 1
+              is_ae: True
+
+              encoder_config:
+                target: vista.vwm.models.autoencoder.AutoencoderKLModeOnly
+                params:
+                  embed_dim: 4
+                  monitor: val/rec_loss
+
+                  ddconfig:
+                    attn_type: vanilla-xformers
+                    double_z: True
+                    z_channels: 4
+                    resolution: 256
+                    in_channels: 3
+                    out_ch: 3
+                    ch: 128
+                    ch_mult: [ 1, 2, 4, 4 ]
+                    num_res_blocks: 2
+                    attn_resolutions: [ ]
+                    dropout: 0.0
+
+                  loss_config:
+                    target: torch.nn.Identity
+
+          - input_key: cond_aug
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+    first_stage_config:
+      target: vista.vwm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+
+        regularizer_config:
+          target: vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+        encoder_config:
+          target: vista.vwm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+
+        decoder_config:
+          target: vista.vwm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+            video_kernel_size: [ 3, 1, 1 ]
+
+    scheduler_config:
+      target: vista.vwm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 1000 ]
+        cycle_lengths: [ 10000000000000 ]
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    loss_fn_config:
+      target: vista.vwm.modules.diffusionmodules.loss.StandardDiffusionLoss
+      params:
+        use_additional_loss: True
+        offset_noise_level: 0.02
+        additional_loss_weight: 0.1
+        num_frames: *num_frames
+        replace_cond_frames: *replace_cond_frames
+        cond_frames_choices:
+          - [ ]
+          - [ 0 ]
+          - [ 0, 1 ]
+          - [ 0, 1, 2 ]
+
+        sigma_sampler_config:
+          target: vista.vwm.modules.diffusionmodules.sigma_sampling.EDMSampling
+          params:
+            p_mean: 1.0
+            p_std: 1.6
+            num_frames: *num_frames
+
+        loss_weighting_config:
+          target: vista.vwm.modules.diffusionmodules.loss_weighting.VWeighting
+
+    sampler_config:
+      target: vista.vwm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 15
+
+        discretization_config:
+          target: vista.vwm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0
+
+        guider_config:
+          target: vista.vwm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            num_frames: *num_frames
+            max_scale: 3.0
+            min_scale: 1.5
+
+data:
+  target: vista.vwm.data.dataset.Sampler
+  params:
+    batch_size: 1
+    num_workers: 16
+    subsets:
+      - YouTube
+    probs:
+      - 1
+    samples_per_epoch: 256000
+    target_height: 576
+    target_width: 1024
+    num_frames: *num_frames
+
+lightning:
+  callbacks:
+    image_logger:
+      target: train.ImageLogger
+      params:
+        num_frames: *num_frames
+        disabled: False
+        enable_autocast: False
+        batch_frequency: 1000
+        increase_log_steps: True
+        log_first_step: False
+        log_images_kwargs:
+          N: *num_frames
+
+  modelcheckpoint:
+    params:
+      every_n_epochs: 1  # every_n_train_steps: 5000, set the same as image_logger batch_frequency
+
+  trainer:
+    devices: 0,1
+    benchmark: True
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
+    max_epochs: 100
+    strategy: deepspeed_stage_2
+    gradient_clip_val: 0.3
diff --git a/vista/configs/training/vista_phase2_stage1.yaml b/vista/configs/training/vista_phase2_stage1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..771885165b260bd281718b74d24c9e7211748d27
--- /dev/null
+++ b/vista/configs/training/vista_phase2_stage1.yaml
@@ -0,0 +1,294 @@
+model:
+  base_learning_rate: 5.e-5
+  target: vista.vwm.models.diffusion.DiffusionEngine
+  params:
+    use_ema: True
+    input_key: img_seq
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    en_and_decode_n_samples_a_time: 1
+    num_frames: &num_frames 25
+    slow_spatial_layers: False
+    train_peft_adapters: True
+    replace_cond_frames: &replace_cond_frames True
+    fixed_cond_frames: # only used for logging images
+      - [ 0 ]
+
+    denoiser_config:
+      target: vista.vwm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        num_frames: *num_frames
+
+        scaling_config:
+          target: vista.vwm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: vista.vwm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [ 3, 1, 1 ]
+        add_lora: True
+        action_control: True
+
+    conditioner_config:
+      target: vista.vwm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - input_key: cond_frames_without_noise
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+            params:
+              n_cond_frames: 1
+              n_copies: 1
+              open_clip_embedding_config:
+                target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+                params:
+                  freeze: True
+
+          - input_key: fps_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: motion_bucket_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: cond_frames
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+            params:
+              disable_encoder_autocast: True
+              n_cond_frames: 1
+              n_copies: 1
+              is_ae: True
+
+              encoder_config:
+                target: vista.vwm.models.autoencoder.AutoencoderKLModeOnly
+                params:
+                  embed_dim: 4
+                  monitor: val/rec_loss
+
+                  ddconfig:
+                    attn_type: vanilla-xformers
+                    double_z: True
+                    z_channels: 4
+                    resolution: 256
+                    in_channels: 3
+                    out_ch: 3
+                    ch: 128
+                    ch_mult: [ 1, 2, 4, 4 ]
+                    num_res_blocks: 2
+                    attn_resolutions: [ ]
+                    dropout: 0.0
+
+                  loss_config:
+                    target: torch.nn.Identity
+
+          - input_key: cond_aug
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: command
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: &action_emb_dim 128
+              num_features: 1
+              add_sequence_dim: True
+
+          - input_key: trajectory
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 8
+              add_sequence_dim: True
+
+          - input_key: speed
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: angle
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: goal
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 2
+              add_sequence_dim: True
+
+    first_stage_config:
+      target: vista.vwm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+
+        regularizer_config:
+          target: vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+        encoder_config:
+          target: vista.vwm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+
+        decoder_config:
+          target: vista.vwm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+            video_kernel_size: [ 3, 1, 1 ]
+
+    scheduler_config:
+      target: vista.vwm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 1000 ]
+        cycle_lengths: [ 10000000000000 ]
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    loss_fn_config:
+      target: vista.vwm.modules.diffusionmodules.loss.StandardDiffusionLoss
+      params:
+        use_additional_loss: True
+        offset_noise_level: 0.02
+        additional_loss_weight: 0.1
+        num_frames: *num_frames
+        replace_cond_frames: *replace_cond_frames
+        cond_frames_choices:
+          - [ ]
+          - [ 0 ]
+          - [ 0, 1 ]
+          - [ 0, 1, 2 ]
+
+        sigma_sampler_config:
+          target: vista.vwm.modules.diffusionmodules.sigma_sampling.EDMSampling
+          params:
+            p_mean: 1.0
+            p_std: 1.6
+            num_frames: *num_frames
+
+        loss_weighting_config:
+          target: vista.vwm.modules.diffusionmodules.loss_weighting.VWeighting
+
+    sampler_config:
+      target: vista.vwm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 15
+
+        discretization_config:
+          target: vista.vwm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0
+
+        guider_config:
+          target: vista.vwm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            num_frames: *num_frames
+            max_scale: 3.0
+            min_scale: 1.5
+
+data:
+  target: vista.vwm.data.dataset.Sampler
+  params:
+    batch_size: 1
+    num_workers: 16
+    subsets:
+      - YouTube
+      - NuScenes
+    probs:
+      - 1
+      - 1
+    samples_per_epoch: 256000
+    target_height: 320
+    target_width: 576
+    num_frames: *num_frames
+
+lightning:
+  callbacks:
+    image_logger:
+      target: train.ImageLogger
+      params:
+        num_frames: *num_frames
+        disabled: False
+        enable_autocast: False
+        batch_frequency: 1000
+        increase_log_steps: True
+        log_first_step: False
+        log_images_kwargs:
+          N: *num_frames
+
+  modelcheckpoint:
+    params:
+      every_n_epochs: 1  # every_n_train_steps: 5000, set the same as image_logger batch_frequency
+
+  trainer:
+    devices: 0,1
+    benchmark: True
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
+    max_epochs: 100
+    strategy: deepspeed_stage_2
+    gradient_clip_val: 0.3
diff --git a/vista/configs/training/vista_phase2_stage2.yaml b/vista/configs/training/vista_phase2_stage2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bdbe2c6946f12512c13bcc0148de1d137ea0866b
--- /dev/null
+++ b/vista/configs/training/vista_phase2_stage2.yaml
@@ -0,0 +1,294 @@
+model:
+  base_learning_rate: 5.e-5
+  target: vista.vwm.models.diffusion.DiffusionEngine
+  params:
+    use_ema: True
+    input_key: img_seq
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    en_and_decode_n_samples_a_time: 1
+    num_frames: &num_frames 25
+    slow_spatial_layers: False
+    train_peft_adapters: True
+    replace_cond_frames: &replace_cond_frames True
+    fixed_cond_frames: # only used for logging images
+      - [ 0 ]
+
+    denoiser_config:
+      target: vista.vwm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        num_frames: *num_frames
+
+        scaling_config:
+          target: vista.vwm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    network_config:
+      target: vista.vwm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 768
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [ 3, 1, 1 ]
+        add_lora: True
+        action_control: True
+
+    conditioner_config:
+      target: vista.vwm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - input_key: cond_frames_without_noise
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+            params:
+              n_cond_frames: 1
+              n_copies: 1
+              open_clip_embedding_config:
+                target: vista.vwm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+                params:
+                  freeze: True
+
+          - input_key: fps_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: motion_bucket_id
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: cond_frames
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+            params:
+              disable_encoder_autocast: True
+              n_cond_frames: 1
+              n_copies: 1
+              is_ae: True
+
+              encoder_config:
+                target: vista.vwm.models.autoencoder.AutoencoderKLModeOnly
+                params:
+                  embed_dim: 4
+                  monitor: val/rec_loss
+
+                  ddconfig:
+                    attn_type: vanilla-xformers
+                    double_z: True
+                    z_channels: 4
+                    resolution: 256
+                    in_channels: 3
+                    out_ch: 3
+                    ch: 128
+                    ch_mult: [ 1, 2, 4, 4 ]
+                    num_res_blocks: 2
+                    attn_resolutions: [ ]
+                    dropout: 0.0
+
+                  loss_config:
+                    target: torch.nn.Identity
+
+          - input_key: cond_aug
+            is_trainable: False
+            ucg_rate: 0.0
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - input_key: command
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: &action_emb_dim 128
+              num_features: 1
+              add_sequence_dim: True
+
+          - input_key: trajectory
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 8
+              add_sequence_dim: True
+
+          - input_key: speed
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: angle
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 4
+              add_sequence_dim: True
+
+          - input_key: goal
+            is_trainable: False
+            ucg_rate: 0.15
+            target: vista.vwm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: *action_emb_dim
+              num_features: 2
+              add_sequence_dim: True
+
+    first_stage_config:
+      target: vista.vwm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+
+        regularizer_config:
+          target: vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+        encoder_config:
+          target: vista.vwm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+
+        decoder_config:
+          target: vista.vwm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [ 1, 2, 4, 4 ]
+            num_res_blocks: 2
+            attn_resolutions: [ ]
+            dropout: 0.0
+            video_kernel_size: [ 3, 1, 1 ]
+
+    scheduler_config:
+      target: vista.vwm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 1000 ]
+        cycle_lengths: [ 10000000000000 ]
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    loss_fn_config:
+      target: vista.vwm.modules.diffusionmodules.loss.StandardDiffusionLoss
+      params:
+        use_additional_loss: True
+        offset_noise_level: 0.02
+        additional_loss_weight: 0.1
+        num_frames: *num_frames
+        replace_cond_frames: *replace_cond_frames
+        cond_frames_choices:
+          - [ ]
+          - [ 0 ]
+          - [ 0, 1 ]
+          - [ 0, 1, 2 ]
+
+        sigma_sampler_config:
+          target: vista.vwm.modules.diffusionmodules.sigma_sampling.EDMSampling
+          params:
+            p_mean: 1.0
+            p_std: 1.6
+            num_frames: *num_frames
+
+        loss_weighting_config:
+          target: vista.vwm.modules.diffusionmodules.loss_weighting.VWeighting
+
+    sampler_config:
+      target: vista.vwm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 15
+
+        discretization_config:
+          target: vista.vwm.modules.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0
+
+        guider_config:
+          target: vista.vwm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            num_frames: *num_frames
+            max_scale: 3.0
+            min_scale: 1.5
+
+data:
+  target: vista.vwm.data.dataset.Sampler
+  params:
+    batch_size: 1
+    num_workers: 16
+    subsets:
+      - YouTube
+      - NuScenes
+    probs:
+      - 1
+      - 1
+    samples_per_epoch: 256000
+    target_height: 576
+    target_width: 1024
+    num_frames: *num_frames
+
+lightning:
+  callbacks:
+    image_logger:
+      target: train.ImageLogger
+      params:
+        num_frames: *num_frames
+        disabled: False
+        enable_autocast: False
+        batch_frequency: 1000
+        increase_log_steps: True
+        log_first_step: False
+        log_images_kwargs:
+          N: *num_frames
+
+  modelcheckpoint:
+    params:
+      every_n_epochs: 1  # every_n_train_steps: 5000, set the same as image_logger batch_frequency
+
+  trainer:
+    devices: 0,1
+    benchmark: True
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
+    max_epochs: 100
+    strategy: deepspeed_stage_2
+    gradient_clip_val: 0.3
diff --git a/vista/docs/INSTALL.md b/vista/docs/INSTALL.md
new file mode 100644
index 0000000000000000000000000000000000000000..6059d42aad6c117d1f82d3cc0ce8b891495bf052
--- /dev/null
+++ b/vista/docs/INSTALL.md
@@ -0,0 +1,48 @@
+## Installation
+
+- ### Requirement
+
+  Our experiments are conducted with **PyTorch 2.0.1**, **CUDA 11.7**, **Ubuntu 22.04**, and **NVIDIA Tesla A100** (80 GB). For other requirements, please check [TRAINING.md](https://github.com/OpenDriveLab/Vista/blob/main/docs/TRAINING.md) and [SAMPLING.md](https://github.com/OpenDriveLab/Vista/blob/main/docs/SAMPLING.md).
+
+- ### Preparation
+
+  Clone the repository to your local directory.
+
+  ```shell
+  git clone https://github.com/OpenDriveLab/Vista.git
+  ```
+
+  We provide an example on the nuScenes dataset for training and sampling. Before you start, make sure you have:
+  
+  - Downloaded the translated action annotations from [here](https://drive.google.com/drive/folders/1JpZObdR0OXagCbnPZfMSI8vhGLom5pht?usp=sharing) and put the JSON files into `annos`.
+    
+  - Downloaded all splits of **Trainval** in **Full dataset (v1.0)** to your device following [official instructions](https://www.nuscenes.org/download). After downloading, it should contain:
+
+    ```
+    $
+    ├── samples
+    ├── sweeps
+    ├── ...
+    └── v1.0-trainval
+    ```
+
+- ### Installation
+
+  - We use conda to manage the environment.
+
+    ```shell
+    conda create -n vista python=3.9 -y
+    conda activate vista
+    ```
+  
+  - Install dependencies.
+  
+    ```shell
+    conda install -y pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
+    pip3 install -r requirements.txt
+    pip3 install -e git+https://github.com/Stability-AI/datapipelines.git@main#egg=sdata
+    ```
+
+---
+
+=> Next: [[Training](https://github.com/OpenDriveLab/Vista/blob/main/docs/TRAINING.md)]
diff --git a/vista/docs/ISSUES.md b/vista/docs/ISSUES.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfc64fbd51510fbdc13a64429884f2c0803a1371
--- /dev/null
+++ b/vista/docs/ISSUES.md
@@ -0,0 +1,36 @@
+## Troubleshooting
+
+1. #### Out of memory during sampling.
+
+   - Possible reason:
+     - Too many high-resolution frames for parallel decoding. The default setting requests about 66 GB of peak VRAM.
+
+   - Try this:
+     - Reduce the number of jointly decoded frames *en_and_decode_n_samples_a_time* in `inference/vista.yaml`.
+
+2. #### Getting stuck while loading FrozenCLIPEmbedder or FrozenOpenCLIPImageEmbedder.
+
+   - Possible reason:
+     - A network failure.
+   
+   - Try this:
+     1. Download [openai/clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14/tree/main) and [laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/tree/main) in advance.
+     2. Set *version* of FrozenCLIPEmbedder and FrozenOpenCLIPImageEmbedder in `vwm/modules/encoders/modules.py` to the new paths of `pytorch_model.bin`.
+
+3. #### Datasets not yet available during training.
+
+   - Possible reason:
+
+     - The installed [sdata](https://github.com/Stability-AI/datapipelines) is not detected.
+
+   - Try this:
+
+     - Reinstall in the current project directory.
+
+       ````shell
+       pip3 install -e git+https://github.com/Stability-AI/datapipelines.git@main#egg=sdata
+       ````
+
+---
+
+<= Previous: [[Sampling](https://github.com/OpenDriveLab/Vista/blob/main/docs/SAMPLING.md)]
diff --git a/vista/docs/SAMPLING.md b/vista/docs/SAMPLING.md
new file mode 100644
index 0000000000000000000000000000000000000000..59fe474701fbd5038bc6eea77e4e7e213c540688
--- /dev/null
+++ b/vista/docs/SAMPLING.md
@@ -0,0 +1,60 @@
+## Sampling
+
+- ### Requirement
+
+  Currently, we suggest using Nvidia GPUs with a minimum of **32 GB** VRAM for sampling. Check [ISSUES.md](https://github.com/OpenDriveLab/Vista/blob/main/docs/ISSUES.md) if you do not have enough memory.
+
+- ### Preparation
+
+  Make sure you have downloaded `vista.safetensors` from [Hugging Face](https://huggingface.co/OpenDriveLab/Vista/blob/main/vista.safetensors) or [Google Drive](https://drive.google.com/file/d/1bCM7XLDquRqnnpauQAK5j1jP-n0y1ama/view). Move (or link) the checkpoint into `ckpts`.
+
+- ### Future Prediction
+
+  - We provide a sampling example for nuScenes. Make sure to prepare the dataset as described in [INSTALL.md](https://github.com/OpenDriveLab/Vista/blob/main/docs/INSTALL.md) and set the correct *data_root* in `sample.py`.
+
+    - Short-term action-free prediction.
+
+      ```shell
+      python sample.py
+      ```
+
+    - Long-term rollout.
+    
+      ```shell
+      python sample.py --n_rounds 6
+      ```
+    
+    - Action-conditioned simulation (take trajectory as an example).
+    
+      ```shell
+      python sample.py --action traj
+      ```
+      
+    > Make sure the loaded checkpoint strictly matches all parameters. Otherwise, you may get a sequence of blurry frames.
+
+  - Important arguments:
+
+    - `--dataset`: You can also customize the scenes by providing other driving views within a folder of images. They will serve as the initial frames for prediction when you set `--dataset` to "IMG".
+    - `--action`: The mode of control inputs. By default, we perform action-free prediction. You can try different actions using "traj", "cmd", "steer", or "goal". It will import ground truth actions (if available), but you can enforce any actions by making slight modifications.
+    - `--n_rounds`: The number of sampling rounds, which determines the duration to predict. You can increase it to perform long-horizon rollout. Each additional round extends the prediction by 2.3 seconds.
+    - `--n_steps`: The number of DDIM sampling steps, which can be reduced for efficiency.
+    - `--rand_gen`: Whether to generate samples randomly selected from the whole dataset or go through all samples one by one. 
+    - `--low_vram`: Enable the low VRAM mode if you are using a GPU with less than 80 GB VRAM.
+
+- ### Reward Estimation
+
+  - We provide a simplified example to estimate the rewards on nuScenes. Make sure to set the correct *data_root* in `reward.py`.
+
+    ```shell
+    python reward.py
+    ```
+
+  - Important arguments:
+  
+    - `--ens_size`: The number of samples to generate per case (initial frame and action condition).
+
+---
+
+<= Previous: [[Training](https://github.com/OpenDriveLab/Vista/blob/main/docs/TRAINING.md)]
+
+=> Next: [[Troubleshooting](https://github.com/OpenDriveLab/Vista/blob/main/docs/ISSUES.md)]
diff --git a/vista/docs/TRAINING.md b/vista/docs/TRAINING.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3ac0e44eb9138d84dfab3da14dbcc4ee856c3a2
--- /dev/null
+++ b/vista/docs/TRAINING.md
@@ -0,0 +1,113 @@
+## Training
+
+- ### Requirement
+
+  Nvidia GPUs with **80 GB** VRAM are required for training, but you can train low-resolution variants on smaller GPUs.
+
+- ### Preparation
+
+  Download the pretrained `svd_xt.safetensors` from [Hugging Face](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/svd_xt.safetensors) and place the checkpoint into `ckpts`.
+
+- ### Training (example)
+
+  - We take the **nuScenes** dataset as an example for training. After finishing the setup in [INSTALL.md](https://github.com/OpenDriveLab/Vista/blob/main/docs/INSTALL.md), remember to edit *data_root* in `vwm/data/subsets/nuscenes.py` to the proper path of nuScenes.
+
+  - We use DeepSpeed ZeRO stage 2 to improve data parallelism and lower the memory footprint during training. The training can be launched as follows:
+
+    - Distributed training (suppose you train with 2 nodes, and each node has 8 GPUs).
+
+      ```shell
+      torchrun \
+          --nnodes=2 \
+          --nproc_per_node=8 \
+          train.py \
+          --base configs/example/nusc_train.yaml \
+          --num_nodes 2 \
+          --n_devices 8
+      ```
+
+    - Single GPU debugging (too slow, not recommended for training).
+
+      ```shell
+      python train.py --num_nodes 1 --n_devices 1
+      ```
+
+    > The training logs, including visualization samples and model checkpoints, will be saved in the project directory by default. Given that the size of checkpoints could be very large, you can set another directory to save these logs by providing an available path to `--logdir`.
+    >
+    > You can disable `--no_test` to test a bunch of samples for every checkpoint, but we recommend evaluating them offline for flexible comparison and uninterrupted training.
+
+  - After training, switch to the log directory with the model checkpoint. You should find a Python script named `zero_to_fp32.py` and a `checkpoint` folder that contains all partitioned checkpoints. The final checkpoint can be obtained by:
+
+    1. [*if you only want to resume training*] Merge the partitioned checkpoints as `pytorch_model.bin` using `zero_to_fp32.py`.
+    
+       ```shell
+       python zero_to_fp32.py . pytorch_model.bin
+       ```
+    
+    2. [*if you also want to do inference*] Navigate into the project root, and use `bin_to_st.py` to convert the resulting `path_to/pytorch_model.bin` to `ckpts/vista.safetensors`.
+
+- ### Training of Vista
+
+  - Download the **OpenDV-YouTube** dataset (or a part of it) from [DriveAGI](https://github.com/OpenDriveLab/DriveAGI#genad-dataset-opendv-youtube). You can refer to the structure in `vwm/data/subsets/youtube.py` to organize the dataset.
+  
+  - #### Phase 1: learning high-fidelity future prediction
+  
+    - This phase uses unlabeled OpenDV-YouTube for training.
+  
+    - The model is trained at a resolution of 576x1024 on 128 GPUs for 20K iterations with gradient accumulation.
+  
+      ```shell
+      torchrun \
+          --nnodes=16 \
+          --nproc_per_node=8 \
+          train.py \
+          --base configs/training/vista_phase1.yaml \
+          --num_nodes 16 \
+          --n_devices 8
+      ```
+  
+    - We pause the training after the effect of dynamics priors can be witnessed. The last checkpoint is merged for the training of the next phase.
+  
+  - #### Phase 2: learning versatile action controllability
+  
+    - This phase uses OpenDV-YouTube and nuScenes for collaborative training.
+  
+    - ##### Stage 1: low-resolution training
+  
+      - The model is finetuned at a resolution of 320x576 on 8 GPUs for 120K iterations.
+  
+        ```shell
+        torchrun \
+            --nnodes=1 \
+            --nproc_per_node=8 \
+            train.py \
+            --base configs/training/vista_phase2_stage1.yaml \
+            --finetune ${PATH_TO_PHASE1_CKPT}/pytorch_model.bin \
+            --num_nodes 1 \
+            --n_devices 8
+        ```
+  
+      - We pause the training after the controllability can be clearly witnessed. The last checkpoint is merged for the training of the next stage.
+  
+    - ##### Stage 2: high-resolution training
+  
+      - The model is finetuned at a resolution of 576x1024 on 8 GPUs for another 10K iterations.
+  
+        ```shell
+        torchrun \
+            --nnodes=1 \
+            --nproc_per_node=8 \
+            train.py \
+            --base configs/training/vista_phase2_stage2.yaml \
+            --finetune ${PATH_TO_STAGE1_CKPT}/pytorch_model.bin \
+            --num_nodes 1 \
+            --n_devices 8
+        ```
+  
+      - We pause the training after the model adapts to the desired resolution. The last checkpoint is merged for application.
+
+---
+
+<= Previous: [[Installation](https://github.com/OpenDriveLab/Vista/blob/main/docs/INSTALL.md)]
+
+=> Next: [[Sampling](https://github.com/OpenDriveLab/Vista/blob/main/docs/SAMPLING.md)]
diff --git a/vista/reward.py b/vista/reward.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e800f2b2d796a5b0c15350cc949265822916cd
--- /dev/null
+++ b/vista/reward.py
@@ -0,0 +1,266 @@
+from __future__ import annotations
+
+import argparse
+import json
+import random
+
+from pytorch_lightning import seed_everything
+from reward_utils import *
+from torchvision import transforms
+
+VERSION2SPECS = {
+    "vwm": {
+        "config": "configs/inference/vista.yaml",
+        "ckpt": "ckpts/vista.safetensors"
+    }
+}
+
+DATASET2SOURCES = {
+    "NUSCENES": {
+        "data_root": "data/nuscenes",
+        "anno_file": "annos/nuScenes_val.json"
+    },
+    "IMG": {
+        "data_root": "image_folder"
+    }
+}
+
+
+def parse_args(**parser_kwargs):
+    parser = argparse.ArgumentParser(**parser_kwargs)
+    parser.add_argument(
+        "--version",
+        type=str,
+        default="vwm",
+        help="model version"
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="NUSCENES",
+        help="dataset name"
+    )
+    parser.add_argument(
+        "--save",
+        type=str,
+        default="outputs",
+        help="directory to save samples"
+    )
+    parser.add_argument(
+        "--action",
+        type=str,
+        default="traj",
+        help="action mode for control, such as traj, cmd, steer, goal"
+    )
+    parser.add_argument(
+        "--n_frames",
+        type=int,
+        default=25,
+        help="number of frames for each round"
+    )
+    parser.add_argument(
+        "--n_conds",
+        type=int,
+        default=1,
+        help="number of initial condition frames for the first round"
+    )
+    parser.add_argument(
+        "--ens_size",
+        type=int,
+        default=5,
+        help="number of samples per case"
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=23,
+        help="random seed for seed_everything"
+    )
+    parser.add_argument(
+        "--height",
+        type=int,
+        default=576,
+        help="target height of the generated video"
+    )
+    parser.add_argument(
+        "--width",
+        type=int,
+        default=1024,
+        help="target width of the generated video"
+    )
+    parser.add_argument(
+        "--cfg_scale",
+        type=float,
+        default=2.5,
+        help="scale of the classifier-free guidance"
+    )
+    parser.add_argument(
+        "--cond_aug",
+        type=float,
+        default=0.0,
+        help="strength of the noise augmentation"
+    )
+    parser.add_argument(
+        "--n_steps",
+        type=int,
+        default=10,
+        help="number of sampling steps"
+    )
+    parser.add_argument(
+        "--rand_gen",
+        action="store_false",
+        help="whether to generate samples randomly or sequentially"
+    )
+    parser.add_argument(
+        "--low_vram",
+        action="store_true",
+        help="whether to save memory or not"
+    )
+    return parser
+
+
+def get_sample(selected_index=0, dataset_name="NUSCENES", num_frames=25, action_mode="free"):
+    dataset_dict = DATASET2SOURCES[dataset_name]
+    action_dict = None
+    if dataset_name == "IMG":
+        image_list = os.listdir(dataset_dict["data_root"])
+        total_length = len(image_list)
+        while selected_index >= total_length:
+            selected_index -= total_length
+        image_file = image_list[selected_index]
+
+        path_list = [os.path.join(dataset_dict["data_root"], image_file)] * num_frames
+    else:
+        with open(dataset_dict["anno_file"]) as anno_json:
+            all_samples = json.load(anno_json)
+        total_length = len(all_samples)
+        while selected_index >= total_length:
+            selected_index -= total_length
+        sample_dict = all_samples[selected_index]
+
+        path_list = list()
+        if dataset_name == "NUSCENES":
+            for index in range(num_frames):
+                image_path = os.path.join(dataset_dict["data_root"], sample_dict["frames"][index])
+                assert os.path.exists(image_path), image_path
+                path_list.append(image_path)
+            action_dict = dict()
+            if action_mode == "traj" or action_mode == "trajectory":
+                action_dict["trajectory"] = torch.tensor(sample_dict["traj"][2:])
+            elif action_mode == "cmd" or action_mode == "command":
+                action_dict["command"] = torch.tensor(sample_dict["cmd"])
+            elif action_mode == "steer":
+                # scene might be empty
+                if sample_dict["speed"]:
+                    action_dict["speed"] = torch.tensor(sample_dict["speed"][1:])
+                # scene might be empty
+                if sample_dict["angle"]:
+                    action_dict["angle"] = torch.tensor(sample_dict["angle"][1:]) / 780
+            elif action_mode == "goal":
+                # point might be invalid
+                if sample_dict["z"] > 0 and 0 < sample_dict["goal"][0] < 1600 and 0 < sample_dict["goal"][1] < 900:
+                    action_dict["goal"] = torch.tensor([
+                        sample_dict["goal"][0] / 1600,
+                        sample_dict["goal"][1] / 900
+                    ])
+            else:
+                raise ValueError(f"Unsupported action mode {action_mode}")
+        else:
+            raise ValueError(f"Invalid dataset {dataset_name}")
+    return path_list, selected_index, total_length, action_dict
+
+
+def load_img(file_name, target_height=320, target_width=576, device="cuda"):
+    if file_name is not None:
+        image = Image.open(file_name)
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+    else:
+        raise ValueError(f"Invalid image file {file_name}")
+    ori_w, ori_h = image.size
+    # print(f"Loaded input image of size ({ori_w}, {ori_h})")
+
+    if ori_w / ori_h > target_width / target_height:
+        tmp_w = int(target_width / target_height * ori_h)
+        left = (ori_w - tmp_w) // 2
+        right = (ori_w + tmp_w) // 2
+        image = image.crop((left, 0, right, ori_h))
+    elif ori_w / ori_h < target_width / target_height:
+        tmp_h = int(target_height / target_width * ori_w)
+        top = (ori_h - tmp_h) // 2
+        bottom = (ori_h + tmp_h) // 2
+        image = image.crop((0, top, ori_w, bottom))
+    image = image.resize((target_width, target_height), resample=Image.LANCZOS)
+    if not image.mode == "RGB":
+        image = image.convert("RGB")
+    image = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x * 2.0 - 1.0)
+    ])(image)
+    return image.to(device)
+
+
+if __name__ == "__main__":
+    parser = parse_args()
+    opt, unknown = parser.parse_known_args()
+
+    set_lowvram_mode(opt.low_vram)
+    version_dict = VERSION2SPECS[opt.version]
+    model = init_model(version_dict)
+    unique_keys = set([x.input_key for x in model.conditioner.embedders])
+
+    sample_index = 0
+    while sample_index >= 0:
+        seed_everything(opt.seed)
+
+        frame_list, sample_index, dataset_length, action_dict = get_sample(sample_index,
+                                                                           opt.dataset,
+                                                                           opt.n_frames,
+                                                                           opt.action)
+
+        img_seq = list()
+        for each_path in frame_list:
+            img = load_img(each_path, opt.height, opt.width)
+            img_seq.append(img)
+        images = torch.stack(img_seq)
+
+        value_dict = init_embedder_options(unique_keys)
+        cond_img = img_seq[0][None]
+        value_dict["cond_frames_without_noise"] = cond_img
+        value_dict["cond_aug"] = opt.cond_aug
+        value_dict["cond_frames"] = cond_img + opt.cond_aug * torch.randn_like(cond_img)
+        if action_dict is not None:
+            for key, value in action_dict.items():
+                value_dict[key] = value
+
+        guider = "VanillaCFG"
+        sampler = init_sampling(guider=guider, steps=opt.n_steps, cfg_scale=opt.cfg_scale, num_frames=opt.n_frames)
+
+        uc_keys = ["cond_frames", "cond_frames_without_noise", "command", "trajectory", "speed", "angle", "goal"]
+
+        out = do_sample(
+            images,
+            model,
+            sampler,
+            value_dict,
+            num_frames=opt.n_frames,
+            ensemble_size=opt.ens_size,
+            force_uc_zero_embeddings=uc_keys,
+            initial_cond_indices=[index for index in range(opt.n_conds)]
+        )
+
+        if isinstance(out, (tuple, list)):
+            inputs, reward = out
+            real_path = os.path.join(opt.save, "real")
+            perform_save_locally(real_path, inputs, "videos", opt.dataset, sample_index)
+            perform_save_locally(real_path, inputs, "grids", opt.dataset, sample_index)
+            perform_save_locally(real_path, inputs, "images", opt.dataset, sample_index)
+        else:
+            raise TypeError
+
+        if opt.rand_gen:
+            sample_index += random.randint(1, dataset_length - 1)
+        else:
+            sample_index += 1
+            if dataset_length <= sample_index:
+                sample_index = -1
diff --git a/vista/reward_utils.py b/vista/reward_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..110f6ce2f4039032ff95f11529af5cb928573ca4
--- /dev/null
+++ b/vista/reward_utils.py
@@ -0,0 +1,342 @@
+from __future__ import annotations
+
+import math
+import os
+from typing import Optional, Union
+
+import numpy as np
+import torch
+import torchvision
+from einops import rearrange, repeat
+from omegaconf import ListConfig, OmegaConf
+from PIL import Image
+from safetensors.torch import load_file as load_safetensors
+from torch import autocast
+from train import save_img_seq_to_video
+from vwm.modules.diffusionmodules.sampling import EulerEDMSampler
+from vwm.util import default, instantiate_from_config
+
+
+def init_model(version_dict, load_ckpt=True):
+    config = OmegaConf.load(version_dict["config"])
+    model = load_model_from_config(config, version_dict["ckpt"] if load_ckpt else None)
+    return model
+
+
+# Module-wide switch read by initial_model_load and unload_model. It starts
+# enabled and stays so until set_lowvram_mode is called with another value.
+lowvram_mode = True
+
+
+def set_lowvram_mode(mode):
+    """Set the module-wide low-VRAM switch to ``mode``."""
+    global lowvram_mode
+    lowvram_mode = mode
+
+
+def initial_model_load(model):
+    global lowvram_mode
+    if lowvram_mode:
+        model.model.half()
+    else:
+        model.cuda()
+    return model
+
+
+def load_model(model):
+    """Move ``model`` to CUDA, regardless of the low-VRAM switch."""
+    model.cuda()
+
+
+def unload_model(model):
+    global lowvram_mode
+    if lowvram_mode:
+        model.cpu()
+        torch.cuda.empty_cache()
+
+
+def load_model_from_config(config, ckpt=None):
+    model = instantiate_from_config(config.model)
+
+    if ckpt is not None:
+        print(f"Loading model from {ckpt}")
+        if ckpt.endswith("ckpt"):
+            pl_svd = torch.load(ckpt, map_location="cpu")
+            # dict contains:
+            # "epoch", "global_step", "pytorch-lightning_version",
+            # "state_dict", "loops", "callbacks", "optimizer_states", "lr_schedulers"
+            if "global_step" in pl_svd:
+                print(f"Global step: {pl_svd['global_step']}")
+            svd = pl_svd["state_dict"]
+        elif ckpt.endswith("safetensors"):
+            svd = load_safetensors(ckpt)
+        else:
+            raise NotImplementedError("Please convert the checkpoint to safetensors first")
+
+        missing, unexpected = model.load_state_dict(svd, strict=False)
+        if len(missing) > 0:
+            print(f"Missing keys: {missing}")
+        if len(unexpected) > 0:
+            print(f"Unexpected keys: {unexpected}")
+
+    model = initial_model_load(model)
+    model.eval()
+    return model
+
+
+def init_embedder_options(keys):
+    # hardcoded demo settings, might undergo some changes in the future
+    value_dict = dict()
+    for key in keys:
+        if key in ["fps_id", "fps"]:
+            fps = 10
+            value_dict["fps"] = fps
+            value_dict["fps_id"] = fps - 1
+        elif key == "motion_bucket_id":
+            value_dict["motion_bucket_id"] = 127  # [0, 511]
+    return value_dict
+
+
+def perform_save_locally(save_path, samples, mode, dataset_name, sample_index):
+    assert mode in ["images", "grids", "videos"]
+    merged_path = os.path.join(save_path, mode)
+    os.makedirs(merged_path, exist_ok=True)
+    samples = samples.cpu()
+
+    if mode == "images":
+        frame_count = 0
+        for sample in samples:
+            sample = rearrange(sample.numpy(), "c h w -> h w c")
+            if "real" in save_path:
+                sample = 255.0 * (sample + 1.0) / 2.0
+            else:
+                sample = 255.0 * sample
+            image_save_path = os.path.join(merged_path, f"{dataset_name}_{sample_index:06}_{frame_count:04}.png")
+            # if os.path.exists(image_save_path):
+            #     return
+            Image.fromarray(sample.astype(np.uint8)).save(image_save_path)
+            frame_count += 1
+    elif mode == "grids":
+        grid = torchvision.utils.make_grid(samples, nrow=int(samples.shape[0] ** 0.5))
+        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1).numpy()
+        if "real" in save_path:
+            grid = 255.0 * (grid + 1.0) / 2.0
+        else:
+            grid = 255.0 * grid
+        grid_save_path = os.path.join(merged_path, f"{dataset_name}_{sample_index:06}.png")
+        # if os.path.exists(grid_save_path):
+        #     return
+        Image.fromarray(grid.astype(np.uint8)).save(grid_save_path)
+    elif mode == "videos":
+        img_seq = rearrange(samples.numpy(), "t c h w -> t h w c")
+        if "real" in save_path:
+            img_seq = 255.0 * (img_seq + 1.0) / 2.0
+        else:
+            img_seq = 255.0 * img_seq
+        video_save_path = os.path.join(merged_path, f"{dataset_name}_{sample_index:06}.mp4")
+        # if os.path.exists(video_save_path):
+        #     return
+        save_img_seq_to_video(video_save_path, img_seq.astype(np.uint8), 10)
+    else:
+        raise NotImplementedError
+
+
+def init_sampling(sampler="EulerEDMSampler", guider="VanillaCFG", discretization="EDMDiscretization",
+                  steps=50, cfg_scale=2.5, num_frames=25):
+    discretization_config = get_discretization(discretization)
+    guider_config = get_guider(guider, cfg_scale, num_frames)
+    sampler = get_sampler(sampler, steps, discretization_config, guider_config)
+    return sampler
+
+
+def get_discretization(discretization):
+    if discretization == "LegacyDDPMDiscretization":
+        discretization_config = {
+            "target": "vista.vwm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization"
+        }
+    elif discretization == "EDMDiscretization":
+        discretization_config = {
+            "target": "vista.vwm.modules.diffusionmodules.discretizer.EDMDiscretization",
+            "params": {
+                "sigma_min": 0.002,
+                "sigma_max": 700.0,
+                "rho": 7.0
+            }
+        }
+    else:
+        raise NotImplementedError
+    return discretization_config
+
+
+def get_guider(guider="LinearPredictionGuider", cfg_scale=2.5, num_frames=25):
+    if guider == "IdentityGuider":
+        guider_config = {
+            "target": "vista.vwm.modules.diffusionmodules.guiders.IdentityGuider"
+        }
+    elif guider == "VanillaCFG":
+        scale = cfg_scale
+
+        guider_config = {
+            "target": "vista.vwm.modules.diffusionmodules.guiders.VanillaCFG",
+            "params": {
+                "scale": scale
+            }
+        }
+    elif guider == "LinearPredictionGuider":
+        max_scale = cfg_scale
+        min_scale = 1.0
+
+        guider_config = {
+            "target": "vista.vwm.modules.diffusionmodules.guiders.LinearPredictionGuider",
+            "params": {
+                "max_scale": max_scale,
+                "min_scale": min_scale,
+                "num_frames": num_frames
+            }
+        }
+    elif guider == "TrianglePredictionGuider":
+        max_scale = cfg_scale
+        min_scale = 1.0
+
+        guider_config = {
+            "target": "vista.vwm.modules.diffusionmodules.guiders.TrianglePredictionGuider",
+            "params": {
+                "max_scale": max_scale,
+                "min_scale": min_scale,
+                "num_frames": num_frames
+            }
+        }
+    else:
+        raise NotImplementedError
+    return guider_config
+
+
+def get_sampler(sampler, steps, discretization_config, guider_config):
+    if sampler == "EulerEDMSampler":
+        s_churn = 0.0
+        s_tmin = 0.0
+        s_tmax = 999.0
+        s_noise = 1.0
+
+        sampler = EulerEDMSampler(
+            num_steps=steps,
+            discretization_config=discretization_config,
+            guider_config=guider_config,
+            s_churn=s_churn,
+            s_tmin=s_tmin,
+            s_tmax=s_tmax,
+            s_noise=s_noise,
+            verbose=False
+        )
+    else:
+        raise ValueError(f"Unknown sampler {sampler}")
+    return sampler
+
+
+def get_batch(keys, value_dict, N: Union[list, ListConfig], device="cuda"):
+    # hardcoded demo setups, might undergo some changes in the future
+    batch = dict()
+    batch_uc = dict()
+
+    for key in keys:
+        if key in value_dict:
+            if key in ["fps", "fps_id", "motion_bucket_id", "cond_aug"]:
+                batch[key] = repeat(torch.tensor([value_dict[key]]).to(device), "1 -> b", b=math.prod(N))
+            elif key in ["command", "trajectory", "speed", "angle", "goal"]:
+                batch[key] = repeat(value_dict[key][None].to(device), "1 ... -> b ...", b=N[0])
+            elif key in ["cond_frames", "cond_frames_without_noise"]:
+                batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=N[0])
+            else:
+                # batch[key] = value_dict[key]
+                raise NotImplementedError
+
+    for key in batch.keys():
+        if key not in batch_uc and isinstance(batch[key], torch.Tensor):
+            batch_uc[key] = torch.clone(batch[key])
+    return batch, batch_uc
+
+
+def get_condition(model, value_dict, num_samples, force_uc_zero_embeddings, device):
+    load_model(model.conditioner)
+    batch, batch_uc = get_batch(
+        list(set([x.input_key for x in model.conditioner.embedders])),
+        value_dict,
+        [num_samples]
+    )
+    c, uc = model.conditioner.get_unconditional_conditioning(
+        batch,
+        batch_uc=batch_uc,
+        force_uc_zero_embeddings=force_uc_zero_embeddings
+    )
+    unload_model(model.conditioner)
+
+    for k in c:
+        if isinstance(c[k], torch.Tensor):
+            c[k], uc[k] = map(lambda y: y[k][:num_samples].to(device), (c, uc))
+            if c[k].shape[0] < num_samples:
+                c[k] = c[k][[0]]
+            if uc[k].shape[0] < num_samples:
+                uc[k] = uc[k][[0]]
+    return c, uc
+
+
+def fill_latent(cond, length, cond_indices, device):
+    """Place ``cond`` into an all-zero tensor with ``length`` entries along dim 0.
+
+    The result takes its trailing dimensions from ``cond.shape[1:]`` and holds
+    ``cond`` at ``cond_indices``; every other entry stays zero.
+    """
+    latent = torch.zeros(length, *cond.shape[1:]).to(device)
+    latent[cond_indices] = cond
+    return latent
+
+
+@torch.no_grad()
+def do_sample(
+        images,
+        model,
+        sampler,
+        value_dict,
+        num_frames,
+        ensemble_size: int = 5,
+        force_uc_zero_embeddings: Optional[list] = None,
+        initial_cond_indices: Optional[list] = None,
+        device="cuda"
+):
+    if initial_cond_indices is None:
+        initial_cond_indices = [0]
+
+    force_uc_zero_embeddings = default(force_uc_zero_embeddings, list())
+    precision_scope = autocast
+
+    with torch.no_grad(), precision_scope(device), model.ema_scope("Sampling"):
+        load_model(model.first_stage_model)
+        z = model.encode_first_stage(images)
+        unload_model(model.first_stage_model)
+
+        def denoiser(x, sigma, cond, cond_mask):
+            return model.denoiser(model.model, x, sigma, cond, cond_mask)
+
+        load_model(model.denoiser)
+        load_model(model.model)
+
+        initial_cond_mask = torch.zeros(num_frames).to(device)
+        initial_cond_mask[initial_cond_indices] = 1
+
+        c, uc = get_condition(model, value_dict, num_frames, force_uc_zero_embeddings, device)
+
+        sample_ensemble = list()
+        for _ in range(ensemble_size):
+            noise = torch.randn_like(z)
+            sample = sampler(
+                denoiser,
+                noise,
+                cond=c,
+                uc=uc,
+                cond_frame=z,  # cond_frame will be rescaled when calling the sampler
+                cond_mask=initial_cond_mask
+            )
+            sample[0] = z[0]
+            sample_ensemble.append(sample)
+
+        u = torch.mean(torch.stack(sample_ensemble), 0)
+        diff = torch.zeros_like(sample)
+        for each_sample in sample_ensemble:
+            diff.add_((each_sample - u) ** 2)
+        variance = diff / (ensemble_size - 1)
+        reward = torch.exp(-variance.mean()).cpu()
+
+        unload_model(model.model)
+        unload_model(model.denoiser)
+        return images, reward
diff --git a/vista/sample.py b/vista/sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..efc60fd8dfb2cb28bbccb1df4d0d7310fea96170
--- /dev/null
+++ b/vista/sample.py
@@ -0,0 +1,269 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import random
+
+import PIL
+import torch
+from pytorch_lightning import seed_everything
+from torchvision import transforms
+
+from . import sample_utils
+
+VERSION2SPECS = {
+    "vwm": {"config": "configs/inference/vista.yaml", "ckpt": "ckpts/vista.safetensors"}
+}
+
+DATASET2SOURCES = {
+    "NUSCENES": {"data_root": "data/nuscenes", "anno_file": "annos/nuScenes_val.json"},
+    "IMG": {"data_root": "image_folder"},
+}
+
+
+def parse_args(**parser_kwargs):
+    parser = argparse.ArgumentParser(**parser_kwargs)
+    parser.add_argument("--version", type=str, default="vwm", help="model version")
+    parser.add_argument("--dataset", type=str, default="NUSCENES", help="dataset name")
+    parser.add_argument(
+        "--save", type=str, default="outputs", help="directory to save samples"
+    )
+    parser.add_argument(
+        "--action",
+        type=str,
+        default="free",
+        help="action mode for control, such as traj, cmd, steer, goal",
+    )
+    parser.add_argument(
+        "--n_rounds", type=int, default=1, help="number of sampling rounds"
+    )
+    parser.add_argument(
+        "--n_frames", type=int, default=25, help="number of frames for each round"
+    )
+    parser.add_argument(
+        "--n_conds",
+        type=int,
+        default=1,
+        help="number of initial condition frames for the first round",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=23, help="random seed for seed_everything"
+    )
+    parser.add_argument(
+        "--height", type=int, default=576, help="target height of the generated video"
+    )
+    parser.add_argument(
+        "--width", type=int, default=1024, help="target width of the generated video"
+    )
+    parser.add_argument(
+        "--cfg_scale",
+        type=float,
+        default=2.5,
+        help="scale of the classifier-free guidance",
+    )
+    parser.add_argument(
+        "--cond_aug", type=float, default=0.0, help="strength of the noise augmentation"
+    )
+    parser.add_argument(
+        "--n_steps", type=int, default=50, help="number of sampling steps"
+    )
+    parser.add_argument(
+        "--rand_gen",
+        action="store_false",
+        help="whether to generate samples randomly or sequentially",
+    )
+    parser.add_argument(
+        "--low_vram", action="store_true", help="whether to save memory or not"
+    )
+    return parser
+
+
+def get_sample(
+    selected_index=0, dataset_name="NUSCENES", num_frames=25, action_mode="free"
+):
+    dataset_dict = DATASET2SOURCES[dataset_name]
+    action_dict = None
+    if dataset_name == "IMG":
+        image_list = os.listdir(dataset_dict["data_root"])
+        total_length = len(image_list)
+        while selected_index >= total_length:
+            selected_index -= total_length
+        image_file = image_list[selected_index]
+
+        path_list = [os.path.join(dataset_dict["data_root"], image_file)] * num_frames
+    else:
+        with open(dataset_dict["anno_file"]) as anno_json:
+            all_samples = json.load(anno_json)
+        total_length = len(all_samples)
+        while selected_index >= total_length:
+            selected_index -= total_length
+        sample_dict = all_samples[selected_index]
+
+        path_list = list()
+        if dataset_name == "NUSCENES":
+            for index in range(num_frames):
+                image_path = os.path.join(
+                    dataset_dict["data_root"], sample_dict["frames"][index]
+                )
+                assert os.path.exists(image_path), image_path
+                path_list.append(image_path)
+            if action_mode != "free":
+                action_dict = dict()
+                if action_mode == "traj" or action_mode == "trajectory":
+                    action_dict["trajectory"] = torch.tensor(sample_dict["traj"][2:])
+                elif action_mode == "cmd" or action_mode == "command":
+                    action_dict["command"] = torch.tensor(sample_dict["cmd"])
+                elif action_mode == "steer":
+                    # scene might be empty
+                    if sample_dict["speed"]:
+                        action_dict["speed"] = torch.tensor(sample_dict["speed"][1:])
+                    # scene might be empty
+                    if sample_dict["angle"]:
+                        action_dict["angle"] = (
+                            torch.tensor(sample_dict["angle"][1:]) / 780
+                        )
+                elif action_mode == "goal":
+                    # point might be invalid
+                    if (
+                        sample_dict["z"] > 0
+                        and 0 < sample_dict["goal"][0] < 1600
+                        and 0 < sample_dict["goal"][1] < 900
+                    ):
+                        action_dict["goal"] = torch.tensor(
+                            [
+                                sample_dict["goal"][0] / 1600,
+                                sample_dict["goal"][1] / 900,
+                            ]
+                        )
+                else:
+                    raise ValueError(f"Unsupported action mode {action_mode}")
+        else:
+            raise ValueError(f"Invalid dataset {dataset_name}")
+    return path_list, selected_index, total_length, action_dict
+
+
+def load_img(file_name, target_height=320, target_width=576, device="cuda"):
+    """Load an image, center-crop it to the target aspect ratio and resize it.
+
+    The image is converted to RGB, cropped symmetrically so its aspect ratio
+    matches ``target_width / target_height``, resized with Lanczos resampling
+    and converted with ``ToTensor`` followed by ``x * 2.0 - 1.0``.
+
+    Args:
+        file_name: Image path (anything ``PIL.Image.open`` accepts).
+        target_height: Output height in pixels.
+        target_width: Output width in pixels.
+        device: Device the resulting tensor is moved to.
+
+    Returns:
+        The transformed image tensor on ``device``.
+
+    Raises:
+        ValueError: If ``file_name`` is ``None``.
+    """
+    if file_name is None:
+        raise ValueError(f"Invalid image file {file_name}")
+
+    # ``Image.open`` is lazy and keeps the file handle open. ``convert`` reads
+    # the pixel data and returns an independent image, so the handle can be
+    # released immediately instead of staying open for every loaded frame.
+    with PIL.Image.open(file_name) as source:
+        image = source.convert("RGB")
+    ori_w, ori_h = image.size
+
+    source_ratio = ori_w / ori_h
+    target_ratio = target_width / target_height
+    if source_ratio > target_ratio:
+        # Too wide: keep the full height and trim both sides equally.
+        tmp_w = int(target_width / target_height * ori_h)
+        left = (ori_w - tmp_w) // 2
+        right = (ori_w + tmp_w) // 2
+        image = image.crop((left, 0, right, ori_h))
+    elif source_ratio < target_ratio:
+        # Too tall: keep the full width and trim top and bottom equally.
+        tmp_h = int(target_height / target_width * ori_w)
+        top = (ori_h - tmp_h) // 2
+        bottom = (ori_h + tmp_h) // 2
+        image = image.crop((0, top, ori_w, bottom))
+
+    # Cropping and resizing keep the RGB mode, so no second conversion is needed.
+    image = image.resize((target_width, target_height), resample=PIL.Image.LANCZOS)
+    image = transforms.Compose(
+        [transforms.ToTensor(), transforms.Lambda(lambda x: x * 2.0 - 1.0)]
+    )(image)
+    return image.to(device)
+
+
+if __name__ == "__main__":
+    # Script entry point: for each selected dataset sample, run the sampler and
+    # store the predicted frames ("virtual") next to the input frames ("real").
+    opt, unknown = parse_args().parse_known_args()
+
+    sample_utils.set_lowvram_mode(opt.low_vram)
+    model = sample_utils.init_model(VERSION2SPECS[opt.version])
+    unique_keys = {embedder.input_key for embedder in model.conditioner.embedders}
+
+    sample_index = 0
+    # A negative index is the stop signal set once the dataset is exhausted.
+    while sample_index >= 0:
+        seed_everything(opt.seed)
+
+        frame_list, sample_index, dataset_length, action_dict = get_sample(
+            sample_index, opt.dataset, opt.n_frames, opt.action
+        )
+
+        img_seq = [load_img(path, opt.height, opt.width) for path in frame_list]
+        images = torch.stack(img_seq)
+
+        # The first frame (with a leading batch axis) is the conditioning image.
+        cond_img = img_seq[0][None]
+        value_dict = sample_utils.init_embedder_options(unique_keys)
+        value_dict["cond_frames_without_noise"] = cond_img
+        value_dict["cond_aug"] = opt.cond_aug
+        value_dict["cond_frames"] = cond_img + opt.cond_aug * torch.randn_like(cond_img)
+        if action_dict is not None:
+            value_dict.update(action_dict)
+
+        sampler = sample_utils.init_sampling(
+            guider="TrianglePredictionGuider" if opt.n_rounds > 1 else "VanillaCFG",
+            steps=opt.n_steps,
+            cfg_scale=opt.cfg_scale,
+            num_frames=opt.n_frames,
+        )
+
+        # Conditioning keys passed as ``force_uc_zero_embeddings``.
+        uc_keys = [
+            "cond_frames",
+            "cond_frames_without_noise",
+            "command",
+            "trajectory",
+            "speed",
+            "angle",
+            "goal",
+        ]
+
+        out = sample_utils.do_sample(
+            images,
+            model,
+            sampler,
+            value_dict,
+            num_rounds=opt.n_rounds,
+            num_frames=opt.n_frames,
+            force_uc_zero_embeddings=uc_keys,
+            initial_cond_indices=list(range(opt.n_conds)),
+        )
+
+        if not isinstance(out, (tuple, list)):
+            raise TypeError
+        samples, samples_z, inputs = out
+        # Predictions first, then inputs; each as video, grid and single frames.
+        for subdir, frames in (("virtual", samples), ("real", inputs)):
+            target_dir = os.path.join(opt.save, subdir)
+            for save_mode in ("videos", "grids", "images"):
+                sample_utils.perform_save_locally(
+                    target_dir, frames, save_mode, opt.dataset, sample_index
+                )
+
+        if opt.rand_gen:
+            sample_index += random.randint(1, dataset_length - 1)
+        elif sample_index + 1 < dataset_length:
+            sample_index += 1
+        else:
+            sample_index = -1
diff --git a/vista/sample_utils.py b/vista/sample_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8149f9b741adb658e147d01e94f2e5cffef3b4ed
--- /dev/null
+++ b/vista/sample_utils.py
@@ -0,0 +1,442 @@
+from __future__ import annotations
+
+import math
+import os
+import queue
+from typing import Optional, Union
+
+import numpy as np
+import rerun as rr
+import torch
+import torchvision
+from einops import rearrange, repeat
+from omegaconf import ListConfig, OmegaConf
+from PIL import Image
+from safetensors.torch import load_file as load_safetensors
+from torch import autocast
+from tqdm import tqdm
+
+from .vwm.modules.diffusionmodules.sampling import EulerEDMSampler
+from .vwm.util import default, instantiate_from_config
+
+
+def init_model(version_dict, load_ckpt=True):
+    """Build the model described by ``version_dict``.
+
+    Args:
+        version_dict: Mapping with a ``"config"`` entry (loaded through
+            ``OmegaConf.load``) and a ``"ckpt"`` entry (checkpoint path).
+        load_ckpt: When false, ``None`` is passed instead of the checkpoint
+            path, so no weights are loaded.
+
+    Returns:
+        The model returned by ``load_model_from_config``.
+    """
+    config = OmegaConf.load(version_dict["config"])
+    ckpt_path = version_dict["ckpt"] if load_ckpt else None
+    return load_model_from_config(config, ckpt_path)
+
+
+# Module-wide switch, changed through ``set_lowvram_mode``. ``initial_model_load``
+# and ``unload_model`` read it: when true, only ``model.model`` is converted to
+# half precision at load time and ``unload_model`` moves modules back to the CPU.
+lowvram_mode = True
+
+
+def set_lowvram_mode(mode):
+    """Set the module-level ``lowvram_mode`` flag to ``mode``."""
+    global lowvram_mode
+    lowvram_mode = mode
+
+
+def initial_model_load(model):
+    """Prepare a freshly built model according to ``lowvram_mode``.
+
+    In low-VRAM mode only ``model.model`` is converted to half precision and
+    nothing is moved to the GPU here; otherwise the whole model is moved to
+    CUDA.
+
+    Returns:
+        The same ``model`` object.
+    """
+    if not lowvram_mode:
+        model.cuda()
+        return model
+    model.model.half()
+    return model
+
+
+def load_model(model):
+    """Move ``model`` to CUDA via ``model.cuda()``; returns ``None``."""
+    model.cuda()
+
+
+def unload_model(model):
+    """Offload ``model`` to the CPU when low-VRAM mode is active.
+
+    With ``lowvram_mode`` enabled the model is moved to the CPU, the CUDA
+    caching allocator is emptied and the device is synchronized. Otherwise
+    this is a no-op and the model stays where it is.
+    """
+    # The flag is only read here, so no ``global`` declaration is required.
+    if lowvram_mode:
+        model.cpu()
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+def load_model_from_config(config, ckpt=None):
+    """Instantiate ``config.model`` and optionally load weights from ``ckpt``.
+
+    Args:
+        config: Configuration object whose ``model`` attribute is handed to
+            ``instantiate_from_config``.
+        ckpt: Optional checkpoint path. Paths ending in ``"ckpt"`` are read
+            with ``torch.load`` and must contain a ``"state_dict"`` entry; any
+            other path is read as a safetensors file.
+
+    Returns:
+        The model after ``initial_model_load`` and ``eval()``.
+    """
+    model = instantiate_from_config(config.model)
+
+    if ckpt is not None:
+        print(f"Loading model from {ckpt}")
+        if ckpt.endswith("ckpt"):
+            # NOTE(security): ``torch.load`` unpickles the file and can execute
+            # arbitrary code; only load checkpoints from trusted sources.
+            pl_svd = torch.load(ckpt, map_location="cpu")
+            # dict contains:
+            # "epoch", "global_step", "pytorch-lightning_version",
+            # "state_dict", "loops", "callbacks", "optimizer_states", "lr_schedulers"
+            if "global_step" in pl_svd:
+                print(f"Global step: {pl_svd['global_step']}")
+            svd = pl_svd["state_dict"]
+        else:
+            svd = load_safetensors(ckpt)
+
+        # Non-strict loading: mismatching keys are reported, not fatal.
+        missing, unexpected = model.load_state_dict(svd, strict=False)
+        if missing:
+            print(f"Missing keys: {missing}")
+        if unexpected:
+            print(f"Unexpected keys: {unexpected}")
+
+    model = initial_model_load(model)
+    model.eval()
+    return model
+
+
+def init_embedder_options(keys):
+    """Return the hard-coded demo values for the requested embedder keys.
+
+    Either of ``"fps"`` / ``"fps_id"`` adds both entries (``fps`` and
+    ``fps - 1``); ``"motion_bucket_id"`` adds a fixed bucket id. All other keys
+    are ignored.
+    """
+    # hardcoded demo settings, might undergo some changes in the future
+    fps = 10
+    options = {}
+    for name in keys:
+        if name in ("fps_id", "fps"):
+            options.update(fps=fps, fps_id=fps - 1)
+        elif name == "motion_bucket_id":
+            options[name] = 127  # [0, 511]
+    return options
+
+
+def perform_save_locally(save_path, samples, mode, dataset_name, sample_index):
+    """Write ``samples`` to disk as PNG frames, one PNG grid or one MP4 video.
+
+    Files are placed in ``<save_path>/<mode>`` (created if missing) and named
+    ``<dataset_name>_<sample_index:06>`` plus a per-frame suffix for
+    ``"images"``.
+
+    Args:
+        save_path: Output root. If it contains the substring ``"real"`` the
+            values are mapped with ``255 * (x + 1) / 2``, otherwise with
+            ``255 * x``.
+        samples: Tensor of frames, iterated as ``c h w`` items.
+        mode: One of ``"images"``, ``"grids"`` or ``"videos"``.
+        dataset_name: Prefix of the output file names.
+        sample_index: Integer index used in the output file names.
+
+    Raises:
+        AssertionError: If ``mode`` is not one of the supported values.
+    """
+    assert mode in ["images", "grids", "videos"]
+    merged_path = os.path.join(save_path, mode)
+    os.makedirs(merged_path, exist_ok=True)
+    samples = samples.cpu()
+    stem = f"{dataset_name}_{sample_index:06}"
+
+    def to_uint8(array):
+        # Shared scaling for all three modes; see ``save_path`` above.
+        if "real" in save_path:
+            array = 255.0 * (array + 1.0) / 2.0
+        else:
+            array = 255.0 * array
+        return array.astype(np.uint8)
+
+    if mode == "images":
+        for frame_count, sample in enumerate(samples):
+            frame = rearrange(sample.numpy(), "c h w -> h w c")
+            image_save_path = os.path.join(
+                merged_path, f"{stem}_{frame_count:04}.png"
+            )
+            Image.fromarray(to_uint8(frame)).save(image_save_path)
+    elif mode == "grids":
+        grid = torchvision.utils.make_grid(samples, nrow=int(samples.shape[0] ** 0.5))
+        # c h w -> h w c
+        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1).numpy()
+        grid_save_path = os.path.join(merged_path, f"{stem}.png")
+        Image.fromarray(to_uint8(grid)).save(grid_save_path)
+    elif mode == "videos":
+        # The video writer lives in the sibling training module and is not
+        # imported at module level here. Import it on demand so the
+        # training-only dependencies are required only when a video is written.
+        from .train import save_img_seq_to_video
+
+        img_seq = rearrange(samples.numpy(), "t c h w -> t h w c")
+        video_save_path = os.path.join(merged_path, f"{stem}.mp4")
+        save_img_seq_to_video(video_save_path, to_uint8(img_seq), 10)
+    else:
+        # Only reachable when assertions are disabled (``python -O``).
+        raise NotImplementedError
+
+
+def init_sampling(
+    sampler="EulerEDMSampler",
+    guider="VanillaCFG",
+    discretization="EDMDiscretization",
+    steps=50,
+    cfg_scale=2.5,
+    num_frames=25,
+):
+    """Create a sampler from the names of its sampler, guider and discretization.
+
+    The discretization and guider configs are built first (in that order) and
+    then handed to ``get_sampler`` together with ``steps``.
+    """
+    return get_sampler(
+        sampler,
+        steps,
+        get_discretization(discretization),
+        get_guider(guider, cfg_scale, num_frames),
+    )
+
+
+def get_discretization(discretization):
+    """Return the instantiation config for the named discretization.
+
+    Raises:
+        NotImplementedError: For any name other than
+            ``"LegacyDDPMDiscretization"`` or ``"EDMDiscretization"``.
+    """
+    if discretization == "LegacyDDPMDiscretization":
+        return {
+            "target": "vista.vwm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization"
+        }
+    if discretization == "EDMDiscretization":
+        return {
+            "target": "vista.vwm.modules.diffusionmodules.discretizer.EDMDiscretization",
+            "params": {"sigma_min": 0.002, "sigma_max": 700.0, "rho": 7.0},
+        }
+    raise NotImplementedError
+
+
+def get_guider(guider="LinearPredictionGuider", cfg_scale=2.5, num_frames=25):
+    """Return the instantiation config for the named guider.
+
+    ``cfg_scale`` becomes ``scale`` for ``"VanillaCFG"`` and ``max_scale`` for
+    the two prediction guiders, whose ``min_scale`` is fixed at ``1.0``.
+    ``"IdentityGuider"`` takes no parameters.
+
+    Raises:
+        NotImplementedError: For an unknown guider name.
+    """
+    if guider == "IdentityGuider":
+        return {"target": "vista.vwm.modules.diffusionmodules.guiders.IdentityGuider"}
+    if guider == "VanillaCFG":
+        return {
+            "target": "vista.vwm.modules.diffusionmodules.guiders.VanillaCFG",
+            "params": {"scale": cfg_scale},
+        }
+
+    # The two prediction guiders share one parameter layout.
+    if guider == "LinearPredictionGuider":
+        target = "vista.vwm.modules.diffusionmodules.guiders.LinearPredictionGuider"
+    elif guider == "TrianglePredictionGuider":
+        target = "vista.vwm.modules.diffusionmodules.guiders.TrianglePredictionGuider"
+    else:
+        raise NotImplementedError
+    return {
+        "target": target,
+        "params": {
+            "max_scale": cfg_scale,
+            "min_scale": 1.0,
+            "num_frames": num_frames,
+        },
+    }
+
+
+def get_sampler(sampler, steps, discretization_config, guider_config):
+    """Instantiate the named sampler with fixed demo noise settings.
+
+    Raises:
+        ValueError: If ``sampler`` is not ``"EulerEDMSampler"``.
+    """
+    if sampler != "EulerEDMSampler":
+        raise ValueError(f"Unknown sampler {sampler}")
+    return EulerEDMSampler(
+        num_steps=steps,
+        discretization_config=discretization_config,
+        guider_config=guider_config,
+        s_churn=0.0,
+        s_tmin=0.0,
+        s_tmax=999.0,
+        s_noise=1.0,
+        verbose=False,
+    )
+
+
+def get_batch(keys, value_dict, N: Union[list, ListConfig], device="cuda"):
+    """Expand the entries of ``value_dict`` into batched conditioning inputs.
+
+    Only keys present in both ``keys`` and ``value_dict`` are used. Scalar
+    settings are repeated ``math.prod(N)`` times; action tensors and
+    conditioning frames are repeated ``N[0]`` times along a new leading axis.
+    Conditioning frames are not moved to ``device`` here.
+
+    Returns:
+        ``(batch, batch_uc)`` where ``batch_uc`` holds a clone of every tensor
+        in ``batch``.
+
+    Raises:
+        NotImplementedError: If a requested key is present in ``value_dict``
+            but belongs to none of the known groups.
+    """
+    # hardcoded demo setups, might undergo some changes in the future
+    scalar_keys = ("fps", "fps_id", "motion_bucket_id", "cond_aug")
+    action_keys = ("command", "trajectory", "speed", "angle", "goal")
+    frame_keys = ("cond_frames", "cond_frames_without_noise")
+
+    batch = {}
+    for key in keys:
+        if key not in value_dict:
+            continue
+        value = value_dict[key]
+        if key in scalar_keys:
+            batch[key] = repeat(
+                torch.tensor([value]).to(device), "1 -> b", b=math.prod(N)
+            )
+        elif key in action_keys:
+            batch[key] = repeat(value[None].to(device), "1 ... -> b ...", b=N[0])
+        elif key in frame_keys:
+            batch[key] = repeat(value, "1 ... -> b ...", b=N[0])
+        else:
+            raise NotImplementedError
+
+    batch_uc = {
+        key: torch.clone(value)
+        for key, value in batch.items()
+        if isinstance(value, torch.Tensor)
+    }
+    return batch, batch_uc
+
+
+def get_condition(model, value_dict, num_samples, force_uc_zero_embeddings, device):
+    """Compute conditional and unconditional embeddings for one sampling round.
+
+    The conditioner is loaded, fed with a batch built by ``get_batch`` for
+    ``num_samples`` samples, and unloaded again. Every tensor entry of the
+    result is cut to at most ``num_samples`` rows and moved to ``device``;
+    entries that end up with fewer rows are reduced to their first row.
+
+    Returns:
+        ``(c, uc)``, the conditional and unconditional dictionaries.
+    """
+    conditioner = model.conditioner
+    load_model(conditioner)
+    input_keys = list({embedder.input_key for embedder in conditioner.embedders})
+    batch, batch_uc = get_batch(input_keys, value_dict, [num_samples])
+    c, uc = conditioner.get_unconditional_conditioning(
+        batch, batch_uc=batch_uc, force_uc_zero_embeddings=force_uc_zero_embeddings
+    )
+    unload_model(conditioner)
+
+    for k in c:
+        if not isinstance(c[k], torch.Tensor):
+            continue
+        cond_k = c[k][:num_samples].to(device)
+        uncond_k = uc[k][:num_samples].to(device)
+        # Keep a single row (with its leading axis) when fewer than requested.
+        c[k] = cond_k[[0]] if cond_k.shape[0] < num_samples else cond_k
+        uc[k] = uncond_k[[0]] if uncond_k.shape[0] < num_samples else uncond_k
+    return c, uc
+
+
+def fill_latent(cond, length, cond_indices, device):
+    """Return a zero tensor of ``length`` frames with ``cond`` written at ``cond_indices``.
+
+    The per-frame shape is taken from ``cond.shape[1:]``.
+    """
+    frame_shape = tuple(cond.shape[1:])
+    latent = torch.zeros((length, *frame_shape)).to(device)
+    latent[cond_indices] = cond
+    return latent
+
+
+def _log_generated_images(log_queue, images, first_frame_id, sequence_index):
+    """Queue one ``rr.Image`` per decoded frame; does nothing without a queue."""
+    if log_queue is None:
+        return
+    frame_count = len(images)
+    for i, generated_image in enumerate(images):
+        log_queue.put(
+            (
+                "generated_image",
+                # c h w -> h w c
+                rr.Image(generated_image.cpu().permute(1, 2, 0)),
+                [
+                    ("frame_id", first_frame_id + i),
+                    ("diffusion", 0),
+                    (
+                        "combined",
+                        1 + 2 * sequence_index + (i * 1.0 / frame_count),
+                    ),
+                ],
+            )
+        )
+
+
+@torch.no_grad()
+def do_sample(
+    images,
+    model,
+    sampler,
+    value_dict,
+    num_rounds,
+    num_frames,
+    force_uc_zero_embeddings: Optional[list] = None,
+    initial_cond_indices: Optional[list] = None,
+    device="cuda",
+    log_queue: Optional[queue.SimpleQueue] = None,
+):
+    """Generate ``num_rounds`` consecutive sequences of ``num_frames`` frames.
+
+    The first round is conditioned on the encoded ``images`` at
+    ``initial_cond_indices``. Every further round is conditioned on the last
+    three latents of the previous round and contributes ``num_frames - 3`` new
+    frames.
+
+    Args:
+        images: Input frames, encoded with ``model.encode_first_stage``.
+        model: Model providing ``conditioner``, ``first_stage_model``,
+            ``denoiser``, ``model``, ``scale_factor`` and ``ema_scope``.
+        sampler: Callable sampler, e.g. the one built by ``init_sampling``.
+        value_dict: Conditioning values. It is modified in place: for rounds
+            after the first, ``"cond_frames"`` and
+            ``"cond_frames_without_noise"`` are overwritten.
+        num_rounds: Number of sampling rounds.
+        num_frames: Number of frames per round.
+        force_uc_zero_embeddings: Keys forwarded to the conditioner; defaults
+            to an empty list.
+        initial_cond_indices: Frame indices used as conditioning in the first
+            round; defaults to ``[0]``.
+        device: Device used for autocast and intermediate tensors.
+        log_queue: Optional queue receiving the decoded frames. It is also
+            forwarded to ``sampler``. Frame logging is skipped when ``None``.
+
+    Returns:
+        ``(generated_images, samples_z, images)``: the decoded frames of all
+        rounds concatenated along dim 0, the latent buffer with
+        ``num_rounds * (num_frames - 3) + 3`` entries, and the unchanged input
+        ``images``.
+    """
+    if initial_cond_indices is None:
+        initial_cond_indices = [0]
+
+    force_uc_zero_embeddings = default(force_uc_zero_embeddings, list())
+    precision_scope = autocast
+
+    with torch.no_grad(), precision_scope(device), model.ema_scope("Sampling"):
+        c, uc = get_condition(
+            model, value_dict, num_frames, force_uc_zero_embeddings, device
+        )
+
+        load_model(model.first_stage_model)
+        z = model.encode_first_stage(images)
+        unload_model(model.first_stage_model)
+
+        samples_z = torch.zeros((num_rounds * (num_frames - 3) + 3, *z.shape[1:])).to(
+            device
+        )
+
+        sampling_progress = tqdm(total=num_rounds, desc="Compute sequences")
+
+        def denoiser(x, sigma, cond, cond_mask):
+            return model.denoiser(model.model, x, sigma, cond, cond_mask)
+
+        load_model(model.denoiser)
+        load_model(model.model)
+
+        initial_cond_mask = torch.zeros(num_frames).to(device)
+        prediction_cond_mask = torch.zeros(num_frames).to(device)
+        initial_cond_mask[initial_cond_indices] = 1
+        prediction_cond_mask[[0, 1, 2]] = 1
+
+        generated_images = []
+
+        noise = torch.randn_like(z)
+        sample = sampler(
+            denoiser,
+            noise,
+            cond=c,
+            uc=uc,
+            cond_frame=z,  # cond_frame will be rescaled when calling the sampler
+            cond_mask=initial_cond_mask,
+            num_sequence=0,
+            log_queue=log_queue,
+        )
+        sampling_progress.update(1)
+        # The first latent is replaced by the encoded first input frame.
+        sample[0] = z[0]
+        samples_z[:num_frames] = sample
+
+        generated_images.append(decode_samples(sample[:num_frames], model))
+        _log_generated_images(log_queue, generated_images[-1], 0, 0)
+
+        for n in range(num_rounds - 1):
+            load_model(model.first_stage_model)
+            samples_x_for_guidance = model.decode_first_stage(sample[-14:])
+            unload_model(model.first_stage_model)
+            value_dict["cond_frames_without_noise"] = samples_x_for_guidance[[-3]]
+            value_dict["cond_frames"] = sample[[-3]] / model.scale_factor
+
+            # Embedders exposing ``skip_encode`` get it set only while the
+            # conditioning for this round is computed.
+            for embedder in model.conditioner.embedders:
+                if hasattr(embedder, "skip_encode"):
+                    embedder.skip_encode = True
+            c, uc = get_condition(
+                model, value_dict, num_frames, force_uc_zero_embeddings, device
+            )
+            for embedder in model.conditioner.embedders:
+                if hasattr(embedder, "skip_encode"):
+                    embedder.skip_encode = False
+
+            filled_latent = fill_latent(sample[-3:], num_frames, [0, 1, 2], device)
+
+            noise = torch.randn_like(filled_latent)
+            sample = sampler(
+                denoiser,
+                noise,
+                cond=c,
+                uc=uc,
+                cond_frame=filled_latent,  # cond_frame will be rescaled when calling the sampler
+                cond_mask=prediction_cond_mask,
+                num_sequence=n + 1,
+                log_queue=log_queue,
+            )
+            sampling_progress.update(1)
+            # The first three frames repeat the conditioning, so only the
+            # remaining ones are stored and decoded.
+            first_frame_id = (n + 1) * (num_frames - 3) + 3
+            last_frame_id = (n + 1) * (num_frames - 3) + num_frames
+            samples_z[first_frame_id:last_frame_id] = sample[3:]
+
+            generated_images.append(decode_samples(sample[3:], model))
+            _log_generated_images(
+                log_queue, generated_images[-1], first_frame_id, n + 1
+            )
+
+        unload_model(model.model)
+        unload_model(model.denoiser)
+
+        generated_images = torch.concat(generated_images, dim=0)
+        return generated_images, samples_z, images
+
+
+def decode_samples(samples, model):
+    """Decode latents with the first-stage model and map the result to [0, 1].
+
+    The first-stage model is loaded for the decode and unloaded afterwards;
+    the decoded values are mapped with ``(x + 1) / 2`` and clamped.
+    """
+    first_stage = model.first_stage_model
+    load_model(first_stage)
+    decoded = model.decode_first_stage(samples)
+    unload_model(first_stage)
+    return torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
diff --git a/vista/train.py b/vista/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a5bb2d69f1ea2c5cc915c281a2f6f8a80b74ece
--- /dev/null
+++ b/vista/train.py
@@ -0,0 +1,924 @@
+from __future__ import annotations
+
+import argparse
+import datetime
+import glob
+import inspect
+import os
+import sys
+from inspect import Parameter
+
+import imageio
+import numpy as np
+import pytorch_lightning as pl
+import torch
+import torchvision
+from einops import rearrange
+from matplotlib import pyplot as plt
+from natsort import natsorted
+from omegaconf import OmegaConf
+from packaging import version
+from PIL import Image
+from pytorch_lightning import seed_everything
+from pytorch_lightning.callbacks import Callback
+from pytorch_lightning.trainer import Trainer
+from pytorch_lightning.utilities import rank_zero_only
+from safetensors.torch import load_file as load_safetensors
+
+from .vwm.util import instantiate_from_config, isheatmap
+
+# Read by ``SetupCallback.on_fit_start``: when true, rank 0 sleeps 5 seconds
+# before saving the project config, and other ranks never move the log
+# directory into ``child_runs``.
+MULTINODE_HACKS = True
+
+
+def default_trainer_args():
+    """Return the declared default of every ``Trainer.__init__`` parameter.
+
+    Returns:
+        A dict mapping parameter name to default value. ``self`` and
+        parameters that declare no default are left out.
+    """
+    argspec = dict(inspect.signature(Trainer.__init__).parameters)
+    argspec.pop("self")
+    # Test the parameter's *default* against the sentinel. A ``Parameter``
+    # object itself never equals ``Parameter.empty``, so comparing it directly
+    # keeps every parameter and leaks the sentinel as a bogus default.
+    return {
+        name: param.default
+        for name, param in argspec.items()
+        if param.default is not Parameter.empty
+    }
+
+
+def get_parser(**parser_kwargs):
+    """Build the command-line parser of the training script.
+
+    Besides the options declared here, one ``--<name>`` option is registered
+    for every entry of ``default_trainer_args()``, using that entry's value as
+    its default.
+
+    Args:
+        **parser_kwargs: Forwarded to ``argparse.ArgumentParser``.
+
+    Returns:
+        The configured ``argparse.ArgumentParser``.
+    """
+
+    def str2bool(v):
+        """Parse common yes/no spellings; real booleans pass through."""
+        if isinstance(v, bool):
+            return v
+        lowered = v.lower()
+        if lowered in ("yes", "true", "t", "y", "1"):
+            return True
+        if lowered in ("no", "false", "f", "n", "0"):
+            return False
+        raise argparse.ArgumentTypeError("Boolean value expected")
+
+    parser = argparse.ArgumentParser(**parser_kwargs)
+    add = parser.add_argument
+
+    def add_switch(*flags, help_text, default=False):
+        # Boolean option: a bare flag yields True (``const``), an explicit
+        # value is parsed by ``str2bool``.
+        add(*flags, type=str2bool, nargs="?", const=True, default=default, help=help_text)
+
+    add("-n", "--name", type=str, nargs="?", const=True, default="", help="postfix for logdir")
+    add_switch(
+        "--no_date",
+        help_text="if True, skip date generation for logdir and only use naming via opt.base or opt.name (+ opt.postfix, optionally)",
+    )
+    add(
+        "-r",
+        "--resume",
+        type=str,
+        nargs="?",
+        const=True,
+        default="",
+        help="resume from logdir or checkpoint in logdir",
+    )
+    add(
+        "-b",
+        "--base",
+        nargs="*",
+        metavar="base_config.yaml",
+        default=list(),
+        help="paths to base configs. "
+             "Loaded from left-to-right. "
+             "Parameters can be overwritten or added with command-line options of the form `--key value`",
+    )
+    add_switch("-t", "--train", default=True, help_text="train")
+    add_switch("--no_test", default=True, help_text="disable test")
+    add("-p", "--project", help="name of new or path to existing project")
+    add_switch("-d", "--debug", help_text="enable post-mortem debugging")
+    add("-s", "--seed", type=int, default=23, help="seed for seed_everything")
+    add("-f", "--postfix", type=str, default="", help="post-postfix for default name")
+    add("-l", "--logdir", type=str, default="logs", help="directory for logging data")
+    add_switch("--scale_lr", help_text="scale base-lr by ngpu * batch_size * n_accumulate")
+    add_switch(
+        "--legacy_naming",
+        help_text="name run based on config file name if true, else by whole path",
+    )
+    add_switch(
+        "--enable_tf32",
+        help_text="enables the TensorFloat32 format both for matmuls and cuDNN for pytorch 1.12",
+    )
+    add_switch("--no_base_name", help_text="no config name")
+    # Registered explicitly only for pytorch_lightning >= 2.0.0; presumably
+    # older versions already provide it through the Trainer defaults added
+    # below (TODO confirm).
+    if version.parse(pl.__version__) >= version.parse("2.0.0"):
+        add(
+            "--resume_from_checkpoint",
+            type=str,
+            default=None,
+            help="single checkpoint file to resume from",
+        )
+    add("--n_devices", type=int, default=8, help="number of gpus in training")
+    add(
+        "--finetune",
+        type=str,
+        default="ckpts/pytorch_model.bin",
+        help="path to checkpoint to finetune from",
+    )
+
+    for key, value in default_trainer_args().items():
+        add("--" + key, default=value)
+    return parser
+
+
+def get_checkpoint_name(logdir):
+    """Pick the ``last*.ckpt`` checkpoint to use and derive a ``last-v<N>.ckpt`` name.
+
+    Looks for ``<logdir>/checkpoints/last*.ckpt``. With several matches the
+    most recently modified one is chosen, its path is written to
+    ``<logdir>/most_recent_ckpt.txt`` and the version number is parsed from a
+    ``-v<N>`` suffix in its file name (falling back to 1).
+
+    Args:
+        logdir: Run directory containing a ``checkpoints`` sub-directory.
+
+    Returns:
+        ``(ckpt, melk_ckpt_name)``: the chosen checkpoint path and the derived
+        ``last-v<N>.ckpt`` file name.
+
+    Raises:
+        FileNotFoundError: If no matching checkpoint exists.
+    """
+    pattern = os.path.join(logdir, "checkpoints", "last**.ckpt")
+    candidates = natsorted(glob.glob(pattern))
+    print("Available last checkpoints:", candidates)
+    if not candidates:
+        # Fail with a clear message instead of an IndexError further down.
+        raise FileNotFoundError(f"No checkpoint matching {pattern} found")
+
+    # Named ``ckpt_version`` so it does not shadow the ``packaging.version``
+    # module imported by this file.
+    ckpt_version = 1
+    if len(candidates) > 1:
+        print("Got most recent checkpoint")
+        ckpt = sorted(candidates, key=os.path.getmtime)[-1]
+        print(f"Most recent ckpt is {ckpt}")
+        with open(os.path.join(logdir, "most_recent_ckpt.txt"), "w") as f:
+            f.write(ckpt + "\n")
+        try:
+            # ``basename`` also handles platform-specific path separators.
+            ckpt_version = int(os.path.basename(ckpt).split("-v")[-1].split(".")[0])
+        except ValueError as e:
+            # version confusion but not bad
+            print(e)
+            ckpt_version = 1
+    else:
+        # in this case, we only have one "last.ckpt"
+        ckpt = candidates[0]
+    melk_ckpt_name = f"last-v{ckpt_version}.ckpt"
+    print(f"Current melk ckpt name: {melk_ckpt_name}")
+    return ckpt, melk_ckpt_name
+
+
+def save_img_seq_to_video(out_path, img_seq, fps):
+    """Write the frames of ``img_seq`` to ``out_path`` as a video.
+
+    Args:
+        out_path: Destination file passed to ``imageio.get_writer``.
+        img_seq: Iterable of frames (np array), appended in order.
+        fps: Frame rate passed to the writer.
+    """
+    writer = imageio.get_writer(out_path, fps=fps)
+    try:
+        for img in img_seq:
+            writer.append_data(img)
+    finally:
+        # Close the writer even when a frame fails, so the output file handle
+        # is not left open.
+        writer.close()
+
+
+class SetupCallback(Callback):
+    """Callback that prepares the run directories and stores the configs.
+
+    On rank 0, ``on_fit_start`` creates the log, checkpoint and config
+    directories and writes the project and lightning configs as YAML files.
+    On other ranks it may move an existing log directory into ``child_runs``.
+    """
+
+    def __init__(
+            self,
+            resume,
+            now,
+            logdir,
+            ckptdir,
+            cfgdir,
+            config,
+            lightning_config,
+            debug,
+            ckpt_name=None
+    ):
+        super().__init__()
+        # Run identification
+        self.resume = resume
+        self.now = now
+        self.debug = debug
+        self.ckpt_name = ckpt_name
+        # Output locations
+        self.logdir = logdir
+        self.ckptdir = ckptdir
+        self.cfgdir = cfgdir
+        # Configs written out in ``on_fit_start``
+        self.config = config
+        self.lightning_config = lightning_config
+
+    def on_exception(self, trainer: pl.Trainer, pl_module, exception):
+        """Print a notice on rank 0 unless debugging is enabled."""
+        if self.debug or trainer.global_rank != 0:
+            return
+        # Saving an emergency checkpoint to ``ckptdir`` is currently disabled.
+        print("Exiting")
+
+    def on_fit_start(self, trainer, pl_module):
+        """Create directories and save configs (rank 0) or tidy up (other ranks)."""
+        if trainer.global_rank == 0:
+            self._write_run_setup()
+        else:
+            self._move_child_logdir()
+
+    def _write_run_setup(self):
+        """Create the run directories and dump both configs as YAML."""
+        # create logdirs and save configs
+        for directory in (self.logdir, self.ckptdir, self.cfgdir):
+            os.makedirs(directory, exist_ok=True)
+
+        if (
+                "callbacks" in self.lightning_config
+                and "metrics_over_trainsteps_checkpoint" in self.lightning_config["callbacks"]
+        ):
+            os.makedirs(
+                os.path.join(self.ckptdir, "trainstep_checkpoints"),
+                exist_ok=True
+            )
+
+        print("Project config")
+        print(OmegaConf.to_yaml(self.config))
+        if MULTINODE_HACKS:
+            import time
+
+            time.sleep(5)
+        project_yaml = os.path.join(self.cfgdir, f"{self.now}-project.yaml")
+        OmegaConf.save(self.config, project_yaml)
+
+        print("Lightning config")
+        print(OmegaConf.to_yaml(self.lightning_config))
+        lightning_yaml = os.path.join(self.cfgdir, f"{self.now}-lightning.yaml")
+        OmegaConf.save(
+            OmegaConf.create({"lightning": self.lightning_config}),
+            lightning_yaml
+        )
+
+    def _move_child_logdir(self):
+        """Move an already created log directory into ``child_runs``."""
+        # ModelCheckpoint callback created log directory, remove it
+        if MULTINODE_HACKS or self.resume or not os.path.exists(self.logdir):
+            return
+        parent, name = os.path.split(self.logdir)
+        dst = os.path.join(parent, "child_runs", name)
+        os.makedirs(os.path.split(dst)[0], exist_ok=True)
+        try:
+            os.rename(self.logdir, dst)
+        except FileNotFoundError:
+            # The directory vanished in the meantime; nothing left to move.
+            pass
+
+
+class ImageLogger(Callback):
+    def __init__(
+            self,
+            batch_frequency,
+            clamp=True,
+            increase_log_steps=True,
+            rescale=True,
+            disabled=False,
+            log_on_batch_idx=False,
+            log_first_step=False,
+            log_images_kwargs=None,
+            log_before_first_step=False,
+            enable_autocast=True,
+            num_frames=25
+    ):
+        super().__init__()
+        self.enable_autocast = enable_autocast
+        self.rescale = rescale
+        self.batch_freq = batch_frequency
+        self.log_steps = [2 ** n for n in range(int(np.log2(self.batch_freq)) + 1)]
+        if not increase_log_steps:
+            self.log_steps = [self.batch_freq]
+        self.clamp = clamp
+        self.disabled = disabled
+        self.log_on_batch_idx = log_on_batch_idx
+        self.log_images_kwargs = log_images_kwargs if log_images_kwargs else dict()
+        self.log_first_step = log_first_step
+        self.log_before_first_step = log_before_first_step
+        self.num_frames = num_frames
+
+    @rank_zero_only
+    def log_local(
+            self,
+            save_dir,
+            split,
+            images,
+            global_step,
+            current_epoch,
+            batch_idx
+    ):
+        """Write every entry of ``images`` below ``<save_dir>/images/<split>/<log_type>/``.
+
+        Each entry is handled according to its kind:
+
+        * entries for which ``isheatmap`` is true are drawn with matplotlib and saved as PNG;
+        * entries whose key contains ``"mp4"`` are regrouped into clips of
+          ``self.num_frames`` frames and written with ``save_img_seq_to_video``;
+        * all other entries are tiled with ``torchvision.utils.make_grid`` and saved as PNG.
+
+        Args:
+            save_dir: Base output directory.
+            split: Name of the sub-directory below ``images`` (e.g. "train", "val", "test").
+            images: Mapping from log-type name to the tensor to save.
+            global_step: Step counter embedded in the file names.
+            current_epoch: Epoch counter embedded in the file names.
+            batch_idx: Batch index embedded in the file names.
+        """
+        root = os.path.join(save_dir, "images", split)
+        for log_type in images:
+            item = images[log_type]
+
+            # Every branch writes into <root>/<log_type>/. Create it up front; the heatmap
+            # branch used to create only <root> and then failed to save into the sub-directory.
+            dir_path = os.path.join(root, log_type)
+            os.makedirs(dir_path, exist_ok=True)
+
+            if isheatmap(item):
+                _fig, ax = plt.subplots()
+                mappable = ax.matshow(
+                    item.cpu().numpy(), cmap="hot", interpolation="lanczos"
+                )
+                plt.colorbar(mappable)
+                plt.axis("off")
+
+                filename = f"{log_type}_epoch{current_epoch:03}_batch{batch_idx:06}_step{global_step:06}.png"
+                plt.savefig(os.path.join(dir_path, filename))
+                plt.close()
+            elif "mp4" in log_type:
+                img_seq = item
+                if self.rescale:
+                    img_seq = (img_seq + 1.0) / 2.0  # -1,1 -> 0,1
+                img_seq = rearrange(img_seq, "(b t) c h w -> b t h w c", t=self.num_frames)
+                num_clips = img_seq.shape[0]
+                stem = f"{log_type}_epoch{current_epoch:02}_batch{batch_idx:04}_step{global_step:06}"
+                for b_i in range(num_clips):
+                    cur_img_seq = img_seq[b_i].numpy()  # [t h w c]
+                    cur_img_seq = (cur_img_seq * 255).astype(np.uint8)  # [t h w c]
+                    # With several clips in one batch, give each its own file; previously all
+                    # clips shared one name and only the last one survived. A single clip
+                    # keeps the original file name.
+                    suffix = f"_sample{b_i:02}" if num_clips > 1 else ""
+                    save_img_seq_to_video(
+                        os.path.join(dir_path, f"{stem}{suffix}.mp4"), cur_img_seq, fps=10
+                    )
+            else:
+                grid = torchvision.utils.make_grid(item, nrow=int(item.shape[0] ** 0.5))
+                if self.rescale:
+                    grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
+                # c,h,w -> h,w,c (a trailing singleton channel axis is dropped)
+                grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
+                grid = grid.numpy()
+                grid = (grid * 255).astype(np.uint8)
+                filename = f"{log_type}_epoch{current_epoch:02}_batch{batch_idx:04}_step{global_step:06}.png"
+                Image.fromarray(grid).save(os.path.join(dir_path, filename))
+
+    @rank_zero_only
+    def log_img(self, pl_module, batch, batch_idx, split="train"):
+        """Generate images with ``pl_module.log_images`` and save them via ``log_local``.
+
+        Logging happens when ``check_frequency`` approves the current index, or
+        unconditionally for ``split == "test"``. In both cases the module must expose a
+        callable ``log_images``; otherwise nothing is logged.
+
+        Args:
+            pl_module: Module providing ``log_images``, ``global_step``, ``current_epoch``,
+                ``training`` and ``logger.save_dir``.
+            batch: Batch forwarded to ``pl_module.log_images``.
+            batch_idx: Index of the batch; used for the frequency check when
+                ``self.log_on_batch_idx`` is set, and in the output file names.
+            split: Name of the split, forwarded to ``log_images`` and ``log_local``.
+        """
+        check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step
+        # Run the frequency check first and unconditionally: besides answering the
+        # question it also consumes an entry of `self.log_steps`.
+        due = self.check_frequency(check_idx)
+        can_log = callable(getattr(pl_module, "log_images", None))
+        if not can_log or not (due or split == "test"):
+            return
+
+        is_train = pl_module.training
+        if is_train:
+            pl_module.eval()
+
+        try:
+            gpu_autocast_kwargs = {
+                "enabled": self.enable_autocast,  # torch.is_autocast_enabled(),
+                "dtype": torch.get_autocast_gpu_dtype(),
+                "cache_enabled": torch.is_autocast_cache_enabled()
+            }
+
+            with torch.no_grad(), torch.cuda.amp.autocast(**gpu_autocast_kwargs):
+                images = pl_module.log_images(batch, split=split, **self.log_images_kwargs)
+
+            for log_type in images:
+                if isinstance(images[log_type], torch.Tensor):
+                    images[log_type] = images[log_type].detach().float().cpu()
+                    if self.clamp and not isheatmap(images[log_type]):
+                        images[log_type] = torch.clamp(images[log_type], -1.0, 1.0)
+
+            self.log_local(
+                pl_module.logger.save_dir,
+                split,
+                images,
+                pl_module.global_step,
+                pl_module.current_epoch,
+                batch_idx
+            )
+        finally:
+            # Restore training mode even if image generation or saving failed, so an
+            # error here cannot leave the module stuck in eval mode.
+            if is_train:
+                pl_module.train()
+
+    def check_frequency(self, check_idx):
+        """Return whether images should be logged at ``check_idx``.
+
+        Logging is due when ``check_idx`` is a multiple of ``self.batch_freq`` or is listed
+        in ``self.log_steps``, provided that ``check_idx > 0`` or ``self.log_first_step``
+        is set. Every positive answer also drops the first remaining entry of
+        ``self.log_steps`` (if any).
+
+        Args:
+            check_idx: Batch index or global step to test.
+
+        Returns:
+            bool: ``True`` if logging should happen now, ``False`` otherwise.
+        """
+        scheduled = check_idx % self.batch_freq == 0 or check_idx in self.log_steps
+        if not (scheduled and (check_idx > 0 or self.log_first_step)):
+            return False
+        # An exhausted list is the normal steady state, not an error: skip the pop quietly
+        # instead of printing "pop from empty list" on every logging step.
+        if self.log_steps:
+            self.log_steps.pop(0)
+        return True
+
+    @rank_zero_only
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        """Log training images after a batch, unless the logger is disabled.
+
+        Step 0 is only logged when ``self.log_first_step`` is set.
+        """
+        if self.disabled:
+            return
+        past_first_step = pl_module.global_step > 0
+        if past_first_step or self.log_first_step:
+            self.log_img(pl_module, batch, batch_idx, split="train")
+
+    @rank_zero_only
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        """Optionally log images once before the first optimisation step.
+
+        Only active when ``self.log_before_first_step`` is set and the module is still at
+        global step 0.
+        """
+        if not self.log_before_first_step:
+            return
+        if pl_module.global_step == 0:
+            print(f"{self.__class__.__name__}: logging before training")
+            self.log_img(pl_module, batch, batch_idx, split="train")
+
+    @rank_zero_only
+    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, *args, **kwargs):
+        """Log validation images after a batch once training has passed global step 0.
+
+        Does nothing while the logger is disabled. Extra positional and keyword arguments
+        are accepted and ignored.
+        """
+        if self.disabled:
+            return
+        if pl_module.global_step > 0:
+            self.log_img(pl_module, batch, batch_idx, split="val")
+
+    @rank_zero_only
+    def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        """Log images for a test batch; ``self.disabled`` is not consulted here."""
+        self.log_img(pl_module=pl_module, batch=batch, batch_idx=batch_idx, split="test")
+
+
+if __name__ == "__main__":
+    # custom parser to specify config files, train, test and debug mode, postfix, resume
+    # `--key value` arguments are interpreted as arguments to the trainer
+    # `nested.key=value` arguments are interpreted as config parameters
+    # configs are merged from left-to-right followed by command line parameters
+
+    # model:
+    #   base_learning_rate: float
+    #   target: path to lightning module
+    #   params:
+    #       key: value
+    # data:
+    #   target: train.DataModuleFromConfig
+    #   params:
+    #      batch_size: int
+    #      wrap: bool
+    #      train:
+    #          target: path to train dataset
+    #          params:
+    #              key: value
+    #      validation:
+    #          target: path to validation dataset
+    #          params:
+    #              key: value
+    #      test:
+    #          target: path to test dataset
+    #          params:
+    #              key: value
+    # lightning: (optional, has sane defaults and can be specified on cmd line)
+    #   trainer:
+    #       additional arguments to trainer
+    #   logger:
+    #       logger to instantiate
+    #   modelcheckpoint:
+    #       modelcheckpoint to instantiate
+    #   callbacks:
+    #       callback1:
+    #           target: importpath
+    #           params:
+    #               key: value
+
+    # timestamp that prefixes the directory name of a fresh run (unless --no_date is given)
+    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+
+    # add cwd for convenience and to make classes in this file available when
+    # running as `python train.py`
+    # (in particular `train.DataModuleFromConfig`)
+    sys.path.append(os.getcwd())
+
+    parser = get_parser()
+    # options the parser does not know are kept in `unknown`; they are later parsed as a
+    # dotlist of config overrides
+    opt, unknown = parser.parse_known_args()
+
+    # a resumed run reuses its existing log directory, so it cannot also be given a new name
+    if opt.name and opt.resume:
+        raise ValueError(
+            "-n/--name and -r/--resume cannot be specified both. "
+            "If you want to resume training in a new log folder, "
+            "use -n/--name in combination with --resume_from_checkpoint"
+        )
+    melk_ckpt_name = None
+    name = None
+    if opt.resume:
+        # --- resume: derive the log directory and the checkpoint from opt.resume ---
+        if not os.path.exists(opt.resume):
+            raise ValueError(f"Cannot find {opt.resume}")
+        if os.path.isfile(opt.resume):
+            # opt.resume is a checkpoint file: the log directory is taken to be two path
+            # components above it (presumably <logdir>/checkpoints/<file> — TODO confirm)
+            paths = opt.resume.split("/")
+            # idx = len(paths)-paths[::-1].index("logs")+1
+            # logdir = "/".join(paths[:idx])
+            logdir = "/".join(paths[:-2])
+            ckpt = opt.resume
+            _, melk_ckpt_name = get_checkpoint_name(logdir)
+        else:
+            # opt.resume is a log directory: let get_checkpoint_name pick the checkpoint
+            assert os.path.isdir(opt.resume), opt.resume
+            logdir = opt.resume.rstrip("/")
+            ckpt, melk_ckpt_name = get_checkpoint_name(logdir)
+
+        print("#" * 100)
+        print(f"Resuming from checkpoint `{ckpt}`")
+        print("#" * 100)
+
+        opt.resume_from_checkpoint = ckpt
+        # configs stored with the run go first, so configs passed on the command line
+        # (merged later, left-to-right) take precedence over them
+        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
+        opt.base = base_configs + opt.base
+        _tmp = logdir.split("/")
+        nowname = _tmp[-1]
+    else:
+        # --- fresh run: build the run name from --name or from the first base config ---
+        if opt.name:
+            name = "_" + opt.name
+        elif opt.base:
+            if opt.no_base_name:
+                name = ""
+            else:
+                if opt.legacy_naming:
+                    # legacy: use only the config file's base name (without extension)
+                    cfg_fname = os.path.split(opt.base[0])[-1]
+                    cfg_name = os.path.splitext(cfg_fname)[0]
+                else:
+                    # default: join the directories below "configs" and the file's base
+                    # name with "-"
+                    assert "configs" in os.path.split(opt.base[0])[0], os.path.split(
+                        opt.base[0]
+                    )[0]
+                    cfg_path = os.path.split(opt.base[0])[0].split(os.sep)[
+                               os.path.split(opt.base[0])[0].split(os.sep).index("configs")
+                               + 1:
+                               ]  # cut away the first one (we assert all configs are in "configs")
+                    cfg_name = os.path.splitext(os.path.split(opt.base[0])[-1])[0]
+                    cfg_name = "-".join(cfg_path) + f"-{cfg_name}"
+                name = "_" + cfg_name
+        else:
+            name = ""
+        if opt.no_date:
+            # without the timestamp prefix the name would start with "_"; strip it
+            nowname = name + opt.postfix
+            if nowname.startswith("_"):
+                nowname = nowname[1:]
+        else:
+            nowname = now + name + opt.postfix
+        logdir = os.path.join(opt.logdir, nowname)
+
+    # sub-directories of the run directory for checkpoints and for the saved configs
+    ckptdir = os.path.join(logdir, "checkpoints")
+    cfgdir = os.path.join(logdir, "configs")
+    seed_everything(opt.seed, workers=True)
+
+    # move before model init, in case a torch.compile(...) is called somewhere
+    if opt.enable_tf32:
+        # pt_version = version.parse(torch.__version__)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        print(f"Enabling TF32 for PyTorch {torch.__version__}")
+    else:
+        # leave the backend flags untouched and only report their current values
+        print(f"Using default TF32 settings for PyTorch {torch.__version__}:")
+        print(f"torch.backends.cuda.matmul.allow_tf32={torch.backends.cuda.matmul.allow_tf32}")
+        print(f"torch.backends.cudnn.allow_tf32={torch.backends.cudnn.allow_tf32}")
+
+    # `trainer` is referenced by the except/finally clauses below. Pre-bind it so that a
+    # failure before the Trainer is constructed surfaces as the real error instead of a
+    # NameError raised from the handlers.
+    trainer = None
+    try:
+        # init and save configs
+        configs = [OmegaConf.load(cfg) for cfg in opt.base]
+        cli = OmegaConf.from_dotlist(unknown)
+        config = OmegaConf.merge(*configs, cli)
+        lightning_config = config.pop("lightning", OmegaConf.create())
+        # merge trainer cli with config
+        trainer_config = lightning_config.get("trainer", OmegaConf.create())
+
+        # default to gpu
+        trainer_config["accelerator"] = "gpu"
+
+        # only trainer options that differ from the parser defaults override the config
+        standard_args = default_trainer_args()
+        for k in standard_args:
+            if getattr(opt, k) != standard_args[k]:
+                trainer_config[k] = getattr(opt, k)
+
+        n_devices = getattr(opt, "n_devices", None)
+        if n_devices is not None:
+            assert isinstance(n_devices, int) and n_devices > 0
+            # build a device string of the form "0,1,...,n-1,"
+            devices = [str(i) for i in range(n_devices)]
+            trainer_config["devices"] = ",".join(devices) + ","
+        else:
+            assert "devices" in trainer_config, "Must specify either n_devices or devices"
+
+        ckpt_resume_path = opt.resume_from_checkpoint
+
+        if "devices" not in trainer_config and trainer_config["accelerator"] != "gpu":
+            del trainer_config["accelerator"]
+            cpu = True
+        else:
+            gpuinfo = trainer_config["devices"]
+            print(f"Running on GPUs {gpuinfo}")
+            cpu = False
+        trainer_opt = argparse.Namespace(**trainer_config)
+        lightning_config.trainer = trainer_config
+
+        # model
+        model = instantiate_from_config(config.model)
+
+        # use pretrained model
+        if not opt.resume or opt.finetune:
+            if not opt.finetune or not os.path.exists(opt.finetune):
+                default_ckpt = "ckpts/svd_xt.safetensors"
+                print(f"Loading pretrained model from {default_ckpt}")
+                svd = load_safetensors(default_ckpt)
+                for k in list(svd.keys()):
+                    if "time_embed" in k:  # duplicate a new timestep embedding from the pretrained weights
+                        svd[k.replace("time_embed", "cond_time_stack_embed")] = svd[k]
+            else:
+                ckpt_path = opt.finetune
+                print(f"Loading pretrained model from {ckpt_path}")
+                if ckpt_path.endswith("ckpt"):
+                    svd = torch.load(ckpt_path, map_location="cpu")["state_dict"]
+                elif ckpt_path.endswith("bin"):  # for deepspeed merged checkpoints
+                    svd = torch.load(ckpt_path, map_location="cpu")
+                    # NOTE(review): the `del` runs for every key, so keys without the
+                    # `_forward_module` marker are dropped rather than kept — confirm
+                    # this is intended.
+                    for k in list(svd.keys()):  # remove the prefix
+                        if "_forward_module" in k:
+                            svd[k.replace("_forward_module.", "")] = svd[k]
+                        del svd[k]
+                elif ckpt_path.endswith("safetensors"):
+                    svd = load_safetensors(ckpt_path)
+                else:
+                    raise NotImplementedError
+            missing, unexpected = model.load_state_dict(svd, strict=False)
+
+            # avoid empty weights when resuming from EMA weights
+            for miss_k in missing:
+                ema_name = miss_k.replace(".", "").replace("modeldiffusion_model", "model_ema.diffusion_model")
+                # Only back-fill keys that actually have an EMA twin in the checkpoint.
+                # Other missing keys used to abort here with a KeyError; they are now
+                # left alone and show up in the "Missing keys" report below.
+                if ema_name in svd:
+                    svd[miss_k] = svd[ema_name]
+                    print("Fill", miss_k, "with", ema_name)
+            missing, unexpected = model.load_state_dict(svd, strict=False)
+
+            if len(missing) > 0:
+                if not opt.finetune or not os.path.exists(opt.finetune):
+                    model.reinit_ema()
+                missing = [model_key for model_key in missing if "model_ema" not in model_key]
+                # print(f"Missing keys: {missing}")
+            print(f"Missing keys: {missing}")
+            # if len(unexpected) > 0:
+            #     print(f"Unexpected keys: {unexpected}")
+            print(f"Unexpected keys: {unexpected}")
+
+        # trainer and callbacks
+        trainer_kwargs = dict()
+
+        # default logger configs
+        default_logger_cfgs = {
+            "csv": {
+                "target": "pytorch_lightning.loggers.CSVLogger",
+                "params": {
+                    "name": "testtube",  # hack for sbord fanatics
+                    "save_dir": logdir
+                }
+            }
+        }
+        default_logger_cfg = default_logger_cfgs["csv"]
+        if "logger" in lightning_config:
+            logger_cfg = lightning_config.logger
+        else:
+            logger_cfg = OmegaConf.create()
+        logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
+        trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)
+
+        # use TrainResult/EvalResult(checkpoint_on=metric) to specify which metric is used to determine best models
+        default_modelckpt_cfg = {
+            "target": "pytorch_lightning.callbacks.ModelCheckpoint",
+            "params": {
+                "dirpath": ckptdir,
+                "filename": "{epoch:02}",
+                "verbose": True,
+                "save_last": True,
+                "save_top_k": -1
+            }
+        }
+        # if hasattr(model, "monitor"):
+        #     print(f"Monitoring {model.monitor} as checkpoint metric")
+        #     default_modelckpt_cfg["params"]["monitor"] = model.monitor
+        #     default_modelckpt_cfg["params"]["save_top_k"] = 3
+
+        if "modelcheckpoint" in lightning_config:
+            modelckpt_cfg = lightning_config.modelcheckpoint
+        else:
+            modelckpt_cfg = OmegaConf.create()
+        modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
+        print(f"Merged modelckpt-cfg: \n{modelckpt_cfg}")
+
+        # default to ddp if not further specified
+        default_strategy_config = {"target": "pytorch_lightning.strategies.DDPStrategy"}
+
+        if "strategy" in lightning_config:
+            strategy_cfg = lightning_config.strategy
+        else:
+            strategy_cfg = OmegaConf.create()
+            default_strategy_config["params"] = {
+                "find_unused_parameters": True
+            }
+        strategy_cfg = OmegaConf.merge(default_strategy_config, strategy_cfg)
+        print(
+            f"strategy config: \n ++++++++++++++ \n {strategy_cfg} \n ++++++++++++++ "
+        )
+        trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg)
+
+        # add callback which sets up log directory
+        default_callbacks_cfg = {
+            "setup_callback": {
+                "target": "train.SetupCallback",
+                "params": {
+                    "resume": opt.resume,
+                    "now": now,
+                    "logdir": logdir,
+                    "ckptdir": ckptdir,
+                    "cfgdir": cfgdir,
+                    "config": config,
+                    "lightning_config": lightning_config,
+                    "debug": opt.debug,
+                    "ckpt_name": melk_ckpt_name
+                }
+            },
+            "image_logger": {
+                "target": "train.ImageLogger",
+                "params": {
+                    "batch_frequency": 1000,
+                    "clamp": True
+                }
+            },
+            "learning_rate_logger": {
+                "target": "pytorch_lightning.callbacks.LearningRateMonitor",
+                "params": {
+                    "logging_interval": "step"
+                }
+            }
+        }
+        if version.parse(pl.__version__) >= version.parse("1.4.0"):
+            default_callbacks_cfg.update({"checkpoint_callback": modelckpt_cfg})
+
+        if "callbacks" in lightning_config:
+            callbacks_cfg = lightning_config.callbacks
+        else:
+            callbacks_cfg = OmegaConf.create()
+
+        # if "metrics_over_trainsteps_checkpoint" in callbacks_cfg:
+        #     print(
+        #         "WARNING: saving checkpoints every n train steps without deleting, this might require some free space"
+        #     )
+        #     default_metrics_over_trainsteps_ckpt_dict = {
+        #         "metrics_over_trainsteps_checkpoint": {
+        #             "target": "pytorch_lightning.callbacks.ModelCheckpoint",
+        #             "params": {
+        #                 "dirpath": os.path.join(ckptdir, "trainstep_checkpoints"),
+        #                 "filename": "{epoch:06}-{step:09}",
+        #                 "verbose": True,
+        #                 "save_top_k": -1,
+        #                 "every_n_train_steps": 10000,
+        #                 "save_weights_only": True
+        #             }
+        #         }
+        #     }
+        #     default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict)
+
+        callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
+        # the ignore-keys callback is only kept when there is a checkpoint to resume from
+        if "ignore_keys_callback" in callbacks_cfg and ckpt_resume_path is not None:
+            callbacks_cfg.ignore_keys_callback.params["ckpt_path"] = ckpt_resume_path
+        elif "ignore_keys_callback" in callbacks_cfg:
+            del callbacks_cfg["ignore_keys_callback"]
+
+        trainer_kwargs["callbacks"] = [
+            instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg
+        ]
+        if "plugins" not in trainer_kwargs:
+            trainer_kwargs["plugins"] = list()
+
+        # cmd line trainer args (which are in trainer_opt) have always priority over
+        # config-trainer-args (which are in trainer_kwargs)
+        trainer_opt = vars(trainer_opt)
+        trainer_kwargs = {
+            key: val for key, val in trainer_kwargs.items() if key not in trainer_opt
+        }
+        trainer = Trainer(**trainer_opt, **trainer_kwargs)
+
+        trainer.logdir = logdir
+
+        # data
+        data = instantiate_from_config(config.data)
+        # calling these ourselves should not be necessary, but it is
+        # lightning still takes care of proper multiprocessing though
+        data.prepare_data()
+        # data.setup()
+        print("#### Data #####")
+        try:
+            for k in data.datasets:
+                print(
+                    f"{k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}"
+                )
+        except Exception:
+            # best-effort summary only; unlike a bare `except:` this still lets
+            # KeyboardInterrupt / SystemExit through
+            print("Datasets not yet initialized")
+
+        # configure learning rate
+        if "batch_size" in config.data.params:
+            bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
+        else:
+            bs, base_lr = (
+                config.data.params.train.loader.batch_size,
+                config.model.base_learning_rate
+            )
+        if cpu:
+            ngpu = 1
+        else:
+            # count the entries of the comma-separated device string
+            ngpu = len(lightning_config.trainer.devices.strip(",").split(","))
+        if "accumulate_grad_batches" in lightning_config.trainer:
+            accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches
+        else:
+            accumulate_grad_batches = 1
+        print(f"accumulate_grad_batches = {accumulate_grad_batches}")
+        lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
+        if opt.scale_lr:
+            model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
+            print(
+                "Setting learning rate to "
+                f"{model.learning_rate:.2e} = {accumulate_grad_batches} (accumulate_grad_batches) * {ngpu} (num_gpus) * {bs} (batch_size) * {base_lr:.2e} (base_lr)"
+            )
+        else:
+            model.learning_rate = base_lr
+            print("++++ NOT USING LR SCALING ++++")
+            print(f"Setting learning rate to {model.learning_rate:.2e}")
+
+        # allow checkpointing via USR1
+        # (saving is currently commented out, so the handler only prints a message)
+        def melk(*args, **kwargs):
+            # run all checkpoint hooks
+            if trainer.global_rank == 0:
+                # print("Summoning checkpoint")
+                # if melk_ckpt_name is None:
+                #     ckpt_path = os.path.join(ckptdir, "last.ckpt")
+                # else:
+                #     ckpt_path = os.path.join(ckptdir, melk_ckpt_name)
+                # trainer.save_checkpoint(ckpt_path)
+                print("Exiting")
+
+        # drop into the pudb debugger on USR2 (rank 0 only)
+        def divein(*args, **kwargs):
+            if trainer.global_rank == 0:
+                import pudb
+                pudb.set_trace()
+
+        import signal
+
+        signal.signal(signal.SIGUSR1, melk)
+        signal.signal(signal.SIGUSR2, divein)
+
+        # run
+        if opt.train:
+            trainer.fit(model, data, ckpt_path=ckpt_resume_path)
+        if not opt.no_test and not trainer.interrupted:
+            trainer.test(model, data)
+    except RuntimeError:
+        # RuntimeErrors are re-raised as they are, without the debugger hook below
+        # if MULTINODE_HACKS:
+        #     import datetime
+        #     import os
+        #     import socket
+        #
+        #     import requests
+        #
+        #     device = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
+        #     hostname = socket.gethostname()
+        #     ts = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
+        #     resp = requests.get("http://169.254.169.254/latest/meta-data/instance-id")
+        #     print(
+        #         f"ERROR at {ts} "
+        #         f"on {hostname}/{resp.text} (CUDA_VISIBLE_DEVICES={device}): {type(err).__name__}: {err}",
+        #         flush=True
+        #     )
+        raise
+    except Exception:
+        # In debug mode open a post-mortem debugger on rank 0. If the Trainer was never
+        # created there is no rank to query yet, so the current process is used.
+        if opt.debug and (trainer is None or trainer.global_rank == 0):
+            try:
+                import pudb as debugger
+            except ImportError:
+                import pdb as debugger
+            debugger.post_mortem()
+        raise
+    finally:
+        # move newly created debug project to debug_runs
+        is_rank_zero = trainer is None or trainer.global_rank == 0
+        # Skip the move when the log directory was never created (e.g. the run failed
+        # during setup); otherwise os.rename would raise and hide the original error.
+        if opt.debug and not opt.resume and is_rank_zero and os.path.isdir(logdir):
+            dst, name = os.path.split(logdir)
+            dst = os.path.join(dst, "debug_runs", name)
+            os.makedirs(os.path.split(dst)[0], exist_ok=True)
+            os.rename(logdir, dst)
+
+        # if trainer.global_rank == 0:
+        #    print(trainer.profiler.summary())
diff --git a/vista/vwm/__init__.py b/vista/vwm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bbe255eb30bdbdc337219d1f1e75fe23156c2b
--- /dev/null
+++ b/vista/vwm/__init__.py
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .models import AutoencodingEngine, DiffusionEngine
+from .util import get_configs_path, instantiate_from_config
+
+__version__ = "0.1.0"
diff --git a/vista/vwm/data/__init__.py b/vista/vwm/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbdc17d593f266a15d2c8204084144d2ec26fce8
--- /dev/null
+++ b/vista/vwm/data/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from .dataset import StableDataModuleFromConfig
diff --git a/vista/vwm/data/dataset.py b/vista/vwm/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..047974a8fbf92978f5b44c73c6e02677f457210e
--- /dev/null
+++ b/vista/vwm/data/dataset.py
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import random
+from typing import Optional
+
+import torchdata.datapipes.iter
+import webdataset as wds
+from omegaconf import DictConfig
+from pytorch_lightning import LightningDataModule
+from torch.utils.data import DataLoader, Dataset
+
+from .subsets import NuScenesDataset, YouTubeDataset
+
+try:
+    from sdata import create_dataset, create_dummy_dataset, create_loader
+except ImportError:
+    print("#" * 100)
+    print("Datasets not yet available")
+    print("To enable, we need to add stable-datasets as a submodule")
+    print("Please use ``git submodule update --init --recursive``")
+    print("and do ``pip install -e stable-datasets/`` from the root of this repo")
+    print("#" * 100)
+    exit(1)
+
+
+class StableDataModuleFromConfig(LightningDataModule):
+    """Lightning data module whose data pipelines and loaders are built from configs.
+
+    Every split config must provide a ``datapipeline`` entry (keyword arguments for the
+    dataset factory) and a ``loader`` entry (keyword arguments for ``create_loader``).
+    """
+
+    def __init__(
+            self,
+            train: DictConfig,
+            validation: Optional[DictConfig] = None,
+            test: Optional[DictConfig] = None,
+            skip_val_loader: bool = False,
+            dummy: bool = False
+    ):
+        """Store and validate the per-split configs.
+
+        Args:
+            train: Config of the training split (required).
+            validation: Config of the validation split. When omitted and
+                ``skip_val_loader`` is false, the training config is reused.
+            test: Optional config of the test split.
+            skip_val_loader: If true, the validation config is neither checked nor
+                replaced by the training config.
+            dummy: If true, ``setup`` uses ``create_dummy_dataset`` instead of
+                ``create_dataset``.
+        """
+        super().__init__()
+
+        self.train_config = train
+        train_ok = "datapipeline" in self.train_config and "loader" in self.train_config
+        assert train_ok, "Train config requires the fields `datapipeline` and `loader`"
+
+        self.val_config = validation
+        if not skip_val_loader:
+            if self.val_config is None:
+                # fall back to the training split for validation
+                print(
+                    "WARNING: no validation datapipeline defined, using that one from training"
+                )
+                self.val_config = train
+            else:
+                val_ok = "datapipeline" in self.val_config and "loader" in self.val_config
+                assert val_ok, "Validation config requires the fields `datapipeline` and `loader`"
+
+        self.test_config = test
+        if self.test_config is not None:
+            test_ok = "datapipeline" in self.test_config and "loader" in self.test_config
+            assert test_ok, "Test config requires the fields `datapipeline` and `loader`"
+
+        self.dummy = dummy
+        if self.dummy:
+            banner = "#" * 100
+            print(banner)
+            print("Using dummy dataset, hope you are debugging")
+            print(banner)
+
+    def setup(self, stage: str) -> None:
+        """Build the data pipeline of every configured split (``stage`` is not used)."""
+        print("Preparing datasets")
+        data_fn = create_dummy_dataset if self.dummy else create_dataset
+
+        self.train_data_pipeline = data_fn(**self.train_config.datapipeline)
+        # validation and test pipelines exist only when their config is set
+        optional_splits = (
+            ("val_data_pipeline", self.val_config),
+            ("test_data_pipeline", self.test_config),
+        )
+        for attr_name, split_cfg in optional_splits:
+            if split_cfg:
+                setattr(self, attr_name, data_fn(**split_cfg.datapipeline))
+
+    def train_dataloader(self) -> torchdata.datapipes.iter.IterDataPipe:
+        """Create the loader of the training pipeline."""
+        loader_kwargs = self.train_config.loader
+        return create_loader(self.train_data_pipeline, **loader_kwargs)
+
+    def val_dataloader(self) -> wds.DataPipeline:
+        """Create the loader of the validation pipeline."""
+        loader_kwargs = self.val_config.loader
+        return create_loader(self.val_data_pipeline, **loader_kwargs)
+
+    def test_dataloader(self) -> wds.DataPipeline:
+        """Create the loader of the test pipeline."""
+        loader_kwargs = self.test_config.loader
+        return create_loader(self.test_data_pipeline, **loader_kwargs)
+
+
+def dataset_mapping(subset_list: list, target_height: int, target_width: int, num_frames: int):
+    """Instantiate one dataset per name in ``subset_list``, in the given order.
+
+    Known names are ``"YouTube"`` and ``"NuScenes"``. All datasets receive the same
+    ``target_height``, ``target_width`` and ``num_frames``.
+
+    Raises:
+        NotImplementedError: If a name in ``subset_list`` is not a known subset.
+    """
+    shared_kwargs = {
+        "target_height": target_height,
+        "target_width": target_width,
+        "num_frames": num_frames,
+    }
+    built = []
+    for subset_name in subset_list:
+        if subset_name == "YouTube":
+            dataset_cls = YouTubeDataset
+        elif subset_name == "NuScenes":
+            dataset_cls = NuScenesDataset
+        else:
+            raise NotImplementedError(f"Please define {subset_name} as a subset")
+        built.append(dataset_cls(**shared_kwargs))
+    return built
+
+
+class MultiSourceSamplerDataset(Dataset):
+    """Dataset that draws every item at random from one of several weighted subsets.
+
+    The requested index is ignored; the reported length is ``samples_per_epoch``.
+    """
+
+    def __init__(self, subsets, probs, samples_per_epoch=1000, target_height=320, target_width=576, num_frames=25):
+        """Build the subsets and normalise their sampling weights.
+
+        Args:
+            subsets: Subset names understood by ``dataset_mapping``.
+            probs: One weight per subset, or ``None`` to weight each subset by its
+                length (i.e. sample uniformly over all samples).
+            samples_per_epoch: Value reported by ``__len__``.
+            target_height: Forwarded to every subset.
+            target_width: Forwarded to every subset.
+            num_frames: Forwarded to every subset.
+        """
+        self.subsets = dataset_mapping(subsets, target_height, target_width, num_frames)
+        # if probabilities not provided, sample uniformly from all samples
+        if probs is None:
+            probs = [len(d) for d in self.subsets]
+        # normalize
+        total_prob = sum(probs)
+        self.sample_probs = [x / total_prob for x in probs]
+        self.samples_per_epoch = samples_per_epoch
+
+    def __len__(self):
+        return self.samples_per_epoch
+
+    def __getitem__(self, index):
+        """Return a randomly drawn item.
+
+        Args:
+        ----
+            index (int): Index (ignored since we sample randomly).
+
+        Returns:
+        -------
+            The item produced by the randomly chosen subset.
+
+        """
+
+        # randomly select a subset based on weights
+        subset = random.choices(self.subsets, self.sample_probs)[0]
+
+        # Keep drawing until a sample loads successfully. Only ordinary errors are
+        # retried: a bare `except:` would also swallow KeyboardInterrupt / SystemExit
+        # and make this loop impossible to interrupt.
+        while True:
+            try:
+                return random.choice(subset)
+            except Exception:
+                continue
+
+
+class Sampler(LightningDataModule):
+    """Lightning data module serving one ``MultiSourceSamplerDataset`` for all splits."""
+
+    def __init__(self, batch_size, num_workers=0, prefetch_factor=2, shuffle=True, subsets=None, probs=None,
+                 samples_per_epoch=None, target_height=320, target_width=576, num_frames=25):
+        """Store the loader settings and build the shared dataset.
+
+        Args:
+            batch_size: Batch size of every loader.
+            num_workers: Number of loader worker processes.
+            prefetch_factor: Forwarded to the loaders only when ``num_workers > 0``.
+            shuffle: Whether the training loader shuffles.
+            subsets: Subset names forwarded to ``MultiSourceSamplerDataset``.
+            probs: Subset weights forwarded to ``MultiSourceSamplerDataset``.
+            samples_per_epoch: Epoch length; ``None`` keeps the dataset's own default.
+            target_height: Forwarded to the dataset.
+            target_width: Forwarded to the dataset.
+            num_frames: Forwarded to the dataset.
+        """
+        super().__init__()
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.prefetch_factor = prefetch_factor if num_workers > 0 else 0
+        self.shuffle = shuffle
+
+        dataset_kwargs = dict(
+            subsets=subsets, probs=probs,
+            target_height=target_height, target_width=target_width, num_frames=num_frames
+        )
+        # Forwarding None would override the dataset's default epoch length and make
+        # len(dataset) fail, so pass the value only when one was given.
+        if samples_per_epoch is not None:
+            dataset_kwargs["samples_per_epoch"] = samples_per_epoch
+        self.train_dataset = MultiSourceSamplerDataset(**dataset_kwargs)
+
+    def _make_loader(self, shuffle):
+        """Create a DataLoader over the shared dataset with the stored settings."""
+        loader_kwargs = dict(
+            batch_size=self.batch_size,
+            shuffle=shuffle,
+            num_workers=self.num_workers
+        )
+        # DataLoader rejects an explicit prefetch_factor when no worker processes are
+        # used, so the option is passed only in the multi-process case.
+        if self.num_workers > 0:
+            loader_kwargs["prefetch_factor"] = self.prefetch_factor
+        return DataLoader(self.train_dataset, **loader_kwargs)
+
+    def prepare_data(self):
+        pass
+
+    def train_dataloader(self):
+        return self._make_loader(shuffle=self.shuffle)
+
+    def test_dataloader(self):
+        # we disable online testing to improve training efficiency
+        return self._make_loader(shuffle=False)
+
+    def val_dataloader(self):
+        return self._make_loader(shuffle=False)
diff --git a/vista/vwm/data/subsets/__init__.py b/vista/vwm/data/subsets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c44611b682c7cb8e249b24ea54e156af5667ffb4
--- /dev/null
+++ b/vista/vwm/data/subsets/__init__.py
@@ -0,0 +1,4 @@
+from __future__ import annotations
+
+from .nuscenes import NuScenesDataset
+from .youtube import YouTubeDataset
diff --git a/vista/vwm/data/subsets/common.py b/vista/vwm/data/subsets/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..1598d24ab5768526f2746f949154cc17c321207a
--- /dev/null
+++ b/vista/vwm/data/subsets/common.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+import json
+
+import torch
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+
+
+class BaseDataset(Dataset):
+    """Frame-sequence dataset backed by JSON annotation file(s); subclasses implement ``get_image_path``."""
+
+    def __init__(self, data_root, anno_file, target_height=320, target_width=576, num_frames=25):
+        self.data_root = data_root
+
+        assert target_height % 64 == 0 and target_width % 64 == 0, "Resize to integer multiple of 64"
+        self.img_preprocessor = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Lambda(lambda x: x * 2.0 - 1.0)  # maps the ToTensor output through x * 2 - 1
+        ])
+
+        if isinstance(anno_file, list):  # several annotation files: concatenate their samples in order
+            self.samples = list()
+            for each_file in anno_file:
+                with open(each_file) as anno_json:
+                    self.samples += json.load(anno_json)
+        else:
+            with open(anno_file) as anno_json:
+                self.samples = json.load(anno_json)
+
+        self.target_height = target_height
+        self.target_width = target_width
+        self.num_frames = num_frames
+
+    def preprocess_image(self, image_path):
+        """Load an image, center-crop it to the target aspect ratio, resize it and return the preprocessed tensor."""
+        with Image.open(image_path) as image:  # closes the file on exit; crop/resize below return new images
+            ori_w, ori_h = image.size
+            if ori_w / ori_h > self.target_width / self.target_height:
+                tmp_w = int(self.target_width / self.target_height * ori_h)
+                left = (ori_w - tmp_w) // 2
+                right = (ori_w + tmp_w) // 2
+                image = image.crop((left, 0, right, ori_h))
+            elif ori_w / ori_h < self.target_width / self.target_height:
+                tmp_h = int(self.target_height / self.target_width * ori_w)
+                top = (ori_h - tmp_h) // 2
+                bottom = (ori_h + tmp_h) // 2
+                image = image.crop((0, top, ori_w, bottom))
+            image = image.resize((self.target_width, self.target_height), resample=Image.LANCZOS)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+        return self.img_preprocessor(image)
+
+    def get_image_path(self, sample_dict, current_index):
+        """Return the path of frame ``current_index`` of ``sample_dict``; every subclass must override this."""
+        raise NotImplementedError("Subclasses of BaseDataset must implement get_image_path")
+
+    def build_data_dict(self, image_seq, sample_dict):
+        """Assemble the sample dict from the frame tensors; ``sample_dict`` is unused here (subclasses read it)."""
+        # conditioning augmentation is disabled: with a zero strength the added noise term vanishes
+        cond_aug = torch.tensor([0.0])
+        data_dict = {
+            "img_seq": torch.stack(image_seq),
+            "motion_bucket_id": torch.tensor([127]),
+            "fps_id": torch.tensor([9]),
+            "cond_frames_without_noise": image_seq[0],
+            "cond_frames": image_seq[0] + cond_aug * torch.randn_like(image_seq[0]),
+            "cond_aug": cond_aug
+        }
+        return data_dict
+
+    def __len__(self):
+        return len(self.samples)
+
+    def __getitem__(self, index):
+        """Load frames ``0 .. num_frames - 1`` of sample ``index`` and build its data dict."""
+        sample_dict = self.samples[index]
+        # frames are resolved through the subclass hook and preprocessed one by one, in order
+        image_seq = [
+            self.preprocess_image(self.get_image_path(sample_dict, frame_index))
+            for frame_index in range(self.num_frames)
+        ]
+        return self.build_data_dict(image_seq, sample_dict)
diff --git a/vista/vwm/data/subsets/nuscenes.py b/vista/vwm/data/subsets/nuscenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b21f7bbf44997e12bc1d095114df1cfe277ac9b
--- /dev/null
+++ b/vista/vwm/data/subsets/nuscenes.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import os
+
+import torch
+
+from .common import BaseDataset
+
+
+def balance_with_actions(samples, increase_factor=5, exceptions=None):
+    """Return ``samples`` plus ``increase_factor - 1`` extra copies of each sample whose "cmd" is not excepted."""
+    skipped_cmds = [2, 3] if exceptions is None else exceptions
+    duplicates = list()
+    if increase_factor > 1:
+        for sample in samples:
+            if sample["cmd"] not in skipped_cmds:
+                # the extra entries are references to the same dict, appended after all originals
+                duplicates.extend([sample] * (increase_factor - 1))
+    return samples + duplicates
+
+
+def resample_complete_samples(samples, increase_factor=5):
+    """Return ``samples`` plus extra copies of each sample with speed, angle, positive z and an in-range goal."""
+    duplicates = list()
+    for sample in (samples if increase_factor > 1 else ()):
+        if not (sample["speed"] and sample["angle"] and sample["z"] > 0):
+            continue
+        if 0 < sample["goal"][0] < 1600 and 0 < sample["goal"][1] < 900:
+            duplicates.extend([sample] * (increase_factor - 1))
+    return samples + duplicates
+
+
+class NuScenesDataset(BaseDataset):
+    """nuScenes frame sequences; each item carries one kind of action annotation picked by ``action_mod``.
+
+    ``action_mod`` values: 0 adds "trajectory", 1 "command", 2 "speed"/"angle", 3 "goal".
+    """
+
+    def __init__(self, data_root="data/nuscenes", anno_file="annos/nuScenes.json",
+                 target_height=320, target_width=576, num_frames=25):
+        if not os.path.exists(data_root):
+            raise ValueError(f"Cannot find dataset {data_root}")
+        # BaseDataset also accepts a list of annotation files; check each entry, because handing the
+        # list itself to os.path.exists raises TypeError instead of reporting the missing file
+        for each_file in (anno_file if isinstance(anno_file, list) else [anno_file]):
+            if not os.path.exists(each_file):
+                raise ValueError(f"Cannot find annotation {each_file}")
+        super().__init__(data_root, anno_file, target_height, target_width, num_frames)
+        print("nuScenes loaded:", len(self))
+        self.samples = balance_with_actions(self.samples, increase_factor=5)
+        print("nuScenes balanced:", len(self))
+        self.samples = resample_complete_samples(self.samples, increase_factor=2)
+        print("nuScenes resampled:", len(self))
+        self.action_mod = 0
+
+    def get_image_path(self, sample_dict, current_index):
+        """Join ``data_root`` with the sample's stored relative path of frame ``current_index``."""
+        return os.path.join(self.data_root, sample_dict["frames"][current_index])
+
+    def build_data_dict(self, image_seq, sample_dict):
+        """Return the base data dict plus the action annotation selected by ``self.action_mod``."""
+        # the image/conditioning entries are the same as BaseDataset's, so reuse them instead of duplicating
+        data_dict = super().build_data_dict(image_seq, sample_dict)
+        if self.action_mod == 0:
+            data_dict["trajectory"] = torch.tensor(sample_dict["traj"][2:])
+        elif self.action_mod == 1:
+            data_dict["command"] = torch.tensor(sample_dict["cmd"])
+        elif self.action_mod == 2:
+            # scene might be empty
+            if sample_dict["speed"]:
+                data_dict["speed"] = torch.tensor(sample_dict["speed"][1:])
+            # scene might be empty
+            if sample_dict["angle"]:
+                data_dict["angle"] = torch.tensor(sample_dict["angle"][1:]) / 780
+        elif self.action_mod == 3:
+            # point might be invalid
+            if sample_dict["z"] > 0 and 0 < sample_dict["goal"][0] < 1600 and 0 < sample_dict["goal"][1] < 900:
+                data_dict["goal"] = torch.tensor([
+                    sample_dict["goal"][0] / 1600,
+                    sample_dict["goal"][1] / 900
+                ])
+        else:
+            raise ValueError(f"Unsupported action_mod: {self.action_mod}")
+        return data_dict
+
+    def __getitem__(self, index):
+        """Advance ``action_mod`` by ``index`` (mod 4), then load the frames and build the data dict."""
+        sample_dict = self.samples[index]
+        # NOTE: this is per-object state, so the annotation kind depends on the order of previous calls
+        self.action_mod = (self.action_mod + index) % 4
+        image_seq = [
+            self.preprocess_image(self.get_image_path(sample_dict, frame_index))
+            for frame_index in range(self.num_frames)
+        ]
+        return self.build_data_dict(image_seq, sample_dict)
diff --git a/vista/vwm/data/subsets/youtube.py b/vista/vwm/data/subsets/youtube.py
new file mode 100644
index 0000000000000000000000000000000000000000..7060eb3203b354945c97f7baa28cc6f9d443b25c
--- /dev/null
+++ b/vista/vwm/data/subsets/youtube.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import os
+
+from .common import BaseDataset
+
+
+class YouTubeDataset(BaseDataset):
+    def __init__(self, data_root="data/YouTube", anno_file="annos/YouTube.json",
+                 target_height=320, target_width=576, num_frames=25):
+        if not os.path.exists(data_root):
+            raise ValueError(f"Cannot find dataset {data_root}")
+        # BaseDataset also accepts a list of annotation files; os.path.exists cannot take the list itself
+        for each_file in (anno_file if isinstance(anno_file, list) else [anno_file]):
+            if not os.path.exists(each_file):
+                raise ValueError(f"Cannot find annotation {each_file}")
+        super().__init__(data_root, anno_file, target_height, target_width, num_frames)
+        print("YouTube loaded:", len(self))
+
+    def get_image_path(self, sample_dict, current_index):
+        """Offset the zero-padded index in "first_frame" by ``current_index`` and join it under the sample folder."""
+        idx_str, ext_str = sample_dict["first_frame"].split(".")
+        file_name = str(int(idx_str) + current_index).zfill(len(idx_str)) + "." + ext_str
+        return os.path.join(self.data_root, sample_dict["folder_name"], file_name)
diff --git a/vista/vwm/lr_scheduler.py b/vista/vwm/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c34b597da783394096c5c5058e89627e970bfe30
--- /dev/null
+++ b/vista/vwm/lr_scheduler.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import numpy as np
+
+
+class LambdaWarmUpCosineScheduler:
+    """NOTE: use with a base_lr of 1.0."""
+
+    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
+        self.lr_warm_up_steps = warm_up_steps
+        self.lr_start = lr_start
+        self.lr_min = lr_min
+        self.lr_max = lr_max
+        self.lr_max_decay_steps = max_decay_steps
+        self.last_lr = 0.0
+        self.verbosity_interval = verbosity_interval
+
+    def schedule(self, n, **kwargs):
+        """Return the multiplier for step ``n``: linear warm-up to ``lr_max``, then cosine decay to ``lr_min``."""
+        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
+            print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+        if n < self.lr_warm_up_steps:
+            lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
+        else:
+            decay_steps = self.lr_max_decay_steps - self.lr_warm_up_steps
+            # a zero-length decay window counts as already finished instead of dividing by zero
+            t = min((n - self.lr_warm_up_steps) / decay_steps, 1.0) if decay_steps != 0 else 1.0
+            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (1 + np.cos(t * np.pi))
+        self.last_lr = lr
+        return lr
+
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+
+
+class LambdaWarmUpCosineScheduler2:
+    """
+    Supports repeated iterations, configurable via lists.
+
+    NOTE: use with a base_lr of 1.0.
+    """
+
+    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
+        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
+        self.lr_warm_up_steps = warm_up_steps
+        self.f_start = f_start
+        self.f_min = f_min
+        self.f_max = f_max
+        self.cycle_lengths = cycle_lengths
+        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
+        self.last_f = 0.0
+        self.verbosity_interval = verbosity_interval
+
+    def find_in_interval(self, n):
+        """Return the index of the cycle containing global step ``n`` (a boundary step belongs to the earlier cycle)."""
+        for interval, cycle_end in enumerate(self.cum_cycles[1:]):
+            if n <= cycle_end:
+                return interval
+        # falling through used to return None, which only failed later with an obscure indexing error
+        raise ValueError(f"Step {n} is beyond the last configured cycle (total steps: {self.cum_cycles[-1]})")
+
+    def schedule(self, n, **kwargs):
+        """Return the multiplier for global step ``n`` using the warm-up/cosine settings of its cycle."""
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]  # from here on n is the step index relative to the cycle start
+        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
+            print(f"Current step: {n}, recent lr-multiplier: {self.last_f}, current cycle: {cycle}")
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+        else:
+            decay_steps = self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
+            # a cycle that is all warm-up has no decay phase; treat it as finished instead of computing 0 / 0
+            t = min((n - self.lr_warm_up_steps[cycle]) / decay_steps, 1.0) if decay_steps != 0 else 1.0
+            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (1 + np.cos(t * np.pi))
+        self.last_f = f
+        return f
+
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+
+
+class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
+    """Cycle-based schedule: linear warm-up, then a linear decay that reaches ``f_min`` at the cycle's last step."""
+
+    def schedule(self, n, **kwargs):
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
+            print(f"Current step: {n}, recent lr-multiplier: {self.last_f}, current cycle: {cycle}")
+        f_min, f_max, cycle_length = self.f_min[cycle], self.f_max[cycle], self.cycle_lengths[cycle]
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (f_max - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+        else:
+            f = f_min + (f_max - f_min) * (cycle_length - n) / cycle_length
+        self.last_f = f
+        return f
diff --git a/vista/vwm/models/__init__.py b/vista/vwm/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8a77e959046a16602bcc6419cc003f3fdec4a12
--- /dev/null
+++ b/vista/vwm/models/__init__.py
@@ -0,0 +1,4 @@
+from __future__ import annotations
+
+from .autoencoder import AutoencodingEngine
+from .diffusion import DiffusionEngine
diff --git a/vista/vwm/models/autoencoder.py b/vista/vwm/models/autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f313432ba9f29965e3bcccc81faae5681472dd
--- /dev/null
+++ b/vista/vwm/models/autoencoder.py
@@ -0,0 +1,530 @@
+from __future__ import annotations
+
+import math
+import re
+from abc import abstractmethod
+from contextlib import contextmanager
+from typing import Any, Optional, Union
+
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from packaging import version
+from pytorch_lightning import LightningModule
+
+from ..modules.autoencoding.regularizers import AbstractRegularizer
+from ..modules.ema import LitEma
+from ..util import default, get_obj_from_str, instantiate_from_config
+
+
+class AbstractAutoencoder(LightningModule):
+    """
+    This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
+    unCLIP models, etc. Hence, it is fairly general, and specific features
+    (e.g. discriminator training, encoding, decoding) must be implemented in subclasses.
+    """
+
+    def __init__(
+            self,
+            ema_decay: Union[None, float] = None,
+            monitor: Union[None, str] = None,
+            input_key: str = "img"
+    ):
+        super().__init__()
+        self.input_key = input_key
+        self.use_ema = ema_decay is not None  # EMA tracking is enabled simply by passing a decay value
+        if monitor is not None:  # self.monitor is only defined when a monitor key is given
+            self.monitor = monitor
+
+        if self.use_ema:
+            self.model_ema = LitEma(self, decay=ema_decay)
+            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}")
+
+        if version.parse(pl.__version__) >= version.parse("2.0.0"):  # manual optimization on Lightning >= 2.0
+            self.automatic_optimization = False
+
+    def apply_ckpt(self, ckpt: Union[None, str, dict]):
+        if ckpt is None:  # nothing to load
+            return
+        elif isinstance(ckpt, str):  # a bare path is wrapped into a CheckpointEngine config
+            ckpt = {
+                "target": "vista.vwm.modules.checkpoint.CheckpointEngine",
+                "params": {"ckpt_path": ckpt}
+            }
+        engine = instantiate_from_config(ckpt)
+        engine(self)  # the instantiated engine is called with this module as its only argument
+
+    @abstractmethod
+    def get_input(self, batch) -> Any:
+        raise NotImplementedError
+
+    def on_train_batch_end(self, *args, **kwargs):
+        # for EMA computation
+        if self.use_ema:
+            self.model_ema(self)
+
+    @contextmanager
+    def ema_scope(self, context=None):
+        if self.use_ema:
+            self.model_ema.store(self.parameters())  # presumably stashes the current weights — confirm in LitEma
+            self.model_ema.copy_to(self)
+            if context is not None:
+                print(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:  # runs even if the with-body raises
+            if self.use_ema:
+                self.model_ema.restore(self.parameters())
+                if context is not None:
+                    print(f"{context}: Restored training weights")
+
+    @abstractmethod
+    def encode(self, *args, **kwargs) -> torch.Tensor:
+        raise NotImplementedError("encode()-method of abstract base class called")
+
+    @abstractmethod
+    def decode(self, *args, **kwargs) -> torch.Tensor:
+        raise NotImplementedError("decode()-method of abstract base class called")
+
+    def instantiate_optimizer_from_config(self, params, lr, cfg):
+        print(f"Loading >>> {cfg['target']} <<< optimizer from config")
+        return get_obj_from_str(cfg["target"])(  # cfg["target"] names the optimizer class to resolve
+            params, lr=lr, **cfg.get("params", dict())
+        )
+
+    def configure_optimizers(self) -> Any:
+        raise NotImplementedError
+
+
+class AutoencodingEngine(AbstractAutoencoder):
+    """
+    Base class for all image autoencoders that we train, like VQGAN or AutoencoderKL
+    (we also restore them explicitly as special cases for legacy reasons).
+    Regularizations such as KL or VQ are moved to the regularizer class.
+    """
+
+    def __init__(
+            self,
+            *args,
+            encoder_config: dict,
+            decoder_config: dict,
+            loss_config: dict,
+            regularizer_config: dict,
+            optimizer_config: Union[dict, None] = None,
+            lr_g_factor: float = 1.0,
+            trainable_ae_params: Optional[list[list[str]]] = None,
+            ae_optimizer_args: Optional[list[dict]] = None,
+            trainable_disc_params: Optional[list[list[str]]] = None,
+            disc_optimizer_args: Optional[list[dict]] = None,
+            disc_start_iter: int = 0,
+            diff_boost_factor: float = 3.0,
+            ckpt_engine: Union[None, str, dict] = None,
+            ckpt_path: Optional[str] = None,
+            additional_decode_keys: Optional[list[str]] = None,
+            **kwargs
+    ):
+        """Instantiate encoder, decoder, loss and regularizer from their configs and optionally apply a checkpoint."""
+        super().__init__(*args, **kwargs)
+        if version.parse(pl.__version__) >= version.parse("2.0.0"):  # pytorch lightning
+            self.automatic_optimization = False
+
+        self.encoder: nn.Module = instantiate_from_config(encoder_config)
+        self.decoder: nn.Module = instantiate_from_config(decoder_config)
+        self.loss: nn.Module = instantiate_from_config(loss_config)
+        self.regularization: AbstractRegularizer = instantiate_from_config(regularizer_config)
+        self.optimizer_config = default(optimizer_config, {"target": "torch.optim.AdamW"})
+        self.diff_boost_factor = diff_boost_factor
+        self.disc_start_iter = disc_start_iter
+        self.lr_g_factor = lr_g_factor
+        # per-group optimizer kwargs default to one empty dict per group of parameter-name patterns
+        self.trainable_ae_params = trainable_ae_params
+        if self.trainable_ae_params is not None:
+            self.ae_optimizer_args = default(
+                ae_optimizer_args,
+                [dict() for _ in range(len(self.trainable_ae_params))]
+            )
+            assert len(self.ae_optimizer_args) == len(self.trainable_ae_params)
+        else:
+            self.ae_optimizer_args = [dict()]  # makes type consistent
+
+        self.trainable_disc_params = trainable_disc_params
+        if self.trainable_disc_params is not None:
+            self.disc_optimizer_args = default(
+                disc_optimizer_args,
+                [dict() for _ in range(len(self.trainable_disc_params))]
+            )
+            assert len(self.disc_optimizer_args) == len(self.trainable_disc_params)
+        else:
+            self.disc_optimizer_args = [dict()]  # makes type consistent
+
+        if ckpt_path is not None:
+            assert ckpt_engine is None, "Cannot set ckpt_engine and ckpt_path"
+            print("Checkpoint path is deprecated, use `ckpt_engine` instead")
+        self.apply_ckpt(default(ckpt_path, ckpt_engine))
+        self.additional_decode_keys = set(default(additional_decode_keys, list()))
+
+    def get_input(self, batch: dict) -> torch.Tensor:
+        # assuming unified data format, dataloader returns a dict.
+        # image tensors should be scaled to -1 ... 1 and in channels-first format (e.g., bchw instead of bhwc)
+        return batch[self.input_key]
+
+    def get_autoencoder_params(self) -> list:
+        """Collect trainable loss and regularizer parameters (when exposed), then encoder and decoder parameters."""
+        collected = list()
+        for owner, getter in ((self.loss, "get_trainable_autoencoder_parameters"),
+                              (self.regularization, "get_trainable_parameters")):
+            if hasattr(owner, getter):
+                collected.extend(getattr(owner, getter)())
+        collected.extend(self.encoder.parameters())
+        return collected + list(self.decoder.parameters())
+
+    def get_discriminator_params(self) -> list:
+        """Return the loss module's trainable parameters, or an empty list when it exposes none."""
+        if not hasattr(self.loss, "get_trainable_parameters"):
+            return list()
+        # e.g., discriminator
+        return list(self.loss.get_trainable_parameters())
+
+    def get_last_layer(self):
+        return self.decoder.get_last_layer()  # delegated to the decoder; handed to the loss as "last_layer"
+
+    def encode(
+            self,
+            x: torch.Tensor,
+            return_reg_log: bool = False,
+            unregularized: bool = False
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict]]:
+        """Encode ``x``; ``unregularized`` returns the raw encoder output together with an empty log dict."""
+        latent = self.encoder(x)
+        if unregularized:
+            return latent, dict()
+        latent, reg_log = self.regularization(latent)
+        # the regularizer log is only handed back on request
+        if return_reg_log:
+            return latent, reg_log
+        return latent
+
+    def decode(self, z: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Run the decoder on latent ``z``, forwarding any extra keyword arguments."""
+        return self.decoder(z, **kwargs)
+
+    def forward(
+            self, x: torch.Tensor, **additional_decode_kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor, dict]:
+        """Encode then decode ``x``; returns ``(latent, reconstruction, regularization_log)``."""
+        latent, reg_log = self.encode(x, return_reg_log=True)
+        return latent, self.decode(latent, **additional_decode_kwargs), reg_log
+
+    def inner_training_step(
+            self, batch: dict, batch_idx: int, optimizer_idx: int = 0
+    ) -> torch.Tensor:
+        x = self.get_input(batch)
+        additional_decode_kwargs = {  # only the configured extra keys that are present in this batch
+            key: batch[key] for key in self.additional_decode_keys.intersection(batch)
+        }
+        z, x_reconstruct, regularization_log = self(x, **additional_decode_kwargs)
+        if hasattr(self.loss, "forward_keys"):  # the loss declares which extra inputs it wants
+            extra_info = {
+                "z": z,
+                "optimizer_idx": optimizer_idx,
+                "global_step": self.global_step,
+                "last_layer": self.get_last_layer(),
+                "split": "train",
+                "regularization_log": regularization_log,
+                "autoencoder": self
+            }
+            extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
+        else:
+            extra_info = dict()
+
+        if optimizer_idx == 0:
+            # autoencoder
+            out_loss = self.loss(x, x_reconstruct, **extra_info)
+            if isinstance(out_loss, tuple):
+                ae_loss, log_dict_ae = out_loss
+            else:
+                # simple loss function
+                ae_loss = out_loss
+                log_dict_ae = {"train/loss/rec": ae_loss.detach()}
+
+            self.log_dict(
+                log_dict_ae,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=True,
+                sync_dist=False
+            )
+            self.log(  # progress-bar-only copy of the scalar loss
+                "loss",
+                ae_loss.mean().detach(),
+                prog_bar=True,
+                logger=False,
+                on_epoch=False,
+                on_step=True
+            )
+            return ae_loss
+        elif optimizer_idx == 1:
+            # discriminator
+            disc_loss, log_dict_disc = self.loss(x, x_reconstruct, **extra_info)
+            # discriminator always needs to return a tuple
+            self.log_dict(
+                log_dict_disc,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=True
+            )
+            return disc_loss
+        else:
+            raise NotImplementedError(f"Unknown optimizer {optimizer_idx}")
+
+    def training_step(self, batch: dict, batch_idx: int):
+        opts = self.optimizers()
+        if not isinstance(opts, list):
+            # non-adversarial case
+            opts = [opts]
+        optimizer_idx = batch_idx % len(opts)  # alternate between the optimizers from batch to batch
+        if self.global_step < self.disc_start_iter:  # before disc_start_iter only optimizer 0 is stepped
+            optimizer_idx = 0
+        opt = opts[optimizer_idx]
+        opt.zero_grad()
+        with opt.toggle_model():
+            loss = self.inner_training_step(batch, batch_idx, optimizer_idx=optimizer_idx)
+            self.manual_backward(loss)
+        opt.step()
+
+    def validation_step(self, batch: dict, batch_idx: int) -> dict:
+        log_dict = self._validation_step(batch, batch_idx)  # pass with the current weights
+        with self.ema_scope():  # second pass inside the EMA scope, logged under the "_ema" postfix
+            log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
+            log_dict.update(log_dict_ema)
+        return log_dict
+
+    def _validation_step(self, batch: dict, batch_idx: int, postfix: str = "") -> dict:
+        x = self.get_input(batch)
+
+        z, x_reconstruct, regularization_log = self(x)
+        if hasattr(self.loss, "forward_keys"):  # the loss declares which extra inputs it wants
+            extra_info = {
+                "z": z,
+                "optimizer_idx": 0,
+                "global_step": self.global_step,
+                "last_layer": self.get_last_layer(),
+                "split": "val" + postfix,
+                "regularization_log": regularization_log,
+                "autoencoder": self
+            }
+            extra_info = {k: extra_info[k] for k in self.loss.forward_keys}
+        else:
+            extra_info = dict()
+        out_loss = self.loss(x, x_reconstruct, **extra_info)
+        if isinstance(out_loss, tuple):
+            ae_loss, log_dict_ae = out_loss
+        else:
+            # simple loss function
+            ae_loss = out_loss
+            log_dict_ae = {f"val{postfix}/loss/rec": ae_loss.detach()}
+        full_log_dict = log_dict_ae  # same dict object: the update below also extends log_dict_ae
+
+        if "optimizer_idx" in extra_info:  # the loss takes optimizer_idx, so also evaluate it with index 1
+            extra_info["optimizer_idx"] = 1
+            _disc_loss, log_dict_disc = self.loss(x, x_reconstruct, **extra_info)
+            full_log_dict.update(log_dict_disc)
+        self.log(
+            f"val{postfix}/loss/rec",
+            log_dict_ae[f"val{postfix}/loss/rec"],  # NOTE(review): assumes a tuple-returning loss logs this key — confirm
+            sync_dist=True
+        )
+        self.log_dict(
+            full_log_dict,
+            sync_dist=True
+        )
+        return full_log_dict
+
+    def get_param_groups(
+            self, parameter_names: list[list[str]], optimizer_args: list[dict]
+    ) -> tuple[list[dict[str, Any]], int]:
+        """Build one optimizer param group per list of name regexes and count the matched parameter elements."""
+        groups = list()
+        num_params = 0
+        for names, args in zip(parameter_names, optimizer_args):
+            group_params = list()
+            for pattern_ in names:
+                regex = re.compile(pattern_)
+                # re.match anchors the pattern at the start of the parameter name only
+                matched = [param for p_name, param in self.named_parameters() if regex.match(p_name)]
+                if not matched:
+                    print(f"Did not find parameters for pattern {pattern_}")
+                # a parameter matched by several patterns is added (and counted) once per pattern
+                num_params += sum(param.numel() for param in matched)
+                group_params.extend(matched)
+            groups.append({"params": group_params, **args})
+        return groups, num_params
+
+    def configure_optimizers(self) -> list[torch.optim.Optimizer]:
+        if self.trainable_ae_params is None:  # no name patterns configured: use the default parameter list
+            ae_params = self.get_autoencoder_params()
+        else:
+            ae_params, num_ae_params = self.get_param_groups(
+                self.trainable_ae_params, self.ae_optimizer_args
+            )
+            print(f"Number of trainable autoencoder parameters: {num_ae_params:,}")
+        if self.trainable_disc_params is None:
+            disc_params = self.get_discriminator_params()
+        else:
+            disc_params, num_disc_params = self.get_param_groups(
+                self.trainable_disc_params, self.disc_optimizer_args
+            )
+            print(f"Number of trainable discriminator parameters: {num_disc_params:,}")
+        opt_ae = self.instantiate_optimizer_from_config(
+            ae_params,
+            default(self.lr_g_factor, 1.0) * self.learning_rate,  # NOTE(review): learning_rate is not set in this class
+            self.optimizer_config
+        )
+        opts = [opt_ae]
+        if len(disc_params) > 0:  # a second optimizer exists only when there are discriminator parameters
+            opt_disc = self.instantiate_optimizer_from_config(
+                disc_params,
+                self.learning_rate,
+                self.optimizer_config
+            )
+            opts.append(opt_disc)
+        return opts
+
+    @torch.no_grad()
+    def log_images(
+            self, batch: dict, additional_log_kwargs: Optional[dict] = None, **kwargs
+    ) -> dict:
+        log = dict()
+        additional_decode_kwargs = dict()
+        x = self.get_input(batch)
+        additional_decode_kwargs.update(  # only the configured extra keys that are present in this batch
+            {key: batch[key] for key in self.additional_decode_keys.intersection(batch)}
+        )
+
+        _, x_reconstruct, _ = self(x, **additional_decode_kwargs)
+        log["inputs"] = x
+        log["reconstructions"] = x_reconstruct
+        diff = 0.5 * torch.abs(torch.clamp(x_reconstruct, -1.0, 1.0) - x)  # halved absolute error
+        diff.clamp_(0, 1.0)
+        log["diff"] = 2.0 * diff - 1.0  # mapped from [0, 1] back to [-1, 1] like the other logged images
+        # diff_boost shows location of small errors, by boosting their brightness
+        log["diff_boost"] = 2.0 * torch.clamp(self.diff_boost_factor * diff, 0.0, 1.0) - 1
+        if hasattr(self.loss, "log_images"):
+            log.update(self.loss.log_images(x, x_reconstruct))
+        with self.ema_scope():  # same reconstruction and error maps, computed inside the EMA scope
+            _, x_reconstruct_ema, _ = self(x, **additional_decode_kwargs)
+            log["reconstructions_ema"] = x_reconstruct_ema
+            diff_ema = 0.5 * torch.abs(torch.clamp(x_reconstruct_ema, -1.0, 1.0) - x)
+            diff_ema.clamp_(0, 1.0)
+            log["diff_ema"] = 2.0 * diff_ema - 1.0
+            log["diff_boost_ema"] = 2.0 * torch.clamp(self.diff_boost_factor * diff_ema, 0.0, 1.0) - 1
+        if additional_log_kwargs:  # optional extra reconstruction with caller-supplied decode kwargs
+            additional_decode_kwargs.update(additional_log_kwargs)
+            _, x_reconstruct_add, _ = self(x, **additional_decode_kwargs)
+            log_str = "reconstructions-" + "-".join(
+                [f"{key}={additional_log_kwargs[key]}" for key in additional_log_kwargs]
+            )
+            log[log_str] = x_reconstruct_add
+        return log
+
+
+class AutoencodingEngineLegacy(AutoencodingEngine):
+    def __init__(self, embed_dim: int, **kwargs):
+        self.max_batch_size = kwargs.pop("max_batch_size", None)  # None disables chunked encode/decode
+        ddconfig = kwargs.pop("ddconfig")  # one config dict shared by the encoder and the decoder
+        ckpt_path = kwargs.pop("ckpt_path", None)  # popped so the parent __init__ does not load a checkpoint
+        ckpt_engine = kwargs.pop("ckpt_engine", None)
+        super().__init__(
+            encoder_config={
+                "target": "vista.vwm.modules.diffusionmodules.model.Encoder",
+                "params": ddconfig
+            },
+            decoder_config={
+                "target": "vista.vwm.modules.diffusionmodules.model.Decoder",
+                "params": ddconfig
+            },
+            **kwargs
+        )
+        self.quant_conv = nn.Conv2d(  # 1x1 conv applied to the encoder output in encode()
+            (1 + ddconfig["double_z"]) * ddconfig["z_channels"],
+            (1 + ddconfig["double_z"]) * embed_dim,
+            1
+        )
+        self.post_quant_conv = nn.Conv2d(  # 1x1 conv applied to the latent before the decoder in decode()
+            embed_dim,
+            ddconfig["z_channels"],
+            1
+        )
+        self.embed_dim = embed_dim
+
+        self.apply_ckpt(default(ckpt_path, ckpt_engine))  # applied only after the conv layers above exist
+
+    def get_autoencoder_params(self) -> list:
+        params = super().get_autoencoder_params()  # NOTE(review): omits quant_conv/post_quant_conv — confirm intended
+        return params
+
+    def encode(
+            self, x: torch.Tensor, return_reg_log: bool = False
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict]]:
+        if self.max_batch_size is None:
+            z = self.encoder(x)
+            z = self.quant_conv(z)
+        else:  # process the batch in chunks of at most max_batch_size along dim 0
+            N = x.shape[0]
+            bs = self.max_batch_size
+            n_batches = int(math.ceil(N / bs))
+            z = list()
+            for i_batch in range(n_batches):
+                z_batch = self.encoder(x[i_batch * bs: (i_batch + 1) * bs])
+                z_batch = self.quant_conv(z_batch)
+                z.append(z_batch)
+            z = torch.cat(z, 0)
+
+        z, reg_log = self.regularization(z)  # regularization always sees the full, re-concatenated batch
+        if return_reg_log:
+            return z, reg_log
+        else:
+            return z
+
+    def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
+        if self.max_batch_size is None:
+            dec = self.post_quant_conv(z)
+            dec = self.decoder(dec, **decoder_kwargs)
+        else:  # process the latents in chunks of at most max_batch_size along dim 0
+            N = z.shape[0]
+            bs = self.max_batch_size
+            n_batches = int(math.ceil(N / bs))
+            dec = list()
+            for i_batch in range(n_batches):
+                dec_batch = self.post_quant_conv(z[i_batch * bs: (i_batch + 1) * bs])
+                dec_batch = self.decoder(dec_batch, **decoder_kwargs)  # the same kwargs are passed to every chunk
+                dec.append(dec_batch)
+            dec = torch.cat(dec, 0)
+        return dec
+
+
+class AutoencoderKL(AutoencodingEngineLegacy):  # legacy engine preconfigured with DiagonalGaussianRegularizer
+    def __init__(self, **kwargs):
+        super().__init__(
+            regularizer_config={
+                "target": (
+                    "vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer"
+                )
+            },
+            **kwargs  # remaining keywords (embed_dim, ddconfig, loss_config, ...) go to the legacy engine
+        )
+
+
+class AutoencoderKLModeOnly(AutoencodingEngineLegacy):  # same regularizer as AutoencoderKL, with sample=False
+    def __init__(self, **kwargs):
+        super().__init__(
+            regularizer_config={
+                "target": (
+                    "vista.vwm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer"
+                ),
+                "params": {"sample": False},  # presumably uses the distribution mode instead of sampling — confirm
+            },
+            **kwargs
+        )
diff --git a/vista/vwm/models/diffusion.py b/vista/vwm/models/diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae28209229d67a3443f888505ba7b2c02c582e8
--- /dev/null
+++ b/vista/vwm/models/diffusion.py
@@ -0,0 +1,394 @@
+from __future__ import annotations
+
+import math
+from contextlib import contextmanager
+from typing import Any, Union
+
+import torch
+from einops import rearrange
+from omegaconf import ListConfig, OmegaConf
+from pytorch_lightning import LightningModule
+from safetensors.torch import load_file as load_safetensors
+from torch.optim.lr_scheduler import LambdaLR
+
+from ..modules import UNCONDITIONAL_CONFIG
+from ..modules.autoencoding.temporal_ae import VideoDecoder
+from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
+from ..modules.ema import LitEma
+from ..util import default, disabled_train, get_obj_from_str, instantiate_from_config
+
+
+class DiffusionEngine(LightningModule):
+    """Diffusion training/sampling module assembled from config-instantiated parts.
+
+    The constructor builds the denoising network (wrapped by ``network_wrapper``),
+    the denoiser, the conditioner, an optional sampler and loss function, and a
+    frozen first-stage autoencoder that maps inputs to latents and back.
+
+    Note:
+        ``configure_optimizers`` reads ``self.learning_rate``, which is not
+        assigned anywhere in this class; it is presumably set by the training
+        script before the optimizers are built.
+    """
+
+    def __init__(
+            self,
+            network_config,
+            denoiser_config,
+            first_stage_config,
+            conditioner_config: Union[None, dict, ListConfig, OmegaConf] = None,
+            sampler_config: Union[None, dict, ListConfig, OmegaConf] = None,
+            optimizer_config: Union[None, dict, ListConfig, OmegaConf] = None,
+            scheduler_config: Union[None, dict, ListConfig, OmegaConf] = None,
+            loss_fn_config: Union[None, dict, ListConfig, OmegaConf] = None,
+            network_wrapper: Union[None, str] = None,
+            ckpt_path: Union[None, str] = None,
+            use_ema: bool = False,
+            ema_decay_rate: float = 0.9999,
+            scale_factor: float = 1.0,
+            disable_first_stage_autocast=False,
+            input_key: str = "img",
+            log_keys: Union[list, None] = None,
+            no_cond_log: bool = False,
+            compile_model: bool = False,
+            en_and_decode_n_samples_a_time: int = 1,
+            num_frames: int = 25,
+            slow_spatial_layers: bool = False,
+            train_peft_adapters: bool = False,
+            replace_cond_frames: bool = False,
+            fixed_cond_frames: Union[list, None] = None
+    ):
+        """Instantiate all sub-modules and optionally restore weights.
+
+        Args:
+            network_config: Config of the denoising network.
+            denoiser_config: Config of the denoiser.
+            first_stage_config: Config of the (frozen) first-stage autoencoder.
+            conditioner_config: Conditioner config; ``UNCONDITIONAL_CONFIG`` if ``None``.
+            sampler_config: Sampler config; no sampler is built if ``None``.
+            optimizer_config: Optimizer config; ``torch.optim.AdamW`` if ``None``.
+            scheduler_config: Optional LR schedule config (used via ``LambdaLR``).
+            loss_fn_config: Loss config; no loss is built if ``None``.
+            network_wrapper: Import path of the wrapper class around the network;
+                ``OPENAIUNETWRAPPER`` if ``None``.
+            ckpt_path: Optional checkpoint to load (see ``init_from_ckpt``).
+            use_ema: Keep an exponential moving average copy of the network.
+            ema_decay_rate: Decay passed to ``LitEma``.
+            scale_factor: Latents are multiplied by this after encoding and
+                divided by it before decoding.
+            disable_first_stage_autocast: Disable CUDA autocast around the
+                first-stage encode/decode calls.
+            input_key: Batch key holding the input tensor.
+            log_keys: Stored on the module; not read inside this class.
+            no_cond_log: Stored on the module; not read inside this class.
+            compile_model: Forwarded to the network wrapper.
+            en_and_decode_n_samples_a_time: Chunk size for first-stage
+                encoding/decoding.
+            num_frames: Expected sequence length of non-4D inputs.
+            slow_spatial_layers: Train parameters without ``"time_stack"`` in
+                their name at 0.1x the learning rate.
+            train_peft_adapters: Optimize only parameters with ``"adapter"`` in
+                their name (ignored when ``slow_spatial_layers`` is set).
+            replace_cond_frames: Build a condition-frame mask in ``sample``.
+            fixed_cond_frames: Frame indices marked in that mask.
+        """
+        super().__init__()
+        self.log_keys = log_keys
+        self.input_key = input_key
+        self.optimizer_config = default(
+            optimizer_config, {"target": "torch.optim.AdamW"}
+        )
+        model = instantiate_from_config(network_config)
+        self.model = get_obj_from_str(
+            default(network_wrapper, OPENAIUNETWRAPPER)
+        )(
+            model, compile_model=compile_model
+        )
+
+        self.denoiser = instantiate_from_config(denoiser_config)
+        self.sampler = (
+            instantiate_from_config(sampler_config)
+            if sampler_config is not None
+            else None
+        )
+        self.conditioner = instantiate_from_config(
+            default(conditioner_config, UNCONDITIONAL_CONFIG)
+        )
+        self.scheduler_config = scheduler_config
+        self._init_first_stage(first_stage_config)
+
+        self.loss_fn = (
+            instantiate_from_config(loss_fn_config)
+            if loss_fn_config is not None
+            else None
+        )
+
+        # if slow_spatial_layers:
+        #     for n, p in self.model.named_parameters():
+        #         if "time_stack" not in n:
+        #             p.requires_grad = False
+        # elif train_peft_adapters:
+        #     for n, p in self.model.named_parameters():
+        #         if "adapter" not in n and p.requires_grad:
+        #             p.requires_grad = False
+
+        self.use_ema = use_ema
+        self.ema_decay_rate = ema_decay_rate
+        if use_ema:
+            self.model_ema = LitEma(self.model, decay=ema_decay_rate)
+            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}")
+
+        self.scale_factor = scale_factor
+        self.disable_first_stage_autocast = disable_first_stage_autocast
+        self.no_cond_log = no_cond_log
+
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path)
+
+        self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
+        self.num_frames = num_frames
+        self.slow_spatial_layers = slow_spatial_layers
+        self.train_peft_adapters = train_peft_adapters
+        self.replace_cond_frames = replace_cond_frames
+        self.fixed_cond_frames = fixed_cond_frames
+
+    def reinit_ema(self):
+        """Rebuild the EMA copy from the current network weights (no-op without EMA)."""
+        if self.use_ema:
+            self.model_ema = LitEma(self.model, decay=self.ema_decay_rate)
+            print(f"Reinitializing EMAs of {len(list(self.model_ema.buffers()))}")
+
+    def init_from_ckpt(self, path: str) -> None:
+        """Load weights from ``path`` non-strictly and report key mismatches.
+
+        Supported suffixes: ``ckpt`` (reads the ``"state_dict"`` entry), ``bin``
+        (merged DeepSpeed checkpoint; the ``_forward_module.`` prefix is removed
+        from key names) and ``safetensors``.
+
+        Raises:
+            NotImplementedError: For any other suffix.
+        """
+        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
+        if path.endswith("ckpt"):
+            svd = torch.load(path, map_location="cpu")["state_dict"]
+        elif path.endswith("bin"):  # for deepspeed merged checkpoints
+            raw = torch.load(path, map_location="cpu")
+            # Strip the wrapper prefix where present. Keys without the prefix are
+            # kept as they are (they used to be dropped silently).
+            svd = {k.replace("_forward_module.", ""): v for k, v in raw.items()}
+        elif path.endswith("safetensors"):
+            svd = load_safetensors(path)
+        else:
+            raise NotImplementedError(f"Unsupported checkpoint format: {path}")
+
+        missing, unexpected = self.load_state_dict(svd, strict=False)
+        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        if len(missing) > 0:
+            print(f"Missing keys: {missing}")
+        if len(unexpected) > 0:
+            print(f"Unexpected keys: {unexpected}")
+
+    def _init_first_stage(self, config):
+        """Instantiate the first-stage model in eval mode and freeze it."""
+        model = instantiate_from_config(config).eval()
+        # Replace train() so later .train() calls on the parent do not affect this model.
+        model.train = disabled_train
+        for param in model.parameters():
+            param.requires_grad = False
+        self.first_stage_model = model
+
+    def get_input(self, batch):
+        """Return ``batch[self.input_key]``, flattening sequences into the batch dim.
+
+        Non-4D inputs are treated as ``b t c h w`` and rearranged to
+        ``(b t) c h w``; the rearranged tensor is also written back into ``batch``.
+        """
+        # assuming unified data format, dataloader returns a dict
+        # image tensors should be scaled to -1 ... 1 and in bchw format
+        input_shape = batch[self.input_key].shape
+        if len(input_shape) != 4:  # is an image sequence
+            assert input_shape[1] == self.num_frames
+            batch[self.input_key] = rearrange(batch[self.input_key], "b t c h w -> (b t) c h w")
+        return batch[self.input_key]
+
+    @torch.no_grad()
+    def decode_first_stage(self, z, overlap=3):
+        """Decode latents with the first-stage model in chunks along dim 0.
+
+        ``z`` is divided by ``self.scale_factor`` first. With
+        ``0 < overlap < n_samples`` (``n_samples`` being
+        ``en_and_decode_n_samples_a_time``), consecutive chunks share the
+        trailing entries of the previous chunk as context, and the outputs for
+        those shared entries are averaged with the previous chunk's outputs.
+        Otherwise the latents are decoded in independent chunks of ``n_samples``.
+
+        Args:
+            z: Scaled latents.
+            overlap: Number of leading latents shared between consecutive
+                chunks; values ``<= 0`` disable overlapping.
+
+        Returns:
+            The decoded outputs concatenated along dim 0.
+        """
+        z = z / self.scale_factor
+        n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])
+        is_video_decoder = isinstance(self.first_stage_model.decoder, VideoDecoder)
+        all_out = list()
+        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
+            # `overlap <= 0` must not take the overlapping path: `x[-0:]` selects
+            # the whole tensor rather than nothing.
+            if 0 < overlap < n_samples:
+                previous_z = z[:overlap]
+                for current_z in z[overlap:].split(n_samples - overlap, dim=0):
+                    context_z = torch.cat((previous_z, current_z), dim=0)
+                    # The carried context can be shorter than `overlap` when a
+                    # chunk is shorter than `overlap`, so measure it.
+                    n_shared = previous_z.shape[0]
+                    kwargs = {"timesteps": context_z.shape[0]} if is_video_decoder else dict()
+                    out = self.first_stage_model.decode(context_z, **kwargs)
+
+                    if not all_out:
+                        all_out.append(out)
+                    else:
+                        # Blend the re-decoded context with the previous chunk's tail.
+                        all_out[-1][-n_shared:] = (all_out[-1][-n_shared:] + out[:n_shared]) / 2
+                        all_out.append(out[n_shared:])
+                    previous_z = current_z[-overlap:]
+            else:
+                for current_z in z.split(n_samples, dim=0):
+                    kwargs = {"timesteps": current_z.shape[0]} if is_video_decoder else dict()
+                    out = self.first_stage_model.decode(current_z, **kwargs)
+                    all_out.append(out)
+        out = torch.cat(all_out, dim=0)
+        return out
+
+    @torch.no_grad()
+    def encode_first_stage(self, x):
+        """Encode ``x`` in chunks with the first-stage model and scale the latents.
+
+        Progress messages are printed per round. CUDA is synchronized around
+        each round only when CUDA is available, so the method also runs on
+        hosts without CUDA.
+
+        Returns:
+            The concatenated latents multiplied by ``self.scale_factor``.
+        """
+        n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])
+        n_rounds = math.ceil(x.shape[0] / n_samples)
+        all_out = list()
+        cuda_ready = torch.cuda.is_available()
+        if cuda_ready:
+            torch.cuda.synchronize()
+        print(f"Encoding {n_rounds} rounds of {n_samples} samples each")
+        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
+            for n in range(n_rounds):
+                if cuda_ready:
+                    torch.cuda.synchronize()
+                print("start encoding round", n)
+                out = self.first_stage_model.encode(
+                    x[n * n_samples: (n + 1) * n_samples]
+                )
+                all_out.append(out)
+                if cuda_ready:
+                    torch.cuda.synchronize()
+                print("finished encoding round", n)
+        z = torch.cat(all_out, dim=0)
+        z = z * self.scale_factor
+        return z
+
+    def forward(self, x, batch):
+        """Compute the mean loss for latents ``x`` and return it with a log dict."""
+        loss = self.loss_fn(self.model, self.denoiser, self.conditioner, x, batch)  # go to StandardDiffusionLoss
+        loss_mean = loss.mean()
+        loss_dict = {"loss": loss_mean}
+        return loss_mean, loss_dict
+
+    def shared_step(self, batch: dict) -> Any:
+        """Encode the batch input, record the global step in ``batch`` and compute the loss."""
+        x = self.get_input(batch)
+        x = self.encode_first_stage(x)
+        batch["global_step"] = self.global_step
+        loss, loss_dict = self(x, batch)
+        return loss, loss_dict
+
+    def training_step(self, batch, batch_idx):
+        """Run one training step and log the loss, global step and (if scheduled) the LR."""
+        loss, loss_dict = self.shared_step(batch)
+
+        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+
+        self.log("global_step", self.global_step, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+
+        if self.scheduler_config is not None:
+            lr = self.optimizers().param_groups[0]["lr"]
+            self.log("lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+        return loss
+
+    # @torch.no_grad()
+    # def validation_step(self, batch, batch_idx):
+    #     loss, loss_dict = self.shared_step(batch)
+    #     self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+
+    @torch.no_grad()
+    def test_step(self, batch, batch_idx):
+        """Compute and log the loss for a test batch."""
+        _loss, loss_dict = self.shared_step(batch)
+        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False)
+
+    def on_train_start(self, *args, **kwargs):
+        """Fail early if training starts without a sampler or a loss function."""
+        if self.sampler is None or self.loss_fn is None:
+            raise ValueError("Sampler and loss function need to be set for training")
+
+    def on_train_batch_end(self, *args, **kwargs):
+        """Update the EMA weights after each training batch."""
+        if self.use_ema:
+            self.model_ema(self.model)
+
+    @contextmanager
+    def ema_scope(self, context=None):
+        """Temporarily swap the network weights for the EMA weights.
+
+        The training weights are stored on entry and restored on exit, also
+        when the body raises. Without EMA this is a no-op context.
+
+        Args:
+            context: Optional label; when given, the switch and the restore are printed.
+        """
+        if self.use_ema:
+            self.model_ema.store(self.model.parameters())
+            self.model_ema.copy_to(self.model)
+            if context is not None:
+                print(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:
+            if self.use_ema:
+                self.model_ema.restore(self.model.parameters())
+                if context is not None:
+                    print(f"{context}: Restored training weights")
+
+    def instantiate_optimizer_from_config(self, params, lr, cfg):
+        """Build the optimizer named by ``cfg["target"]`` with ``cfg["params"]`` as extra kwargs."""
+        return get_obj_from_str(cfg["target"])(
+            params, lr=lr, **cfg.get("params", dict())
+        )
+
+    def configure_optimizers(self):
+        """Create the optimizer and, if configured, a per-step ``LambdaLR`` schedule.
+
+        Parameter groups:
+            * ``slow_spatial_layers``: parameters with ``"time_stack"`` in their
+              name use the base LR, all others use ``0.1 * lr``.
+            * ``train_peft_adapters``: only parameters with ``"adapter"`` in
+              their name are optimized.
+            * otherwise: all network parameters.
+
+        Parameters of trainable conditioner embedders are always added.
+        """
+        lr = self.learning_rate
+        if self.slow_spatial_layers:
+            param_dicts = [
+                {
+                    "params": [p for n, p in self.model.named_parameters() if "time_stack" in n]
+                },
+                {
+                    "params": [p for n, p in self.model.named_parameters() if "time_stack" not in n],
+                    "lr": lr * 0.1
+                }
+            ]
+        elif self.train_peft_adapters:
+            param_dicts = [
+                {
+                    "params": [p for n, p in self.model.named_parameters() if "adapter" in n]
+                }
+            ]
+        else:
+            param_dicts = [
+                {
+                    "params": list(self.model.parameters())
+                }
+            ]
+        for embedder in self.conditioner.embedders:
+            if embedder.is_trainable:
+                param_dicts.append(
+                    {
+                        "params": list(embedder.parameters())
+                    }
+                )
+        opt = self.instantiate_optimizer_from_config(param_dicts, lr, self.optimizer_config)
+        if self.scheduler_config is not None:
+            scheduler = instantiate_from_config(self.scheduler_config)
+            print("Setting up LambdaLR scheduler...")
+            scheduler = [
+                {
+                    "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule),
+                    "interval": "step",
+                    "frequency": 1
+                }
+            ]
+            return [opt], scheduler
+        else:
+            return opt
+
+    @torch.no_grad()
+    def sample(
+            self,
+            cond: dict,
+            cond_frame=None,
+            uc: Union[dict, None] = None,
+            N: int = 25,
+            shape: Union[None, tuple, list] = None,
+            **kwargs
+    ):
+        """Draw ``N`` samples with the configured sampler.
+
+        Args:
+            cond: Conditioning dict.
+            cond_frame: Passed to the sampler as ``cond_frame``.
+            uc: Unconditional conditioning dict.
+            N: Number of noise entries to draw.
+            shape: Per-entry latent shape (without the leading ``N``); required.
+            **kwargs: Forwarded to the denoiser on every call.
+
+        Returns:
+            Whatever the sampler returns.
+
+        Raises:
+            ValueError: If ``shape`` is not provided.
+        """
+        if shape is None:
+            raise ValueError("`shape` (the per-sample latent shape) must be provided")
+        randn = torch.randn(N, *shape).to(self.device)
+        cond_mask = torch.zeros(N).to(self.device)
+        if self.replace_cond_frames:
+            assert self.fixed_cond_frames
+            cond_indices = self.fixed_cond_frames
+            # Mark the fixed condition frames inside every sequence of `num_frames` entries.
+            cond_mask = rearrange(cond_mask, "(b t) -> b t", t=self.num_frames)
+            cond_mask[:, cond_indices] = 1
+            cond_mask = rearrange(cond_mask, "b t -> (b t)")
+
+        def denoiser(input, sigma, c, cond_mask):
+            return self.denoiser(self.model, input, sigma, c, cond_mask, **kwargs)
+        samples = self.sampler(  # go to EulerEDMSampler
+            denoiser, randn, cond, uc=uc, cond_frame=cond_frame, cond_mask=cond_mask
+        )
+        return samples
+
+    @torch.no_grad()
+    def log_images(
+            self,
+            batch: dict,
+            N: int = 25,
+            sample: bool = True,
+            ucg_keys: Union[list, None] = None,
+            **kwargs
+    ) -> dict:
+        """Collect inputs, first-stage reconstructions and (optionally) samples.
+
+        Args:
+            batch: Data batch; its input entry may be rearranged in place by
+                ``get_input``.
+            N: Maximum number of entries to process.
+            sample: Whether to also draw samples (with EMA weights when enabled).
+            ucg_keys: Conditioner input keys whose embeddings are forced to zero
+                for the unconditional branch; defaults to every embedder key
+                with ``ucg_rate > 0``.
+            **kwargs: Accepted for interface compatibility; unused.
+
+        Returns:
+            Dict with ``inputs``/``inputs_mp4``, ``targets``/``targets_mp4`` and,
+            when sampling, ``samples``/``samples_mp4``.
+        """
+        conditioner_input_keys = [e.input_key for e in self.conditioner.embedders if e.ucg_rate > 0.0]
+        if ucg_keys:
+            assert all(key in conditioner_input_keys for key in ucg_keys), (
+                "Each defined ucg key for sampling must be in the provided conditioner input keys, "
+                f"but we have {ucg_keys} vs. {conditioner_input_keys}"
+            )
+        else:
+            ucg_keys = conditioner_input_keys
+        log = dict()
+
+        x = self.get_input(batch)
+
+        c, uc = self.conditioner.get_unconditional_conditioning(
+            batch,
+            force_uc_zero_embeddings=ucg_keys
+            if len(self.conditioner.embedders) > 0
+            else list()
+        )
+
+        sampling_kwargs = dict()
+
+        N = min(x.shape[0], N)
+        x = x.to(self.device)[:N]
+
+        z = self.encode_first_stage(x)
+        x_reconstruct = self.decode_first_stage(z)
+
+        for k in c:
+            if isinstance(c[k], torch.Tensor):
+                c[k] = c[k][:N].to(self.device)
+                uc[k] = uc[k][:N].to(self.device)
+                # Entries with fewer than N rows are reduced to their first row.
+                if c[k].shape[0] < N:
+                    c[k] = c[k][[0]]
+                if uc[k].shape[0] < N:
+                    uc[k] = uc[k][[0]]
+
+        if sample:
+            with self.ema_scope("Plotting"):
+                samples = self.sample(
+                    c, cond_frame=z, shape=z.shape[1:], uc=uc, N=N, **sampling_kwargs
+                )
+            samples = self.decode_first_stage(samples)
+            log["samples"] = log["samples_mp4"] = samples
+
+        log["inputs"] = log["inputs_mp4"] = x
+        log["targets"] = log["targets_mp4"] = x_reconstruct
+        return log
diff --git a/vista/vwm/modules/__init__.py b/vista/vwm/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac8d363a0c822d00e7d6e19d3e11aa86f5c3fb0b
--- /dev/null
+++ b/vista/vwm/modules/__init__.py
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from .encoders.modules import GeneralConditioner
+
+# Instantiation config for a conditioner without any embedders: the target is
+# `GeneralConditioner` and its `emb_models` parameter is an empty list.
+UNCONDITIONAL_CONFIG = {
+    "target": "vista.vwm.modules.GeneralConditioner",
+    "params": {"emb_models": list()}
+}
diff --git a/vista/vwm/modules/attention.py b/vista/vwm/modules/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..3976cb24fb830a3ba391345d7a3d97ced0a35e49
--- /dev/null
+++ b/vista/vwm/modules/attention.py
@@ -0,0 +1,632 @@
+from __future__ import annotations
+
+import math
+from inspect import isfunction
+from typing import Any, Optional
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from packaging import version
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+# Scaled-dot-product attention backend selection depends on the PyTorch version.
+if version.parse(torch.__version__) < version.parse("2.0.0"):
+    # Older PyTorch: provide stand-ins so that `with sdp_kernel(...)` and
+    # `BACKEND_MAP` still exist as names.
+    from contextlib import nullcontext
+
+    SDP_IS_AVAILABLE = False
+    sdp_kernel = nullcontext
+    BACKEND_MAP = dict()
+    print(
+        f"No SDP backend available, likely because you are running in pytorch versions < 2.0. "
+        f"In fact, you are using PyTorch {torch.__version__}. You might want to consider upgrading"
+    )
+else:
+    from torch.backends.cuda import SDPBackend, sdp_kernel
+
+    SDP_IS_AVAILABLE = True
+
+    # Keyword arguments for `sdp_kernel`, keyed by the requested backend.
+    # `None` enables every kernel.
+    BACKEND_MAP = {
+        SDPBackend.MATH: {
+            "enable_math": True,
+            "enable_flash": False,
+            "enable_mem_efficient": False
+        },
+        SDPBackend.FLASH_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": True,
+            "enable_mem_efficient": False
+        },
+        SDPBackend.EFFICIENT_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": False,
+            "enable_mem_efficient": True
+        },
+        None: {
+            "enable_math": True,
+            "enable_flash": True,
+            "enable_mem_efficient": True
+        }
+    }
+
+# Optional dependency probe: xformers is used when it can be imported.
+try:
+    import xformers
+    import xformers.ops
+
+    XFORMERS_IS_AVAILABLE = True
+except Exception:
+    # Any import-time failure (missing package or a broken install) disables
+    # xformers. Unlike a bare `except`, this lets KeyboardInterrupt and
+    # SystemExit propagate.
+    XFORMERS_IS_AVAILABLE = False
+    print("No module `xformers`, processing without it")
+
+
+def exists(val):
+    """Return ``True`` for every value except ``None``."""
+    if val is None:
+        return False
+    return True
+
+
+def uniq(arr):
+    """Return the distinct elements of ``arr`` as a dict-keys view, in first-seen order."""
+    return dict.fromkeys(arr, True).keys()
+
+
+def default(val, d):
+    """Return ``val`` unless it is ``None``; otherwise return the fallback ``d``.
+
+    When ``d`` is a plain Python function (as judged by ``inspect.isfunction``)
+    it is called without arguments and its result is used as the fallback.
+    """
+    if exists(val):
+        return val
+    if isfunction(d):
+        return d()
+    return d
+
+
+def max_neg_value(t):
+    """Return the negated largest finite value of ``t``'s floating-point dtype."""
+    largest = torch.finfo(t.dtype).max
+    return -largest
+
+
+def init_(tensor):
+    """Fill ``tensor`` in place from U(-b, b) with ``b = 1 / sqrt(last dim)`` and return it."""
+    bound = 1 / math.sqrt(tensor.shape[-1])
+    tensor.uniform_(-bound, bound)
+    return tensor
+
+
+class GEGLU(nn.Module):
+    """Gated linear unit: one projection split into a value half and a GELU-activated gate half."""
+
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        # A single linear layer produces both halves, hence twice the output width.
+        self.proj = nn.Linear(dim_in, 2 * dim_out)
+
+    def forward(self, x):
+        """Project ``x``, split the result along the last dim and gate the first half."""
+        value, gate = torch.chunk(self.proj(x), 2, dim=-1)
+        return value * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+    """Two-layer feed-forward block: input projection, dropout, output projection.
+
+    Args:
+        dim: Input feature size.
+        dim_out: Output feature size; falls back to ``dim`` when ``None``.
+        mult: The hidden width is ``int(dim * mult)``.
+        glu: Use a ``GEGLU`` input projection instead of ``Linear`` + ``GELU``.
+        dropout: Dropout probability applied after the input projection.
+        zero_init: Zero the weight and bias of the output projection.
+    """
+
+    def __init__(
+            self,
+            dim,
+            dim_out=None,
+            mult=4,
+            glu=False,
+            dropout=0.0,
+            zero_init=False
+    ):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+
+        if glu:
+            project_in = GEGLU(dim, hidden_dim)
+        else:
+            project_in = nn.Sequential(
+                nn.Linear(dim, hidden_dim),
+                nn.GELU()
+            )
+
+        self.net = nn.Sequential(
+            project_in,
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim_out)
+        )
+
+        if zero_init:
+            project_out = self.net[-1]
+            nn.init.zeros_(project_out.weight)
+            nn.init.zeros_(project_out.bias)
+
+    def forward(self, x):
+        """Apply the feed-forward stack to ``x``."""
+        return self.net(x)
+
+
+def zero_module(module):
+    """Set every parameter of ``module`` to zero in place and return the module."""
+    for param in module.parameters():
+        # Write through a detached alias so the in-place update is not tracked by autograd.
+        detached = param.detach()
+        detached.zero_()
+    return module
+
+
+def Normalize(in_channels):
+    """Build an affine ``GroupNorm`` with 32 groups and ``eps=1e-6`` over ``in_channels`` channels."""
+    norm_layer = nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
+    return norm_layer
+
+
+class LinearAttention(nn.Module):
+    """Linear attention over the spatial positions of a 4-D feature map.
+
+    Args:
+        dim: Number of input (and output) channels.
+        heads: Number of attention heads.
+        dim_head: Channels per head.
+    """
+
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        inner_dim = dim_head * heads
+        # 1x1 convolutions act as per-position linear projections.
+        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias=False)
+        self.to_out = nn.Conv2d(inner_dim, dim, 1)
+
+    def forward(self, x):
+        """Attend over the flattened ``h * w`` positions of ``x`` and project back to ``dim`` channels."""
+        _, _, height, width = x.shape
+        queries, keys, values = rearrange(
+            self.to_qkv(x),
+            "b (qkv heads c) h w -> qkv b heads c (h w)",
+            heads=self.heads,
+            qkv=3
+        )
+        # Normalize the keys over the spatial positions.
+        keys = keys.softmax(dim=-1)
+        # Aggregate values per key channel first; the cost is then linear in h * w.
+        context = torch.einsum("bhdn,bhen->bhde", keys, values)
+        attended = torch.einsum("bhde,bhdn->bhen", context, queries)
+        merged = rearrange(
+            attended,
+            "b heads c (h w) -> b (heads c) h w",
+            heads=self.heads,
+            h=height,
+            w=width
+        )
+        return self.to_out(merged)
+
+
+class CrossAttention(nn.Module):  # not used, never mind
+    """Multi-head attention built on ``F.scaled_dot_product_attention``.
+
+    Acts as self-attention when no ``context`` is passed to ``forward``.
+
+    Args:
+        query_dim: Feature size of the query input (and of the output).
+        context_dim: Feature size of the context; ``query_dim`` when ``None``.
+        heads: Number of attention heads.
+        dim_head: Feature size per head.
+        dropout: Dropout probability after the output projection.
+        backend: Key into ``BACKEND_MAP`` selecting the SDP kernels.
+        zero_init: Zero the weight and bias of the output projection.
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(
+            self,
+            query_dim,
+            context_dim=None,
+            heads=8,
+            dim_head=64,
+            dropout=0.0,
+            backend=None,
+            zero_init=False,
+            **kwargs
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        # Kept as an attribute; `forward` relies on the default SDP scaling instead.
+        self.scale = dim_head ** -0.5
+        self.heads = heads
+
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim),
+            nn.Dropout(dropout)
+        )
+        self.backend = backend
+
+        if zero_init:
+            nn.init.zeros_(self.to_out[0].weight)
+            nn.init.zeros_(self.to_out[0].bias)
+
+    def forward(
+            self,
+            x,
+            context=None,
+            mask=None,
+            additional_tokens=None,
+            n_times_crossframe_attn_in_self=0,
+            **kwargs
+    ):
+        """Attend from ``x`` to ``context`` (or to ``x`` itself).
+
+        Args:
+            x: Query input, indexed as ``(batch, tokens, query_dim)``.
+            context: Key/value input; ``x`` is used when ``None``.
+            mask: Passed to SDP attention as ``attn_mask``.
+            additional_tokens: Tokens prepended to ``x`` along dim 1; their
+                outputs are removed again before the output projection.
+            n_times_crossframe_attn_in_self: When non-zero, every group of this
+                many consecutive batch entries uses the keys/values of the
+                group's first entry.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            The attention output projected back to ``query_dim``.
+        """
+        num_heads = self.heads
+
+        if additional_tokens is not None:
+            # get the number of masked tokens at the beginning of the output sequence
+            n_tokens_to_mask = additional_tokens.shape[1]
+            # add additional token
+            x = torch.cat((additional_tokens, x), dim=1)
+
+        q = self.to_q(x)
+        context = default(context, x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        if n_times_crossframe_attn_in_self:
+            # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
+            assert x.shape[0] % n_times_crossframe_attn_in_self == 0
+            # Every `n`-th entry is kept and repeated `n` times so that the batch
+            # size is restored (same count as MemoryEfficientCrossAttention).
+            k = repeat(
+                k[::n_times_crossframe_attn_in_self],
+                "b ... -> (b n) ...",
+                n=n_times_crossframe_attn_in_self
+            )
+            v = repeat(
+                v[::n_times_crossframe_attn_in_self],
+                "b ... -> (b n) ...",
+                n=n_times_crossframe_attn_in_self
+            )
+
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=num_heads), (q, k, v))
+
+        with sdp_kernel(**BACKEND_MAP[self.backend]):
+            out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)  # scale is dim_head ** -0.5 per default
+
+        del q, k, v
+        out = rearrange(out, "b h n d -> b n (h d)", h=num_heads)
+
+        if additional_tokens is not None:
+            # remove additional token
+            out = out[:, n_tokens_to_mask:]
+        return self.to_out(out)
+
+
+class MemoryEfficientCrossAttention(nn.Module):  # we are using this implementation
+    # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
+    """Multi-head attention computed with ``xformers.ops.memory_efficient_attention``.
+
+    Acts as self-attention when no ``context`` is passed to ``forward``.
+
+    Args:
+        query_dim: Feature size of the query input (and of the output).
+        context_dim: Feature size of the context; ``query_dim`` when ``None``.
+        heads: Number of attention heads.
+        dim_head: Feature size per head.
+        dropout: Dropout probability after the output projection.
+        zero_init: Zero the weight and bias of the output projection.
+        causal: Use an xformers ``LowerTriangularMask`` as attention bias.
+        add_lora: Add low-rank adapters next to the q/k/v/out projections.
+        lora_rank: Rank of those adapters.
+        lora_scale: Multiplier applied to the adapter outputs.
+        action_control: Split extra features off the end of the context and
+            add their projections to the keys and values.
+        action_emb_dim: Feature size of those extra features; the default
+            ``128 * 19`` is the previously hard-coded value.
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(
+            self,
+            query_dim,
+            context_dim=None,
+            heads=8,
+            dim_head=64,
+            dropout=0.0,
+            zero_init=False,
+            causal=False,
+            add_lora=False,
+            lora_rank=16,
+            lora_scale=1.0,
+            action_control=False,
+            action_emb_dim=128 * 19,
+            **kwargs
+    ):
+        super().__init__()
+        print(
+            f"Setting up {self.__class__.__name__}. "
+            f"Query dim is {query_dim}, "
+            f"context_dim is {context_dim} and using {heads} heads with a dimension of {dim_head}"
+        )
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim),
+            nn.Dropout(dropout)
+        )
+        self.attention_op: Optional[Any] = None
+
+        if causal:
+            self.attn_bias = xformers.ops.LowerTriangularMask()
+        else:
+            self.attn_bias = None
+
+        if zero_init:
+            nn.init.zeros_(self.to_out[0].weight)
+            nn.init.zeros_(self.to_out[0].bias)
+
+        self.add_lora = add_lora
+        if add_lora:
+            self.lora_scale = lora_scale
+
+            # Each adapter is a down/up pair. The up projection starts at zero,
+            # so a fresh adapter contributes nothing to the output.
+            self.q_adapter_down = nn.Linear(query_dim, lora_rank, bias=False)
+            nn.init.normal_(self.q_adapter_down.weight, std=1 / lora_rank)
+            self.q_adapter_up = nn.Linear(lora_rank, inner_dim, bias=False)
+            nn.init.zeros_(self.q_adapter_up.weight)
+
+            self.k_adapter_down = nn.Linear(context_dim, lora_rank, bias=False)
+            nn.init.normal_(self.k_adapter_down.weight, std=1 / lora_rank)
+            self.k_adapter_up = nn.Linear(lora_rank, inner_dim, bias=False)
+            nn.init.zeros_(self.k_adapter_up.weight)
+
+            self.v_adapter_down = nn.Linear(context_dim, lora_rank, bias=False)
+            nn.init.normal_(self.v_adapter_down.weight, std=1 / lora_rank)
+            self.v_adapter_up = nn.Linear(lora_rank, inner_dim, bias=False)
+            nn.init.zeros_(self.v_adapter_up.weight)
+
+            self.out_adapter_down = nn.Linear(inner_dim, lora_rank, bias=False)
+            nn.init.normal_(self.out_adapter_down.weight, std=1 / lora_rank)
+            self.out_adapter_up = nn.Linear(lora_rank, query_dim, bias=False)
+            nn.init.zeros_(self.out_adapter_up.weight)
+
+        self.action_control = action_control
+        if action_control:
+            self.context_dim = context_dim
+            # Zero-initialized, so the extra features have no effect at the start.
+            self.k_adapter_action_control = nn.Linear(action_emb_dim, inner_dim, bias=False)
+            nn.init.zeros_(self.k_adapter_action_control.weight)
+            self.v_adapter_action_control = nn.Linear(action_emb_dim, inner_dim, bias=False)
+            nn.init.zeros_(self.v_adapter_action_control.weight)
+
+    def forward(
+            self,
+            x,
+            context=None,
+            mask=None,
+            additional_tokens=None,
+            n_times_crossframe_attn_in_self=0,
+            batchify_xformers=False
+    ):
+        """Attend from ``x`` to ``context`` (or to ``x`` itself).
+
+        Args:
+            x: Query input, indexed as ``(batch, tokens, query_dim)``.
+            context: Key/value input; ``x`` is used when ``None``. With
+                ``action_control`` the last dim is split at ``context_dim``
+                into the regular context and the extra features.
+            mask: Not supported; must be ``None``.
+            additional_tokens: Tokens prepended to ``x`` along dim 1; their
+                outputs are removed again before the output projection.
+            n_times_crossframe_attn_in_self: When non-zero, every group of this
+                many consecutive batch entries uses the keys/values of the
+                group's first entry.
+            batchify_xformers: Run the attention kernel in slices of at most
+                32768 (batch * heads) entries.
+
+        Returns:
+            The attention output projected back to ``query_dim``.
+
+        Raises:
+            NotImplementedError: If ``mask`` is given.
+        """
+        if additional_tokens is not None:
+            # get the number of masked tokens at the beginning of the output sequence
+            n_tokens_to_mask = additional_tokens.shape[1]
+            # add additional token
+            x = torch.cat((additional_tokens, x), dim=1)
+
+        context = default(context, x)
+        if self.action_control:
+            context, context_ = context[:, :, :self.context_dim], context[:, :, self.context_dim:]
+        q = self.to_q(x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+        if self.add_lora:
+            q += self.q_adapter_up(self.q_adapter_down(x)) * self.lora_scale
+            k += self.k_adapter_up(self.k_adapter_down(context)) * self.lora_scale
+            v += self.v_adapter_up(self.v_adapter_down(context)) * self.lora_scale
+        if self.action_control:
+            k += self.k_adapter_action_control(context_)
+            v += self.v_adapter_action_control(context_)
+
+        if n_times_crossframe_attn_in_self:
+            # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
+            assert x.shape[0] % n_times_crossframe_attn_in_self == 0
+            k = repeat(
+                k[::n_times_crossframe_attn_in_self],
+                "b ... -> (b n) ...",
+                n=n_times_crossframe_attn_in_self
+            )
+            v = repeat(
+                v[::n_times_crossframe_attn_in_self],
+                "b ... -> (b n) ...",
+                n=n_times_crossframe_attn_in_self
+            )
+
+        b, _, _ = q.shape
+
+        def split_heads(t):
+            # (b, n, heads * dim_head) -> (b * heads, n, dim_head)
+            return (
+                t.reshape(b, t.shape[1], self.heads, self.dim_head)
+                .permute(0, 2, 1, 3)
+                .reshape(b * self.heads, t.shape[1], self.dim_head)
+                .contiguous()
+            )
+
+        q, k, v = split_heads(q), split_heads(k), split_heads(v)
+
+        if exists(mask):
+            raise NotImplementedError("Attention masks are not supported by this implementation")
+
+        # actually compute the attention, what we cannot get enough of
+        if batchify_xformers:
+            max_bs = 32768  # >65536 will result in wrong outputs
+            n_batches = math.ceil(q.shape[0] / max_bs)
+            out = list()
+            for i_batch in range(n_batches):
+                batch = slice(i_batch * max_bs, (i_batch + 1) * max_bs)
+                out.append(
+                    xformers.ops.memory_efficient_attention(
+                        q[batch],
+                        k[batch],
+                        v[batch],
+                        attn_bias=self.attn_bias,
+                        op=self.attention_op
+                    )
+                )
+            out = torch.cat(out, 0)
+        else:
+            out = xformers.ops.memory_efficient_attention(
+                q,
+                k,
+                v,
+                attn_bias=self.attn_bias,
+                op=self.attention_op
+            )
+
+        # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
+        out = (
+            out.reshape(b, self.heads, out.shape[1], self.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b, out.shape[1], self.heads * self.dim_head)
+        )
+        if additional_tokens is not None:
+            # remove additional token
+            out = out[:, n_tokens_to_mask:]
+        if self.add_lora:
+            return self.to_out(out) + self.out_adapter_up(self.out_adapter_down(out)) * self.lora_scale
+        else:
+            return self.to_out(out)
+
+
+class BasicTransformerBlock(nn.Module):
+    """Transformer block made of two attention layers and a feed-forward layer.
+
+    Every sub-layer is applied to a ``LayerNorm``-ed input and added back to
+    its input as a residual (see ``_forward``). ``attn1`` acts as
+    self-attention unless ``disable_self_attn`` is set, in which case it also
+    receives the context; ``attn2`` always receives the context.
+    """
+
+    # Maps the `attn_mode` argument to the attention implementation to build.
+    ATTENTION_MODES = {
+        "softmax": CrossAttention,  # vanilla attention
+        "softmax-xformers": MemoryEfficientCrossAttention  # ampere
+    }
+
+    def __init__(
+            self,
+            dim,
+            n_heads,
+            d_head,
+            dropout=0.0,
+            context_dim=None,
+            gated_ff=True,
+            use_checkpoint=False,
+            disable_self_attn=False,
+            attn_mode="softmax",
+            sdp_backend=None,
+            add_lora=False,
+            action_control=False
+    ):
+        """Build the attention, feed-forward and normalisation layers.
+
+        Args:
+            dim: Feature width of the tokens; used for the three ``LayerNorm``s,
+                the feed-forward layer and as ``query_dim`` of both attentions.
+            n_heads: Number of attention heads.
+            d_head: Width of each attention head.
+            dropout: Dropout value forwarded to the attention and feed-forward layers.
+            context_dim: Context width for ``attn2`` (and for ``attn1`` when
+                ``disable_self_attn`` is set).
+            gated_ff: Forwarded to ``FeedForward`` as ``glu``.
+            use_checkpoint: Run ``_forward`` through ``checkpoint`` when true.
+            disable_self_attn: Let ``attn1`` attend to the context instead of ``x``.
+            attn_mode: Key into ``ATTENTION_MODES``; may be replaced by the
+                availability fallbacks below.
+            sdp_backend: Forwarded to both attention layers as ``backend``.
+            add_lora: Forwarded to both attention layers.
+            action_control: Forwarded to ``attn2`` only.
+        """
+        super().__init__()
+        assert attn_mode in self.ATTENTION_MODES
+        if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
+            print(
+                f"Attention mode `{attn_mode}` is not available. Falling back to native attention. "
+                f"This is not a problem in Pytorch >= 2.0. You are running with PyTorch version {torch.__version__}"
+            )
+            attn_mode = "softmax"
+        elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
+            print("We do not support vanilla attention anymore, as it is too expensive")
+            if not XFORMERS_IS_AVAILABLE:
+                assert (
+                    False
+                ), "Please install xformers via e.g. `pip install xformers==0.0.16`"
+            else:
+                print("Falling back to xformers efficient attention")
+                attn_mode = "softmax-xformers"
+        attn_cls = self.ATTENTION_MODES[attn_mode]
+        # An explicit SDP backend is only accepted on PyTorch >= 2.0.
+        if version.parse(torch.__version__) >= version.parse("2.0.0"):
+            assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
+        else:
+            assert sdp_backend is None
+        self.disable_self_attn = disable_self_attn
+        self.attn1 = attn_cls(
+            query_dim=dim,
+            context_dim=context_dim if self.disable_self_attn else None,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+            backend=sdp_backend,
+            add_lora=add_lora
+        )  # is a self-attn if not self.disable_self_attn
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = attn_cls(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+            backend=sdp_backend,
+            add_lora=add_lora,
+            action_control=action_control
+        )  # is self-attn if context is None
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.use_checkpoint = use_checkpoint
+        if self.use_checkpoint:
+            print(f"{self.__class__.__name__} is using checkpointing")
+
+    def forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0):
+        """Apply the block, optionally through ``checkpoint``.
+
+        Args:
+            x: Input tokens.
+            context: Optional context handed to the attention layers.
+            additional_tokens: Optional extra tokens forwarded to both attentions.
+            n_times_crossframe_attn_in_self: Forwarded to ``attn1`` unless
+                ``disable_self_attn`` is set.
+
+        Returns:
+            The result of ``_forward``.
+        """
+        if self.use_checkpoint:
+            # Forward every argument, not only `x` and `context`; otherwise
+            # `additional_tokens` and `n_times_crossframe_attn_in_self` would be
+            # silently dropped whenever checkpointing is enabled.
+            # NOTE(review): assumes `checkpoint` forwards extra positional
+            # arguments like `torch.utils.checkpoint.checkpoint` does - confirm
+            # against the import at the top of this module.
+            return checkpoint(
+                self._forward,
+                x,
+                context,
+                additional_tokens,
+                n_times_crossframe_attn_in_self or 0,
+            )
+
+        # Only pass what the caller actually supplied so `_forward` defaults apply.
+        kwargs = {"x": x}
+        if context is not None:
+            kwargs["context"] = context
+        if additional_tokens is not None:
+            kwargs["additional_tokens"] = additional_tokens
+        if n_times_crossframe_attn_in_self:
+            kwargs["n_times_crossframe_attn_in_self"] = n_times_crossframe_attn_in_self
+        return self._forward(**kwargs)
+
+    def _forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0):
+        """Run attention, second attention and feed-forward, each with a residual."""
+        # spatial self-attn
+        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None,
+                       additional_tokens=additional_tokens,
+                       n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
+                       if not self.disable_self_attn else 0) + x
+        # spatial cross-attn
+        x = self.attn2(self.norm2(x), context=context, additional_tokens=additional_tokens) + x
+        # feedforward
+        x = self.ff(self.norm3(x)) + x
+        return x
+
+
+class SpatialTransformer(nn.Module):
+    """
+    Runs a stack of transformer blocks over an image-shaped feature map.
+
+    The input is normalised, projected to the attention width and flattened
+    from (b, c, h, w) to (b, h*w, c). After the blocks it is reshaped back to
+    (b, c, h, w), projected to the input width and added to the original input.
+
+    With `use_linear` the two projections are `nn.Linear` layers applied to the
+    flattened tokens; otherwise they are 1x1 convolutions applied to the map.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            n_heads,
+            d_head,
+            depth=1,
+            dropout=0.0,
+            context_dim=None,
+            disable_self_attn=False,
+            use_linear=False,
+            attn_type="softmax",
+            use_checkpoint=False,
+            sdp_backend=None,
+            add_lora=False,
+            action_control=False
+    ):
+        super().__init__()
+        print(f"Constructing {self.__class__.__name__} of depth {depth} w/ {in_channels} channels and {n_heads} heads")
+
+        from omegaconf import ListConfig
+
+        # A single context width is wrapped so that it can be indexed per block.
+        if exists(context_dim) and not isinstance(context_dim, (list, ListConfig)):
+            context_dim = [context_dim]
+        if exists(context_dim) and isinstance(context_dim, list):
+            if len(context_dim) != depth:
+                print(
+                    f"WARNING: "
+                    f"{self.__class__.__name__}: found context dims {context_dim} of depth {len(context_dim)}, "
+                    f"which does not match the specified depth of {depth}. "
+                    f"Setting context_dim to {depth * [context_dim[0]]} now"
+                )
+                # only a list of identical entries can be stretched to `depth`
+                assert all(
+                    entry == context_dim[0] for entry in context_dim
+                ), "Need homogenous context_dim to match depth automatically"
+                context_dim = depth * [context_dim[0]]
+        elif context_dim is None:
+            context_dim = [None] * depth
+
+        self.in_channels = in_channels
+        self.use_linear = use_linear
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)
+        self.proj_in = (
+            nn.Linear(in_channels, inner_dim)
+            if use_linear
+            else nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        )
+
+        # One block per depth level, each with its own context width.
+        blocks = []
+        for layer_idx in range(depth):
+            blocks.append(
+                BasicTransformerBlock(
+                    inner_dim,
+                    n_heads,
+                    d_head,
+                    dropout=dropout,
+                    context_dim=context_dim[layer_idx],
+                    disable_self_attn=disable_self_attn,
+                    attn_mode=attn_type,
+                    use_checkpoint=use_checkpoint,
+                    sdp_backend=sdp_backend,
+                    add_lora=add_lora,
+                    action_control=action_control
+                )
+            )
+        self.transformer_blocks = nn.ModuleList(blocks)
+
+        out_projection = (
+            nn.Linear(inner_dim, in_channels)
+            if use_linear
+            else nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        )
+        self.proj_out = zero_module(out_projection)
+
+    def forward(self, x, context=None):
+        """Apply the transformer stack to `x` and add the result to `x`.
+
+        `context` may be a list with one entry per block; a non-list value (or
+        a one-element list) is shared by every block.
+        """
+        # NOTE: if no context is given, cross-attn defaults to self-attn
+        contexts = context if isinstance(context, list) else [context]
+        _, _, height, width = x.shape
+        residual = x
+        x = self.norm(x)
+        if self.use_linear:
+            # linear projection works on the flattened tokens
+            x = rearrange(x, "b c h w -> b (h w) c").contiguous()
+            x = self.proj_in(x)
+        else:
+            # 1x1 convolution works on the feature map
+            x = self.proj_in(x)
+            x = rearrange(x, "b c h w -> b (h w) c").contiguous()
+
+        share_context = len(contexts) == 1
+        for idx, block in enumerate(self.transformer_blocks):
+            x = block(x, context=contexts[0 if share_context else idx])
+
+        if self.use_linear:
+            x = self.proj_out(x)
+            x = rearrange(x, "b (h w) c -> b c h w", h=height, w=width).contiguous()
+        else:
+            x = rearrange(x, "b (h w) c -> b c h w", h=height, w=width).contiguous()
+            x = self.proj_out(x)
+        return x + residual
diff --git a/vista/vwm/modules/autoencoding/regularizers/__init__.py b/vista/vwm/modules/autoencoding/regularizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78bf310ab8f6014c231e9e9af6384aed7e9be64b
--- /dev/null
+++ b/vista/vwm/modules/autoencoding/regularizers/__init__.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import Any, Tuple
+
+import torch
+from torch import nn
+
+from ....modules.distributions.distributions import DiagonalGaussianDistribution
+
+
+class AbstractRegularizer(nn.Module):
+    """Base class for regularizers; both hooks raise until overridden."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, dict]:
+        """Regularize `z`; subclasses return the new tensor and a log dict."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_trainable_parameters(self) -> Any:
+        """Return the parameters this regularizer wants optimised.
+
+        NOTE(review): `nn.Module` is not an `ABC`, so this decorator is
+        presumably not enforced at instantiation time - confirm.
+        """
+        raise NotImplementedError
+
+
+class DiagonalGaussianRegularizer(AbstractRegularizer):
+    """Treats `z` as a diagonal Gaussian posterior and reports its KL term."""
+
+    def __init__(self, sample: bool = True):
+        super().__init__()
+        # True: draw from the posterior; False: use its mode.
+        self.sample = sample
+
+    def get_trainable_parameters(self) -> Any:
+        """Yield nothing: this regularizer owns no trainable parameters."""
+        yield from ()
+
+    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, dict]:
+        """Return a sample (or the mode) of the posterior and a `kl_loss` log."""
+        posterior = DiagonalGaussianDistribution(z)
+        z = posterior.sample() if self.sample else posterior.mode()
+        kl = posterior.kl()
+        # summed KL divided by the size of the first dimension of the KL tensor
+        kl_loss = torch.sum(kl) / kl.shape[0]
+        return z, {"kl_loss": kl_loss}
diff --git a/vista/vwm/modules/autoencoding/temporal_ae.py b/vista/vwm/modules/autoencoding/temporal_ae.py
new file mode 100644
index 0000000000000000000000000000000000000000..20f658301e6fe66a702b5b424ae95f4d9b5961aa
--- /dev/null
+++ b/vista/vwm/modules/autoencoding/temporal_ae.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+from typing import Callable, Iterable, Union
+
+import torch
+from einops import rearrange
+
+from ...util import partialclass
+from ..diffusionmodules.model import Decoder, ResnetBlock
+from ..diffusionmodules.openaimodel import ResBlock
+
+
+class VideoResBlock(ResnetBlock):
+    """`ResnetBlock` followed by a 3D `ResBlock` over time, blended by a mix factor."""
+
+    def __init__(
+            self,
+            out_channels,
+            *args,
+            dropout=0.0,
+            video_kernel_size=3,
+            alpha=0.0,
+            merge_strategy="learned",
+            **kwargs
+    ):
+        super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)
+        kernel = [3, 1, 1] if video_kernel_size is None else video_kernel_size
+        self.time_stack = ResBlock(
+            channels=out_channels,
+            emb_channels=0,
+            dropout=dropout,
+            dims=3,
+            use_scale_shift_norm=False,
+            use_conv=False,
+            up=False,
+            down=False,
+            kernel_size=kernel,
+            use_checkpoint=False,
+            skip_t_emb=True
+        )
+
+        self.merge_strategy = merge_strategy
+        if merge_strategy == "fixed":
+            # constant blend weight, stored as a buffer
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif merge_strategy == "learned":
+            # trainable blend weight, squashed by a sigmoid in `get_alpha`
+            self.register_parameter("mix_factor", torch.nn.Parameter(torch.Tensor([alpha])))
+        else:
+            raise ValueError(f"Unknown merge strategy {self.merge_strategy}")
+
+    def get_alpha(self):
+        """Return the blend weight: raw when fixed, sigmoid-ed when learned."""
+        strategy = self.merge_strategy
+        if strategy == "fixed":
+            return self.mix_factor
+        if strategy == "learned":
+            return torch.sigmoid(self.mix_factor)
+        raise NotImplementedError
+
+    def forward(self, x, temb, skip_video=False, timesteps=None):
+        """Apply the 2D block and, unless `skip_video`, blend in the temporal block.
+
+        `timesteps` is the `t` used to split the leading `(b t)` axis; when it
+        is None, `self.timesteps` is read instead (not assigned in this class -
+        presumably set by the owner; verify against callers).
+        """
+        if timesteps is None:
+            timesteps = self.timesteps
+
+        x = super().forward(x, temb)
+        if skip_video:
+            return x
+
+        spatial = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
+        temporal = self.time_stack(spatial, temb)
+
+        alpha = self.get_alpha()
+        blended = alpha * temporal + (1.0 - alpha) * spatial
+        return rearrange(blended, "b c t h w -> (b t) c h w")
+
+
+class AE3DConv(torch.nn.Conv2d):
+    """2D convolution followed by a 3D convolution across the time axis."""
+
+    def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
+        super().__init__(in_channels, out_channels, *args, **kwargs)
+
+        def half(size):
+            # padding of half the kernel size
+            return int(size // 2)
+
+        if isinstance(video_kernel_size, Iterable):
+            time_padding = [half(size) for size in video_kernel_size]
+        else:
+            time_padding = half(video_kernel_size)
+
+        self.time_mix_conv = torch.nn.Conv3d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=video_kernel_size,
+            padding=time_padding
+        )
+
+    def forward(self, input, timesteps, skip_video=False):
+        """Run the 2D conv; unless `skip_video`, also mix across `timesteps` frames."""
+        frames = super().forward(input)
+        if skip_video:
+            return frames
+        # split the leading (b t) axis so the 3D conv sees time as its own axis
+        clip = rearrange(frames, "(b t) c h w -> b c t h w", t=timesteps)
+        clip = self.time_mix_conv(clip)
+        return rearrange(clip, "b c t h w -> (b t) c h w")
+
+
+class Conv2DWrapper(torch.nn.Conv2d):
+    """`Conv2d` whose `forward` tolerates (and ignores) extra keyword arguments."""
+
+    def forward(self, input: torch.Tensor, **kwargs) -> torch.Tensor:
+        # extra keywords (e.g. those accepted by `AE3DConv.forward`) are dropped
+        del kwargs
+        return torch.nn.Conv2d.forward(self, input)
+
+
+class VideoDecoder(Decoder):
+    """`Decoder` whose conv / resblock factories can return time-aware variants."""
+
+    available_time_modes = ["all", "conv-only", "attn-only"]
+
+    def __init__(
+            self,
+            *args,
+            video_kernel_size: Union[int, list] = 3,
+            alpha: float = 0.0,
+            merge_strategy: str = "learned",
+            time_mode: str = "conv-only",
+            **kwargs
+    ):
+        # These are stored before the base constructor runs - presumably because
+        # `Decoder.__init__` calls the `_make_*` factories below; verify.
+        self.video_kernel_size = video_kernel_size
+        self.alpha = alpha
+        self.merge_strategy = merge_strategy
+        self.time_mode = time_mode
+        assert (
+                self.time_mode in self.available_time_modes
+        ), f"time_mode parameter has to be in {self.available_time_modes}"
+        super().__init__(*args, **kwargs)
+
+    def get_last_layer(self, skip_time_mix=False, **kwargs):
+        """Return the weight of the output conv (its time-mix conv by default)."""
+        if self.time_mode == "attn-only":
+            raise NotImplementedError
+        if skip_time_mix:
+            return self.conv_out.weight
+        return self.conv_out.time_mix_conv.weight
+
+    def _make_conv(self) -> Callable:
+        """Pick the conv class: plain wrapper for "attn-only", `AE3DConv` otherwise."""
+        if self.time_mode == "attn-only":
+            return Conv2DWrapper
+        return partialclass(AE3DConv, video_kernel_size=self.video_kernel_size)
+
+    def _make_resblock(self) -> Callable:
+        """Pick the resblock class: the base one for the listed modes, else `VideoResBlock`."""
+        if self.time_mode in ("attn-only", "only-last-conv"):
+            return super()._make_resblock()
+        return partialclass(
+            VideoResBlock,
+            video_kernel_size=self.video_kernel_size,
+            alpha=self.alpha,
+            merge_strategy=self.merge_strategy
+        )
diff --git a/vista/vwm/modules/diffusionmodules/denoiser.py b/vista/vwm/modules/diffusionmodules/denoiser.py
new file mode 100644
index 0000000000000000000000000000000000000000..a513072bacd8d3f15a81300b4087da429a816408
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/denoiser.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from typing import Union
+
+import torch
+import torch.nn as nn
+
+from ...util import append_dims, instantiate_from_config
+from .denoiser_scaling import DenoiserScaling
+
+
+class Denoiser(nn.Module):
+    """Wraps a network call with the input/output scalings of a `DenoiserScaling`."""
+
+    def __init__(self, scaling_config: dict, num_frames: int = 25):
+        super().__init__()
+        self.scaling: DenoiserScaling = instantiate_from_config(scaling_config)
+        # passed to the network as its last positional argument
+        self.num_frames = num_frames
+
+    def possibly_quantize_sigma(self, sigma: torch.Tensor) -> torch.Tensor:
+        """Identity here; subclasses may snap `sigma` to a discrete set."""
+        return sigma
+
+    def possibly_quantize_c_noise(self, c_noise: torch.Tensor) -> torch.Tensor:
+        """Identity here; subclasses may quantize the noise conditioning."""
+        return c_noise
+
+    def forward(
+            self,
+            network: nn.Module,
+            noised_input: torch.Tensor,
+            sigma: torch.Tensor,
+            cond: dict,
+            cond_mask: torch.Tensor
+    ):
+        """Return `network(input * c_in, c_noise, ...) * c_out + input * c_skip`."""
+        sigma = self.possibly_quantize_sigma(sigma)
+        original_shape = sigma.shape
+        # expand sigma so that the scalings broadcast against the input
+        sigma = append_dims(sigma, noised_input.ndim)
+        c_skip, c_out, c_in, c_noise = self.scaling(sigma)
+        # the noise conditioning goes back to sigma's pre-expansion shape
+        c_noise = self.possibly_quantize_c_noise(c_noise.reshape(original_shape))
+        network_output = network(noised_input * c_in, c_noise, cond, cond_mask, self.num_frames)
+        return network_output * c_out + noised_input * c_skip
+
+
+class DiscreteDenoiser(Denoiser):
+    """`Denoiser` that snaps sigmas (and optionally `c_noise`) to a fixed table.
+
+    The table is produced by the discretization built from
+    `discretization_config` and stored as the `sigmas` buffer.
+    """
+
+    def __init__(
+            self,
+            scaling_config: dict,
+            num_idx: int,
+            discretization_config: dict,
+            do_append_zero: bool = False,
+            quantize_c_noise: bool = True,
+            flip: bool = True,
+            num_frames: int = 25
+    ):
+        """
+        Args:
+            scaling_config: Config of the scaling, handed to `Denoiser`.
+            num_idx: Number of sigmas requested from the discretization.
+            discretization_config: Config of the callable that builds the table.
+            do_append_zero: Forwarded to the discretization.
+            quantize_c_noise: Replace `c_noise` by its nearest table index.
+            flip: Forwarded to the discretization.
+            num_frames: Forwarded to `Denoiser`; previously this could not be
+                set here and was always the base-class default of 25.
+        """
+        super().__init__(scaling_config, num_frames=num_frames)
+        sigmas = instantiate_from_config(discretization_config)(
+            num_idx, do_append_zero=do_append_zero, flip=flip
+        )
+        self.register_buffer("sigmas", sigmas)
+        self.quantize_c_noise = quantize_c_noise
+
+    def sigma_to_idx(self, sigma: torch.Tensor) -> torch.Tensor:
+        """Return, for each entry of `sigma`, the index of the nearest table value."""
+        # broadcast every sigma against the table column, then pick the closest row
+        dists = sigma - self.sigmas[:, None]
+        return dists.abs().argmin(dim=0).view(sigma.shape)
+
+    def idx_to_sigma(self, idx: Union[torch.Tensor, int]) -> torch.Tensor:
+        """Look up table values by index."""
+        return self.sigmas[idx]
+
+    def possibly_quantize_sigma(self, sigma: torch.Tensor) -> torch.Tensor:
+        """Replace each sigma by its nearest table value."""
+        return self.idx_to_sigma(self.sigma_to_idx(sigma))
+
+    def possibly_quantize_c_noise(self, c_noise: torch.Tensor) -> torch.Tensor:
+        """Return nearest table indices when `quantize_c_noise`, else `c_noise` as is."""
+        if self.quantize_c_noise:
+            return self.sigma_to_idx(c_noise)
+        return c_noise
diff --git a/vista/vwm/modules/diffusionmodules/denoiser_scaling.py b/vista/vwm/modules/diffusionmodules/denoiser_scaling.py
new file mode 100644
index 0000000000000000000000000000000000000000..90d2a9e0a534a338e0e829e605131d0d68e652bb
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/denoiser_scaling.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class DenoiserScaling(ABC):
+    """Interface of the scalings: maps `sigma` to `(c_skip, c_out, c_in, c_noise)`."""
+
+    @abstractmethod
+    def __call__(
+            self, sigma: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Compute the four scaling tensors for `sigma` (implemented by subclasses)."""
+
+
+class EDMScaling:
+    """Scaling whose skip/out/in factors depend on `sigma` and `sigma_data`."""
+
+    def __init__(self, sigma_data: float = 0.5):
+        self.sigma_data = sigma_data
+
+    def __call__(
+            self, sigma: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return `(c_skip, c_out, c_in, c_noise)` for `sigma`."""
+        data_std = self.sigma_data
+        total_var = sigma ** 2 + data_std ** 2
+        total_std = total_var ** 0.5
+        return (
+            data_std ** 2 / total_var,
+            sigma * data_std / total_std,
+            1 / total_std,
+            0.25 * sigma.log(),
+        )
+
+
+class EpsScaling:
+    """Scaling with a unit skip factor and `-sigma` as the output factor."""
+
+    def __call__(
+            self, sigma: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return `(c_skip, c_out, c_in, c_noise)` for `sigma`."""
+        unit_skip = torch.ones_like(sigma, device=sigma.device)
+        input_scale = 1 / (sigma ** 2 + 1.0) ** 0.5
+        # c_noise is a copy of sigma, not an alias of it
+        return unit_skip, -sigma, input_scale, sigma.clone()
+
+
+class VScaling:
+    """Scaling whose factors all derive from `sigma ** 2 + 1`."""
+
+    def __call__(
+            self, sigma: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return `(c_skip, c_out, c_in, c_noise)` for `sigma`."""
+        total_var = sigma ** 2 + 1.0
+        total_std = total_var ** 0.5
+        skip = 1.0 / total_var
+        out_scale = -sigma / total_std
+        in_scale = 1.0 / total_std
+        return skip, out_scale, in_scale, sigma.clone()
+
+
+class VScalingWithEDMcNoise(DenoiserScaling):
+    """Same skip/out/in factors as `VScaling`, with `0.25 * log(sigma)` as `c_noise`."""
+
+    def __call__(
+            self, sigma: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return `(c_skip, c_out, c_in, c_noise)` for `sigma`."""
+        total_var = sigma ** 2 + 1.0
+        total_std = total_var ** 0.5
+        skip = 1.0 / total_var
+        out_scale = -sigma / total_std
+        in_scale = 1.0 / total_std
+        return skip, out_scale, in_scale, 0.25 * sigma.log()
diff --git a/vista/vwm/modules/diffusionmodules/discretizer.py b/vista/vwm/modules/diffusionmodules/discretizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa66edbc43096183db73edec5d937d21cba30235
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/discretizer.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from functools import partial
+
+import numpy as np
+import torch
+
+from ...util import append_zero
+from .util import make_beta_schedule
+
+
+def generate_roughly_equally_spaced_steps(num_substeps: int, max_step: int) -> np.ndarray:
+    """Return `num_substeps` ascending integer steps ending at `max_step - 1`."""
+    # sample downwards from the last step (0 excluded), then reverse the order
+    descending = np.linspace(max_step - 1, 0, num_substeps, endpoint=False, dtype=int)
+    return descending[::-1]
+
+
+class Discretization:
+    """Callable base class: builds sigmas via `get_sigmas`, then post-processes them."""
+
+    def __call__(self, n, do_append_zero=True, device="cpu", flip=False):
+        """Return `n` sigmas, optionally with a zero appended and/or flipped."""
+        sigmas = self.get_sigmas(n, device=device)
+        if do_append_zero:
+            sigmas = append_zero(sigmas)
+        if flip:
+            return torch.flip(sigmas, (0,))
+        return sigmas
+
+    @abstractmethod
+    def get_sigmas(self, n, device):
+        """Produce the `n` sigmas on `device` (implemented by subclasses)."""
+
+
+class EDMDiscretization(Discretization):
+    """Sigmas running from `sigma_max` to `sigma_min`, spaced in `1 / rho` power."""
+
+    def __init__(self, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+        self.sigma_min = sigma_min
+        self.sigma_max = sigma_max
+        self.rho = rho
+
+    def get_sigmas(self, n, device="cpu"):
+        """Return `n` sigmas; the ramp starts at `sigma_max` and ends at `sigma_min`."""
+        rho = self.rho
+        low = self.sigma_min ** (1 / rho)
+        high = self.sigma_max ** (1 / rho)
+        # interpolate linearly in the rho-th root, then raise back to the power rho
+        ramp = torch.linspace(0, 1, n, device=device)
+        return (high + ramp * (low - high)) ** rho
+
+
+class LegacyDDPMDiscretization(Discretization):
+    """Sigmas derived from the cumulative alphas of a "scaled_linear" beta schedule."""
+
+    def __init__(self, linear_start=0.00085, linear_end=0.0120, num_timesteps=1000):
+        """Precompute the cumulative product of `1 - beta` over `num_timesteps` steps."""
+        super().__init__()
+        self.num_timesteps = num_timesteps
+        betas = make_beta_schedule(
+            "scaled_linear", num_timesteps, linear_start=linear_start, linear_end=linear_end
+        )
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        # Not used inside this class; kept because it is a public attribute.
+        self.to_torch = partial(torch.tensor, dtype=torch.float32)
+
+    def get_sigmas(self, n, device="cpu"):
+        """Return `n` sigmas on `device`, reversed relative to timestep order.
+
+        Raises:
+            ValueError: If `n` exceeds the number of timesteps of the schedule.
+        """
+        if n > self.num_timesteps:
+            raise ValueError(
+                f"Cannot produce {n} sigmas from a schedule with only {self.num_timesteps} timesteps"
+            )
+
+        if n < self.num_timesteps:
+            # subsample the schedule at roughly equally spaced timesteps
+            timesteps = generate_roughly_equally_spaced_steps(n, self.num_timesteps)
+            alphas_cumprod = self.alphas_cumprod[timesteps]
+        else:
+            alphas_cumprod = self.alphas_cumprod
+
+        to_torch = partial(torch.tensor, dtype=torch.float32, device=device)
+        sigmas = to_torch((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
+        return torch.flip(sigmas, (0,))
diff --git a/vista/vwm/modules/diffusionmodules/guiders.py b/vista/vwm/modules/diffusionmodules/guiders.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e2c67ba8308fca587b3125600ecffd459473660
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/guiders.py
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Literal, Optional, Union
+
+import torch
+from einops import rearrange, repeat
+
+from ...util import append_dims, default
+
+
+class Guider(ABC):
+    """Interface of the guiders: combine model outputs and prepare model inputs."""
+
+    @abstractmethod
+    def __call__(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
+        """Turn the model output `x` into the guided prediction (subclasses implement)."""
+
+    def prepare_inputs(self, x, s, c, cond_mask, uc):
+        """Hook for arranging the model inputs; does nothing and returns None here."""
+
+
+class VanillaCFG(Guider):
+    """Guidance with one constant scale applied to the whole batch."""
+
+    def __init__(self, scale: float):
+        self.scale = scale
+
+    def __call__(self, x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+        """Split `x` into two halves and extrapolate from the first towards the second."""
+        # `prepare_inputs` stacks the `uc` entries first, then the `c` entries
+        uncond, cond = x.chunk(2)
+        return uncond + self.scale * (cond - uncond)
+
+    def prepare_inputs(self, x, s, c, cond_mask, uc):
+        """Double `x`, `s` and `cond_mask`, and stack `uc` before `c` for known keys."""
+        stacked_keys = ["vector", "crossattn", "concat"]
+        merged = dict()
+        for key in c:
+            if key not in stacked_keys:
+                # every other entry must agree between the two branches
+                assert c[key] == uc[key]
+                merged[key] = c[key]
+            else:
+                merged[key] = torch.cat((uc[key], c[key]), 0)
+        return torch.cat([x] * 2), torch.cat([s] * 2), merged, torch.cat([cond_mask] * 2)
+
+
+class IdentityGuider(Guider):
+    """Guider that leaves the model output and its inputs unchanged."""
+
+    def __call__(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
+        """Return `x` as is."""
+        return x
+
+    def prepare_inputs(self, x, s, c, cond_mask, uc):
+        """Return the inputs unchanged, with `c` copied into a new dict; `uc` is unused."""
+        copied = {key: c[key] for key in c}
+        return x, s, copied, cond_mask
+
+
+class LinearPredictionGuider(Guider):
+    """Guidance whose scale ramps linearly from `min_scale` to `max_scale` over frames."""
+
+    def __init__(
+            self,
+            num_frames: int = 25,
+            max_scale: float = 2.5,
+            min_scale: float = 1.0,
+            additional_cond_keys: Optional[Union[list[str], str]] = None
+    ):
+        self.min_scale = min_scale
+        self.max_scale = max_scale
+        self.num_frames = num_frames
+        # one scale per frame, with a leading axis of size 1
+        self.scale = torch.linspace(min_scale, max_scale, num_frames).unsqueeze(0)
+
+        extra_keys = default(additional_cond_keys, list())
+        # a single key may be given as a bare string
+        self.additional_cond_keys = [extra_keys] if isinstance(extra_keys, str) else extra_keys
+
+    def __call__(self, x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+        """Split `x` into two halves and blend them with the per-frame scale."""
+        # `prepare_inputs` stacks the `uc` entries first, then the `c` entries
+        uncond, cond = x.chunk(2)
+        uncond = rearrange(uncond, "(b t) ... -> b t ...", t=self.num_frames)
+        cond = rearrange(cond, "(b t) ... -> b t ...", t=self.num_frames)
+        per_frame = repeat(self.scale, "1 t -> b t", b=uncond.shape[0])
+        per_frame = append_dims(per_frame, uncond.ndim).to(uncond.device)
+        guided = uncond + per_frame * (cond - uncond)
+        return rearrange(guided, "b t ... -> (b t) ...")
+
+    def prepare_inputs(self, x, s, c, cond_mask, uc):
+        """Double `x`, `s` and `cond_mask`, and stack `uc` before `c` for known keys."""
+        merged = dict()
+        for key in c:
+            if key not in ["vector", "crossattn", "concat"] + self.additional_cond_keys:
+                # every other entry must agree between the two branches
+                assert c[key] == uc[key]
+                merged[key] = c[key]
+            else:
+                merged[key] = torch.cat((uc[key], c[key]), 0)
+        return torch.cat([x] * 2), torch.cat([s] * 2), merged, torch.cat([cond_mask] * 2)
+
+
+class TrianglePredictionGuider(LinearPredictionGuider):
+    """Guidance whose per-frame scale follows one or more fused triangle waves."""
+
+    def __init__(
+            self,
+            num_frames: int = 25,
+            max_scale: float = 2.5,
+            min_scale: float = 1.0,
+            period: float = 1.0,
+            period_fusing: Literal["mean", "multiply", "max"] = "max",
+            additional_cond_keys: Optional[Union[list[str], str]] = None
+    ):
+        """
+        Args:
+            num_frames: Number of frames, i.e. number of scale values.
+            max_scale: Scale assigned where the fused wave equals 1.
+            min_scale: Scale assigned where the fused wave equals 0.
+            period: One wave period, or an iterable of periods to fuse.
+            period_fusing: How several waves are combined: "mean", "multiply" or "max".
+            additional_cond_keys: Forwarded to `LinearPredictionGuider`.
+
+        Raises:
+            NotImplementedError: If `period_fusing` is not one of the three modes.
+        """
+        super().__init__(num_frames, max_scale, min_scale, additional_cond_keys)
+        values = torch.linspace(0, 1, num_frames)
+        # A bare number means a single wave. Ints are accepted as well as floats
+        # so that `period=1` is not mistaken for an iterable of periods.
+        if isinstance(period, (int, float)):
+            period = [period]
+
+        # constructs one triangle wave per period
+        scales = [self.triangle_wave(values, p) for p in period]
+
+        if period_fusing == "mean":
+            scale = sum(scales) / len(period)
+        elif period_fusing == "multiply":
+            scale = torch.prod(torch.stack(scales), dim=0)
+        elif period_fusing == "max":
+            scale = torch.max(torch.stack(scales), dim=0).values
+        else:
+            raise NotImplementedError
+        # overrides the linear ramp set by the base class
+        self.scale = (scale * (max_scale - min_scale) + min_scale).unsqueeze(0)
+
+    def triangle_wave(self, values: torch.Tensor, period) -> torch.Tensor:
+        """Evaluate `2 * |v / period - floor(v / period + 0.5)|` for every `v` in `values`."""
+        return 2 * (values / period - torch.floor(values / period + 0.5)).abs()
diff --git a/vista/vwm/modules/diffusionmodules/loss.py b/vista/vwm/modules/diffusionmodules/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff010d988e5d0e45f5ea50325a147349aaf1271b
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/loss.py
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import random
+from typing import Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from vwm.modules.diffusionmodules.util import fourier_filter
+from vwm.modules.encoders.modules import GeneralConditioner
+from vwm.util import append_dims, instantiate_from_config
+
+from .denoiser import Denoiser
+
+
+class StandardDiffusionLoss(nn.Module):
+    """Denoising loss (l1 or l2) with optional conditioning frames and auxiliary terms.
+
+    With `replace_cond_frames`, randomly chosen frames receive no noise and are
+    replaced by the input in the prediction. With `use_additional_loss`, the
+    per-element error is re-weighted by frame-difference errors and a
+    `fourier_filter`-based term is added.
+    """
+
+    def __init__(
+            self,
+            sigma_sampler_config: dict,
+            loss_weighting_config: dict,
+            loss_type: str = "l2",
+            use_additional_loss: bool = False,
+            offset_noise_level: float = 0.0,
+            additional_loss_weight: float = 0.0,
+            num_frames: int = 25,
+            replace_cond_frames: bool = False,
+            cond_frames_choices: Union[list, None] = None
+    ):
+        """
+        Args:
+            sigma_sampler_config: Config of the callable that samples sigmas.
+            loss_weighting_config: Config of the callable mapping sigmas to weights.
+            loss_type: "l2" (squared error) or "l1" (absolute error).
+            use_additional_loss: Enable the frame-difference weighting and the
+                `fourier_filter` term in `get_loss`.
+            offset_noise_level: Strength of the per-channel noise offset; 0 disables it.
+            additional_loss_weight: Weight of the `fourier_filter` term.
+            num_frames: Number of frames folded into the leading `(b t)` axis.
+            replace_cond_frames: Enable random conditioning frames.
+            cond_frames_choices: Candidate lists of conditioning frame indices;
+                later entries are sampled with higher probability.
+        """
+        super().__init__()
+        assert loss_type in ["l2", "l1"]
+        self.loss_type = loss_type
+        self.use_additional_loss = use_additional_loss
+
+        self.sigma_sampler = instantiate_from_config(sigma_sampler_config)
+        self.loss_weighting = instantiate_from_config(loss_weighting_config)
+
+        self.offset_noise_level = offset_noise_level
+        self.additional_loss_weight = additional_loss_weight
+        self.num_frames = num_frames
+        self.replace_cond_frames = replace_cond_frames
+        self.cond_frames_choices = cond_frames_choices
+
+    def get_noised_input(
+            self,
+            sigmas_bc: torch.Tensor,
+            noise: torch.Tensor,
+            input: torch.Tensor
+    ) -> torch.Tensor:
+        """Return `input + noise * sigmas_bc`."""
+        noised_input = input + noise * sigmas_bc
+        return noised_input
+
+    def forward(
+            self,
+            network: nn.Module,
+            denoiser: Denoiser,
+            conditioner: GeneralConditioner,
+            input: torch.Tensor,
+            batch: dict
+    ) -> torch.Tensor:
+        """Compute the conditioning from `batch` and return the loss for `input`."""
+        cond = conditioner(batch)
+        return self._forward(network, denoiser, cond, input)
+
+    def _sample_cond_mask(self, cond_mask: torch.Tensor) -> torch.Tensor:
+        """Set randomly chosen conditioning frames of every sequence to 1."""
+        cond_mask = rearrange(cond_mask, "(b t) -> b t", t=self.num_frames)
+        assert len(self.cond_frames_choices[-1]) < self.num_frames
+        # the weight doubles with the position of the choice in the list
+        weights = [2 ** n for n in range(len(self.cond_frames_choices))]
+        for each_cond_mask in cond_mask:
+            cond_indices = random.choices(self.cond_frames_choices, weights=weights, k=1)[0]
+            if cond_indices:
+                each_cond_mask[cond_indices] = 1
+        return rearrange(cond_mask, "b t -> (b t)")
+
+    def _forward(
+            self,
+            network: nn.Module,
+            denoiser: Denoiser,
+            cond: dict,
+            input: torch.Tensor
+    ):
+        """Noise `input`, run the denoiser and return the weighted loss."""
+        sigmas = self.sigma_sampler(input.shape[0]).to(input)
+        cond_mask = torch.zeros_like(sigmas)
+        if self.replace_cond_frames:
+            cond_mask = self._sample_cond_mask(cond_mask)
+        noise = torch.randn_like(input)
+        if self.offset_noise_level > 0.0:  # the entire channel is shifted together
+            offset_shape = (input.shape[0], input.shape[1])
+            rand_init = torch.randn(offset_shape, device=input.device)
+            noise = noise + self.offset_noise_level * append_dims(rand_init, input.ndim)
+        if self.replace_cond_frames:
+            # conditioning frames get a sigma of zero, i.e. stay noise-free
+            sigmas_bc = append_dims((1 - cond_mask) * sigmas, input.ndim)
+        else:
+            sigmas_bc = append_dims(sigmas, input.ndim)
+        noised_input = self.get_noised_input(sigmas_bc, noise, input)
+
+        model_output = denoiser(network, noised_input, sigmas, cond, cond_mask)
+        w = append_dims(self.loss_weighting(sigmas), input.ndim)
+
+        if self.replace_cond_frames:  # ignore mask predictions
+            predict = model_output * append_dims(1 - cond_mask, input.ndim) + input * append_dims(cond_mask, input.ndim)
+        else:
+            predict = model_output
+        return self.get_loss(predict, input, w)
+
+    def _elementwise_error(self, diff: torch.Tensor) -> torch.Tensor:
+        """Return the squared ("l2") or absolute ("l1") value of `diff`."""
+        if self.loss_type == "l2":
+            return diff ** 2
+        if self.loss_type == "l1":
+            return diff.abs()
+        raise NotImplementedError(f"Unknown loss type {self.loss_type}")
+
+    def _frame_difference_weights(self, predict: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """Return per-element weights `1 + normalized frame-difference error`.
+
+        The first frame of every sequence has no predecessor and gets weight 1.
+        The result has shape `(target.shape[0], -1)`.
+        """
+        norm_p = 2 if self.loss_type == "l2" else 1
+        predict_seq = rearrange(predict, "(b t) ... -> b t ...", t=self.num_frames)
+        target_seq = rearrange(target, "(b t) ... -> b t ...", t=self.num_frames)
+        bs = target.shape[0] // self.num_frames
+        aux_loss = self._elementwise_error(
+            (target_seq[:, 1:] - target_seq[:, :-1]) - (predict_seq[:, 1:] - predict_seq[:, :-1])
+        )
+        tmp_h, tmp_w = aux_loss.shape[-2], aux_loss.shape[-1]
+        # No fixed channel count is imposed here (it used to be pinned to 4),
+        # so inputs with any number of channels are accepted.
+        aux_loss = rearrange(aux_loss, "b t c h w -> b (t h w) c")
+        # `F.normalize` is called without `dim`, so its default axis is used,
+        # which here is the flattened (t h w) axis
+        aux_w = F.normalize(aux_loss, p=norm_p)
+        aux_w = rearrange(aux_w, "b (t h w) c -> b t c h w", t=self.num_frames - 1, h=tmp_h, w=tmp_w)
+        aux_w = 1 + torch.cat((torch.zeros(bs, 1, *aux_w.shape[2:]).to(aux_w), aux_w), dim=1)
+        return rearrange(aux_w, "b t ... -> (b t) ...").reshape(target.shape[0], -1)
+
+    def get_loss(self, predict, target, w):
+        """Return the weighted l1/l2 loss between `predict` and `target`.
+
+        Without `use_additional_loss` the result holds one value per entry of
+        the leading axis. With it, a scalar is returned that combines the
+        re-weighted error and `additional_loss_weight` times the
+        `fourier_filter` error.
+
+        Raises:
+            NotImplementedError: If `loss_type` is neither "l2" nor "l1".
+        """
+        num_samples = target.shape[0]
+        per_element = (w * self._elementwise_error(predict - target)).reshape(num_samples, -1)
+        if not self.use_additional_loss:
+            return torch.mean(per_element, 1)
+
+        aux_w = self._frame_difference_weights(predict, target)
+        predict_hf = fourier_filter(predict, scale=0.)
+        target_hf = fourier_filter(target, scale=0.)
+        hf_loss = torch.mean(
+            (w * self._elementwise_error(predict_hf - target_hf)).reshape(num_samples, -1), 1
+        ).mean()
+        # the weights are detached so that no gradient flows through them
+        return torch.mean(per_element * aux_w.detach(), 1).mean() + self.additional_loss_weight * hf_loss
diff --git a/vista/vwm/modules/diffusionmodules/loss_weighting.py b/vista/vwm/modules/diffusionmodules/loss_weighting.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb14db9ba739b7cc9abc68c0527372acfdf1e7bb
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/loss_weighting.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class DiffusionLossWeighting(ABC):
+    @abstractmethod
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        """Return the loss weight for each noise level in `sigma`."""
+
+
+class UnitWeighting(DiffusionLossWeighting):
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        return torch.ones_like(sigma)  # constant weight of 1; shape, dtype and device follow `sigma`
+
+
+class EDMWeighting(DiffusionLossWeighting):  # weight = (sigma^2 + sigma_data^2) / (sigma * sigma_data)^2
+    def __init__(self, sigma_data: float = 0.5) -> None:
+        self.sigma_data = sigma_data
+
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        return (sigma ** 2 + self.sigma_data ** 2) / (sigma * self.sigma_data) ** 2
+
+
+class VWeighting(EDMWeighting):  # EDMWeighting with sigma_data fixed to 1.0
+    def __init__(self) -> None:
+        super().__init__(sigma_data=1.0)
+
+
+class EpsWeighting(DiffusionLossWeighting):
+    def __call__(self, sigma: torch.Tensor) -> torch.Tensor:
+        return sigma ** -2.0  # i.e. 1 / sigma^2
diff --git a/vista/vwm/modules/diffusionmodules/model.py b/vista/vwm/modules/diffusionmodules/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9043e918bd187afb4c0dba785a723375e7660201
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/model.py
@@ -0,0 +1,694 @@
+# pytorch_diffusion + derived encoder decoder
+from __future__ import annotations
+
+import math
+from typing import Any, Callable, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from packaging import version
+
+try:
+    import xformers
+    import xformers.ops
+
+    XFORMERS_IS_AVAILABLE = True
+except Exception:  # any import-time failure disables xformers; KeyboardInterrupt/SystemExit still propagate
+    XFORMERS_IS_AVAILABLE = False
+    print("No module `xformers`, processing without it")
+
+from ..attention import LinearAttention, MemoryEfficientCrossAttention
+
+
+def get_timestep_embedding(timesteps, embedding_dim):
+    """
+    Build sinusoidal timestep embeddings of width `embedding_dim`.
+
+    `timesteps` must be a 1-D tensor; the result has one row per timestep, with
+    the sine features in the first half of each row and the cosine features in
+    the second half. An odd `embedding_dim` gets one trailing zero column.
+    """
+
+    assert timesteps.dim() == 1
+
+    half_dim = embedding_dim // 2
+    log_step = math.log(10000) / (half_dim - 1)
+    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -log_step)
+    freqs = freqs.to(device=timesteps.device)
+    angles = timesteps.float()[:, None] * freqs[None]
+    table = torch.cat((torch.sin(angles), torch.cos(angles)), dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        table = F.pad(table, (0, 1, 0, 0))
+    return table
+
+
+def nonlinearity(x):
+    """Swish activation: `x` scaled elementwise by its own sigmoid."""
+    return torch.sigmoid(x) * x
+
+
+def Normalize(in_channels, num_groups=32):
+    return nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)  # GroupNorm with learnable scale/shift
+
+
+class Upsample(nn.Module):
+    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 convolution."""
+
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if with_conv:
+            self.conv = nn.Conv2d(in_channels, in_channels, 3, stride=1, padding=1)
+
+    def forward(self, x):
+        upsampled = F.interpolate(x, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled) if self.with_conv else upsampled
+
+
+class Downsample(nn.Module):
+    """Halve the spatial size with a strided 3x3 convolution or 2x2 average pooling."""
+
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = nn.Conv2d(in_channels, in_channels, 3, stride=2, padding=0)
+
+    def forward(self, x):
+        if not self.with_conv:
+            return F.avg_pool2d(x, kernel_size=2, stride=2)
+        # zero-pad one column on the right and one row at the bottom before the conv
+        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+
+
+class ResnetBlock(nn.Module):
+    """Two norm/swish/conv stages with a residual connection and optional timestep-embedding bias."""
+
+    def __init__(
+            self,
+            *,
+            in_channels,
+            out_channels=None,
+            conv_shortcut=False,
+            dropout,
+            temb_channels=512
+    ):
+        super().__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels > 0:
+            self.temb_proj = nn.Linear(temb_channels, out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        # a projection on the skip path is only needed when the channel count changes
+        if in_channels != out_channels:
+            if conv_shortcut:
+                self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            else:
+                self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x, temb):
+        h = self.conv1(nonlinearity(self.norm1(x)))
+
+        # the projected timestep embedding is broadcast over the two spatial axes
+        if temb is not None:
+            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
+
+        h = self.conv2(self.dropout(nonlinearity(self.norm2(h))))
+
+        if self.in_channels == self.out_channels:
+            return x + h
+        if self.use_conv_shortcut:
+            skip = self.conv_shortcut(x)
+        else:
+            skip = self.nin_shortcut(x)
+        return skip + h
+
+
+class LinAttnBlock(LinearAttention):
+    """Single-head LinearAttention built from `in_channels` alone, to match AttnBlock usage."""
+
+    def __init__(self, in_channels):
+        super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
+
+
+class AttnBlock(nn.Module):
+    """Single-head self-attention over all spatial positions, added back onto the input."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = Normalize(in_channels)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+
+    def attention(self, h_: torch.Tensor) -> torch.Tensor:
+        normed = self.norm(h_)
+        b, c, h, w = normed.shape
+
+        def to_tokens(conv):
+            # project, then flatten the spatial grid into one token per position
+            return rearrange(conv(normed), "b c h w -> b 1 (h w) c").contiguous()
+
+        q, k, v = to_tokens(self.q), to_tokens(self.k), to_tokens(self.v)
+        # scale is dim ** -0.5 per default
+        out = F.scaled_dot_product_attention(q, k, v)
+        return rearrange(out, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+
+    def forward(self, x, **kwargs):
+        # extra keyword arguments are accepted but ignored
+        attended = self.attention(x)
+        return x + self.proj_out(attended)
+
+
+class MemoryEfficientAttnBlock(nn.Module):
+    """
+    Uses xformers efficient implementation,
+    see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223.
+
+    NOTE: this is a single-head self-attn operation.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = Normalize(in_channels)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.attention_op: Optional[Any] = None  # forwarded as `op=` to xformers; nothing in this class sets it
+
+    def attention(self, h_: torch.Tensor) -> torch.Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        # compute attention
+        B, C, H, W = q.shape
+        q, k, v = map(lambda x: rearrange(x, "b c h w -> b (h w) c"), (q, k, v))
+
+        q, k, v = map(  # route through a singleton head axis; the result is again (B, H*W, C), made contiguous
+            lambda t: t.unsqueeze(3)
+            .reshape(B, t.shape[1], 1, C)
+            .permute(0, 2, 1, 3)
+            .reshape(B * 1, t.shape[1], C)
+            .contiguous(),
+            (q, k, v)
+        )
+        out = xformers.ops.memory_efficient_attention(
+            q, k, v, attn_bias=None, op=self.attention_op
+        )
+
+        out = (  # same singleton-head round trip on the output; final shape is (B, tokens, C)
+            out.unsqueeze(0)
+            .reshape(B, 1, out.shape[1], C)
+            .permute(0, 2, 1, 3)
+            .reshape(B, out.shape[1], C)
+        )
+        return rearrange(out, "b (h w) c -> b c h w", b=B, h=H, w=W, c=C)
+
+    def forward(self, x, **kwargs):  # **kwargs are accepted but unused
+        h_ = x
+        h_ = self.attention(h_)
+        h_ = self.proj_out(h_)
+        return x + h_  # residual connection
+
+
+class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention):
+    def forward(self, x, context=None, mask=None, **unused_kwargs):
+        # Attend over flattened spatial tokens, then add the residual in the input's own (b, c, h, w) layout.
+        _b, c, h, w = x.shape
+        tokens = rearrange(x, "b c h w -> b (h w) c")
+        out = super().forward(tokens, context=context, mask=mask)
+        return x + rearrange(out, "b (h w) c -> b c h w", h=h, w=w, c=c)
+
+
+def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
+    """
+    Build the attention block selected by `attn_type` for `in_channels` channels.
+
+    `attn_kwargs` is only used by "memory-efficient-cross-attn" (extra constructor
+    arguments; `query_dim` is always set to `in_channels`). On torch < 2.0 every
+    type except "none" is replaced by "vanilla-xformers", which requires xformers.
+    """
+    known_types = ("vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none")
+    assert attn_type in known_types, f"attn_type `{attn_type}` unknown"
+    if version.parse(torch.__version__) < version.parse("2.0.0") and attn_type != "none":
+        assert XFORMERS_IS_AVAILABLE, (
+            f"We do not support vanilla attention in {torch.__version__} anymore as it is too expensive, "
+            f"please install xformers via e.g. `pip install xformers==0.0.16`"
+        )
+        attn_type = "vanilla-xformers"
+    print(f"Making attention of type `{attn_type}` with {in_channels} in_channels")
+    if attn_type == "vanilla":
+        assert attn_kwargs is None
+        return AttnBlock(in_channels)
+    if attn_type == "vanilla-xformers":
+        print(f"Building MemoryEfficientAttnBlock with {in_channels} in_channels...")
+        return MemoryEfficientAttnBlock(in_channels)
+    if attn_type == "memory-efficient-cross-attn":  # must test attn_type; the builtin `type` never equals a str
+        return MemoryEfficientCrossAttentionWrapper(**{**(attn_kwargs or {}), "query_dim": in_channels})
+    if attn_type == "none":
+        return nn.Identity(in_channels)
+    return LinAttnBlock(in_channels)
+
+
+class Model(nn.Module):  # downsampling levels -> middle -> upsampling levels, with skip concatenation between the two
+    def __init__(
+            self,
+            *,
+            ch,
+            out_ch,
+            ch_mult=(1, 2, 4, 8),
+            num_res_blocks,
+            attn_resolutions,
+            dropout=0.0,
+            resamp_with_conv=True,
+            in_channels,
+            resolution,
+            use_timestep=True,
+            use_linear_attn=False,
+            attn_type="vanilla"
+    ):
+        super().__init__()
+        if use_linear_attn:
+            attn_type = "linear"  # use_linear_attn overrides whatever attn_type was passed
+        self.ch = ch
+        self.temb_ch = ch * 4
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        self.use_timestep = use_timestep
+        if self.use_timestep:
+            # timestep embedding
+            self.temb = nn.Module()
+            self.temb.dense = nn.ModuleList(
+                [
+                    nn.Linear(self.ch, self.temb_ch),
+                    nn.Linear(self.temb_ch, self.temb_ch)
+                ]
+            )
+
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)  # in_ch_mult[i] is the input-channel multiplier of level i
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:  # one attention module per res block at the listed resolutions
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:  # the last level has no downsample
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout
+        )
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout
+        )
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            skip_in = ch * ch_mult[i_level]  # channels of the skip tensor concatenated in forward()
+            for i_block in range(self.num_res_blocks + 1):
+                if i_block == self.num_res_blocks:  # the extra, last block takes the skip from the level's input
+                    skip_in = ch * in_ch_mult[i_level]
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in + skip_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:  # level 0 has no upsample
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x, t=None, context=None):
+        # assert x.shape[2] == x.shape[3] == self.resolution
+        if context is not None:
+            # assume aligned context, cat along channel axis
+            x = torch.cat((x, context), dim=1)
+        if self.use_timestep:
+            # timestep embedding
+            assert t is not None
+            temb = get_timestep_embedding(t, self.ch)
+            temb = self.temb.dense[0](temb)
+            temb = nonlinearity(temb)
+            temb = self.temb.dense[1](temb)
+        else:
+            temb = None
+
+        # downsampling
+        hs = [self.conv_in(x)]  # every activation is kept; the up path pops them as skip inputs
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](
+                    torch.cat((h, hs.pop()), dim=1), temb
+                )
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+    def get_last_layer(self):
+        return self.conv_out.weight  # weight tensor of the final convolution
+
+
+class Encoder(nn.Module):  # conv_in -> downsampling levels -> middle -> norm/swish/conv_out; no timestep embedding
+    def __init__(
+            self,
+            *,
+            ch,
+            out_ch,
+            ch_mult=(1, 2, 4, 8),
+            num_res_blocks,
+            attn_resolutions,
+            dropout=0.0,
+            resamp_with_conv=True,
+            in_channels,
+            resolution,
+            z_channels,
+            double_z=True,
+            use_linear_attn=False,
+            attn_type="vanilla",
+            **ignore_kwargs
+    ):
+        super().__init__()
+        if use_linear_attn:
+            attn_type = "linear"  # use_linear_attn overrides whatever attn_type was passed
+        self.ch = ch
+        self.temb_ch = 0  # ResnetBlock builds no temb_proj when temb_channels is 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)  # in_ch_mult[i] is the input-channel multiplier of level i
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:  # one attention module per res block at the listed resolutions
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:  # the last level has no downsample
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout
+        )
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout
+        )
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = nn.Conv2d(
+            block_in,
+            2 * z_channels if double_z else z_channels,  # presumably mean + log-variance when doubled - confirm
+            kernel_size=3,
+            stride=1,
+            padding=1
+        )
+
+    def forward(self, x):
+        # timestep embedding
+        temb = None
+
+        # downsampling
+        hs = [self.conv_in(x)]  # only hs[-1] is ever read in this class
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):  # conv_in on the latent z -> middle -> upsampling levels -> norm/swish/conv_out
+    def __init__(
+            self,
+            *,
+            ch,
+            out_ch,
+            ch_mult=(1, 2, 4, 8),
+            num_res_blocks,
+            attn_resolutions,
+            dropout=0.0,
+            resamp_with_conv=True,
+            in_channels,
+            resolution,
+            z_channels,
+            give_pre_end=False,
+            tanh_out=False,
+            use_linear_attn=False,
+            attn_type="vanilla",
+            **ignorekwargs
+    ):
+        super().__init__()
+        if use_linear_attn:
+            attn_type = "linear"  # use_linear_attn overrides whatever attn_type was passed
+        self.ch = ch
+        self.temb_ch = 0  # ResnetBlock builds no temb_proj when temb_channels is 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        self.tanh_out = tanh_out
+
+        # compute block_in and curr_res at lowest res; no in_ch_mult tuple is needed
+        # here because the up blocks below take no skip inputs
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        z_shape = (1, z_channels, curr_res, curr_res)
+        print(f"Working with z of shape {z_shape} = {np.prod(z_shape)} dimensions")
+
+        # factories are looked up through methods so subclasses can swap the building blocks
+        make_attn_cls = self._make_attn()
+        make_resblock_cls = self._make_resblock()
+        make_conv_cls = self._make_conv()
+
+        # z to block_in
+        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = make_resblock_cls(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout
+        )
+        self.mid.attn_1 = make_attn_cls(block_in, attn_type=attn_type)
+        self.mid.block_2 = make_resblock_cls(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout
+        )
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(
+                    make_resblock_cls(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn_cls(block_in, attn_type=attn_type))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:  # level 0 has no upsample
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = make_conv_cls(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+
+    def _make_attn(self) -> Callable:
+        return make_attn
+
+    def _make_resblock(self) -> Callable:
+        return ResnetBlock
+
+    def _make_conv(self) -> Callable:
+        return nn.Conv2d
+
+    def get_last_layer(self, **kwargs):
+        return self.conv_out.weight  # weight tensor of the final convolution; **kwargs are unused
+
+    def forward(self, z, **kwargs):  # **kwargs go to every block; the default ResnetBlock/Conv2d accept none
+        # timestep embedding
+        temb = None
+
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        h = self.mid.block_1(h, temb, **kwargs)
+        h = self.mid.attn_1(h, **kwargs)
+        h = self.mid.block_2(h, temb, **kwargs)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h, temb, **kwargs)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h, **kwargs)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        if self.give_pre_end:  # return the features before the output norm and convolution
+            return h
+        else:
+            h = self.norm_out(h)
+            h = nonlinearity(h)
+            h = self.conv_out(h, **kwargs)
+            if self.tanh_out:
+                h = torch.tanh(h)
+            return h
diff --git a/vista/vwm/modules/diffusionmodules/openaimodel.py b/vista/vwm/modules/diffusionmodules/openaimodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd1a364837a4a2a58fc5b42671e19a20db311851
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/openaimodel.py
@@ -0,0 +1,289 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import Iterable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.utils.checkpoint import checkpoint
+
+from ...modules.attention import SpatialTransformer
+from ...modules.video_attention import SpatialVideoTransformer
+from .util import avg_pool_nd, conv_nd, linear, normalization, timestep_embedding, zero_module
+
+
+class TimestepBlock(nn.Module):
+    """Any module where forward() takes timestep embeddings as a second argument."""
+
+    @abstractmethod  # marker only: the class does not use ABCMeta, so instantiation is not blocked
+    def forward(self, x: torch.Tensor, emb: torch.Tensor):
+        """Apply the module to `x` given `emb` timestep embeddings."""
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    """A sequential module that passes timestep embeddings to the children that support it as an extra input."""
+
+    def forward(
+            self,
+            x: torch.Tensor,
+            emb: torch.Tensor,
+            context: Optional[torch.Tensor] = None,
+            time_context: Optional[int] = None,
+            num_frames: Optional[int] = None
+    ):
+        from .video_model import VideoResBlock  # function-level import; presumably avoids a circular import - confirm
+
+        # The first matching isinstance branch wins, so the order of the checks is significant
+        for layer in self:
+            if isinstance(layer, VideoResBlock):
+                x = layer(x, emb, num_frames)
+            elif isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+            elif isinstance(layer, SpatialVideoTransformer):
+                x = layer(x, context, time_context, num_frames)
+            elif isinstance(layer, SpatialTransformer):
+                x = layer(x, context)
+            else:
+                x = layer(x)
+        return x
+
+
+class Upsample(nn.Module):
+    """
+    Nearest-neighbour upsampling by `scale_factor`, optionally followed by a convolution.
+
+    :param channels: channels in the inputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, the inner-two dimensions are
+        upsampled and the first of the three only when `third_up` is set.
+    :param out_channels: channels produced by the convolution; defaults to `channels`.
+    :param padding: padding passed to the convolution.
+    :param third_up: for 3D signals, also scale the first of the three signal dimensions.
+    :param kernel_size: kernel size passed to the convolution.
+    :param scale_factor: integer factor applied to each upsampled dimension.
+    """
+
+    def __init__(
+            self,
+            channels: int,
+            use_conv: bool,
+            dims: int = 2,
+            out_channels: Optional[int] = None,
+            padding: int = 1,
+            third_up: bool = False,
+            kernel_size: int = 3,
+            scale_factor: int = 2
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        self.third_up = third_up
+        self.scale_factor = scale_factor
+        # the convolution only exists when requested; forward() checks `use_conv` before using it
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, kernel_size, padding=padding)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.shape[1] == self.channels
+        scale = self.scale_factor
+        if self.dims == 3:
+            # explicit output size: the last two axes are always scaled, axis 2 only with third_up
+            depth_scale = scale if self.third_up else 1
+            target_size = (depth_scale * x.shape[2], x.shape[3] * scale, x.shape[4] * scale)
+            x = F.interpolate(x, target_size, mode="nearest")
+        else:
+            x = F.interpolate(x, scale_factor=scale, mode="nearest")
+        return self.conv(x) if self.use_conv else x
+
+
+class Downsample(nn.Module):
+    """
+    A downsampling layer: a strided convolution when `use_conv` is set, average pooling otherwise.
+
+    :param channels: channels in the inputs (equal to the outputs unless `out_channels` is given).
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(
+            self,
+            channels: int,
+            use_conv: bool,
+            dims: int = 2,
+            out_channels: Optional[int] = None,
+            padding: int = 1,
+            third_down: bool = False
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        # 3D signals keep stride 1 on the first of their three dims unless third_down is set
+        stride = 2 if dims != 3 else ((1, 2, 2) if not third_down else (2, 2, 2))
+        if use_conv:
+            print(f"Building a downsample layer with {dims} dims")
+            print(
+                f"Settings are: \n in-chn: {self.channels}, out-chn: {self.out_channels}, "
+                f"kernel-size: 3, stride: {stride}, padding: {padding}"
+            )
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=padding)
+        else:
+            assert self.channels == self.out_channels  # the pooling branch is given no way to change channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.shape[1] == self.channels
+        return self.op(x)
+
+
+class ResBlock(TimestepBlock):
+    """
+    A residual block that can optionally change the number of channels.
+
+    :param channels: the number of input channels.
+    :param emb_channels: the number of timestep embedding channels.
+    :param dropout: the rate of dropout.
+    :param out_channels: if specified, the number of out channels (else `channels` is used).
+    :param use_conv: if True and the channel count changes, the skip connection uses a
+        `kernel_size` convolution instead of a smaller 1x1 convolution.
+    :param use_scale_shift_norm: if True, the embedding is split into a scale and a shift.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param use_checkpoint: if True, use gradient checkpointing on this module.
+    :param up, down: if True, resample with Upsample / Downsample inside the block.
+    :param kernel_size, causal: passed to `conv_nd` for the in/out convolutions.
+    """
+    # exchange_temb_dims: swap dims 1 and 2 of the embedding before adding it; skip_t_emb: ignore `emb`.
+    def __init__(
+            self,
+            channels: int,
+            emb_channels: int,
+            dropout: float,
+            out_channels: Optional[int] = None,
+            use_conv: bool = False,
+            use_scale_shift_norm: bool = False,
+            dims: int = 2,
+            use_checkpoint: bool = False,
+            up: bool = False,
+            down: bool = False,
+            kernel_size: int = 3,
+            exchange_temb_dims: bool = False,
+            skip_t_emb: bool = False,
+            causal: bool = False
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.dropout = dropout
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.exchange_temb_dims = exchange_temb_dims
+        # padding is half the kernel size, computed per axis when kernel_size is iterable
+        if isinstance(kernel_size, Iterable):
+            padding = [k // 2 for k in kernel_size]
+        else:
+            padding = kernel_size // 2
+
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding, causal=causal)
+        )
+
+        self.updown = up or down
+        # when resampling, the main path (h) and the skip path (x) get separate resampling modules
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+        # the embedding projection is twice as wide when it must supply both scale and shift
+        self.skip_t_emb = skip_t_emb
+        self.emb_out_channels = (
+            2 * self.out_channels if use_scale_shift_norm else self.out_channels
+        )
+        if self.skip_t_emb:
+            print(f"Skipping timestep embedding in {self.__class__.__name__}")
+            assert not self.use_scale_shift_norm
+            self.emb_layers = None
+            self.exchange_temb_dims = False
+        else:
+            self.emb_layers = nn.Sequential(
+                nn.SiLU(),
+                linear(emb_channels, self.emb_out_channels)
+            )
+
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(
+                conv_nd(dims, self.out_channels, self.out_channels, kernel_size, padding=padding, causal=causal)
+            )
+        )
+        # skip connection: identity unless the channel count changes
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding)
+        else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+
+    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the block to a Tensor, conditioned on a timestep embedding.
+
+        :param x: an [N x C x ...] Tensor of features.
+        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
+
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x, emb)
+        else:
+            return self._forward(x, emb)
+
+    def _forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:  # body run by forward()
+        if self.updown:
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        # with skip_t_emb the embedding term is all zeros
+        if self.skip_t_emb:
+            emb_out = torch.zeros_like(h)
+        else:
+            emb_out = self.emb_layers(emb).type(h.dtype)
+        while len(emb_out.shape) < len(h.shape):  # append trailing singleton dims to match h
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            if self.exchange_temb_dims:
+                emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+
+
+class Timestep(nn.Module):
+    """Module form of `timestep_embedding` with a fixed embedding size `dim`."""
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        return timestep_embedding(t, self.dim)
diff --git a/vista/vwm/modules/diffusionmodules/sampling.py b/vista/vwm/modules/diffusionmodules/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fd89016fb8daf512bd9b83fcd8f10adcde92fa7
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/sampling.py
@@ -0,0 +1,152 @@
+"""Partially ported from https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py."""
+from __future__ import annotations
+
+from typing import Union
+
+import rerun as rr
+import torch
+from omegaconf import ListConfig, OmegaConf
+from tqdm import tqdm
+
+from ...util import append_dims, default, instantiate_from_config
+from .sampling_utils import to_d
+
+
+class BaseDiffusionSampler:
+    """Shared state and helpers for samplers: sigma discretization, guider and loop setup."""
+    def __init__(
+        self,
+        discretization_config: Union[dict, ListConfig, OmegaConf],
+        num_steps: Union[int, None] = None,
+        guider_config: Union[dict, ListConfig, OmegaConf, None] = None,
+        verbose: bool = False,
+        device: str = "cuda",
+    ):
+        self.num_steps = num_steps
+        self.discretization = instantiate_from_config(discretization_config)
+        self.guider = instantiate_from_config(guider_config)
+        self.verbose = verbose
+        self.device = device
+
+    def prepare_sampling_loop(self, x, cond, uc=None, num_steps=None):
+        """Return `(x, s_in, sigmas, num_sigmas, cond, uc)`; `x` is scaled by `sqrt(1 + sigmas[0]**2)`."""
+        steps = self.num_steps if num_steps is None else num_steps
+        sigmas = self.discretization(steps, device=self.device)
+        uc = default(uc, cond)
+        # Scale out of place so the tensor passed in by the caller is left untouched.
+        x = x * torch.sqrt(1.0 + sigmas[0] ** 2)
+        s_in = x.new_ones([x.shape[0]])  # one entry per batch element
+        return x, s_in, sigmas, len(sigmas), cond, uc
+
+    def denoise(self, x, denoiser, sigma, cond, cond_mask, uc):
+        """Run `denoiser` on the guider-prepared inputs, then pass the result through the guider."""
+        denoised = denoiser(*self.guider.prepare_inputs(x, sigma, cond, cond_mask, uc))
+        return self.guider(denoised, sigma)
+
+    def get_sigma_gen(self, num_sigmas):
+        """Return an iterable over the `num_sigmas - 1` step indices (a tqdm bar when verbose)."""
+        sigma_generator = range(num_sigmas - 1)
+        if self.verbose:
+            print("#" * 30, " Sampling Setting ", "#" * 30)
+            print(f"Sampler: {self.__class__.__name__}")
+            print(f"Discretization: {self.discretization.__class__.__name__}")
+            print(f"Guider: {self.guider.__class__.__name__}")
+            sigma_generator = tqdm(
+                sigma_generator,
+                total=num_sigmas - 1,  # the range above yields num_sigmas - 1 indices
+                desc=f"Sampling with {self.__class__.__name__} for {num_sigmas - 1} steps",
+            )
+        return sigma_generator
+
+
+class SingleStepDiffusionSampler(BaseDiffusionSampler):
+    """Sampler base that advances one sigma per call; subclasses supply `sampler_step`."""
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc, *args, **kwargs):
+        raise NotImplementedError
+    def euler_step(self, x, d, dt):
+        return x + dt * d  # one explicit Euler update
+
+
+class EulerEDMSampler(SingleStepDiffusionSampler):
+    """
+    Euler sampler with optional noise injection ("churn") before each step.
+
+    :param s_churn: total churn; each step uses `min(s_churn / (num_sigmas - 1), sqrt(2) - 1)`.
+    :param s_tmin: lowest sigma at which churn is applied.
+    :param s_tmax: highest sigma at which churn is applied.
+    :param s_noise: scale of the injected noise.
+    """
+
+    def __init__(
+        self, s_churn=0.0, s_tmin=0.0, s_tmax=float("inf"), s_noise=1.0, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.s_churn = s_churn
+        self.s_tmin = s_tmin
+        self.s_tmax = s_tmax
+        self.s_noise = s_noise
+
+    def sampler_step(
+        self, sigma, next_sigma, denoiser, x, cond, cond_mask=None, uc=None, gamma=0.0
+    ):
+        """Take one Euler step from `sigma` to `next_sigma`; `gamma > 0` first adds noise to `x`."""
+        sigma_hat = sigma * (gamma + 1.0)
+        if gamma > 0:
+            eps = torch.randn_like(x) * self.s_noise
+            x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
+
+        denoised = self.denoise(x, denoiser, sigma_hat, cond, cond_mask, uc)
+        d = to_d(x, sigma_hat, denoised)
+        dt = append_dims(next_sigma - sigma_hat, x.ndim)
+        return self.euler_step(x, d, dt)
+
+    def __call__(
+        self,
+        denoiser,
+        x,  # x is randn
+        cond,
+        uc=None,
+        cond_frame=None,
+        cond_mask=None,
+        num_steps=None,
+        num_sequence=0,
+        log_queue=None,
+    ):
+        """
+        Run the sampling loop and return the final `x`.
+
+        If `cond_mask` has any non-zero entry, the masked part of `x` is overwritten with
+        `cond_frame` before every step and once more after the last one.
+
+        :param num_sequence: used in the logged name `diffusion_{num_sequence}` and "combined" value.
+        :param log_queue: optional queue; when given, `x` after each step is put on it as
+            `(name, rr.Tensor, list of (key, value) pairs)`. `None` disables logging.
+        """
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(x, cond, uc, num_steps)
+        replace_cond_frames = cond_mask is not None and cond_mask.any()
+
+        for i in tqdm(self.get_sigma_gen(num_sigmas), "Diffusion steps"):
+            if replace_cond_frames:
+                x = x * append_dims(1 - cond_mask, x.ndim) + cond_frame * append_dims(
+                    cond_mask, cond_frame.ndim
+                )
+            in_churn_range = self.s_tmin <= sigmas[i] <= self.s_tmax
+            gamma = min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1) if in_churn_range else 0.0
+            x = self.sampler_step(
+                s_in * sigmas[i], s_in * sigmas[i + 1], denoiser, x, cond, cond_mask, uc, gamma
+            )
+
+            # log_queue defaults to None, so only log when the caller supplied a queue.
+            if log_queue is not None:
+                entries = [
+                    ("frame_id", 0),
+                    ("diffusion", i),
+                    ("combined", 2 * num_sequence + (i * 1.0 / num_sigmas)),
+                ]
+                log_queue.put((f"diffusion_{num_sequence}", rr.Tensor(x.numpy(force=True)), entries))
+
+        if replace_cond_frames:
+            x = x * append_dims(1 - cond_mask, x.ndim) + cond_frame * append_dims(
+                cond_mask, cond_frame.ndim
+            )
+        return x
diff --git a/vista/vwm/modules/diffusionmodules/sampling_utils.py b/vista/vwm/modules/diffusionmodules/sampling_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7021a95991470ac762ec97a41778d5233cce2657
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/sampling_utils.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import torch
+from scipy import integrate
+
+from ...util import append_dims
+
+
+def apply_cfg_with_rescale(pos, neg, scale, rescale=0.7):
+    """Classifier-free guidance whose result is rescaled towards the std of `pos` (over dims 1-3)."""
+    reduce_dims = [1, 2, 3]
+    # regular classifier-free guidance
+    guided = neg + scale * (pos - neg)
+    # ratio of the standard deviations before and after guidance
+    ratio = pos.std(reduce_dims, keepdim=True) / guided.std(reduce_dims, keepdim=True)
+    # blend that ratio with 1 according to `rescale`
+    blend = rescale * ratio + (1.0 - rescale)
+    return guided * blend
+
+
+def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
+    """Integrate the j-th Lagrange basis polynomial (nodes t[i], ..., t[i - order + 1]) over [t[i], t[i + 1]]."""
+    if order - 1 > i:
+        raise ValueError(f"Order {order} too high for step {i}")
+    others = [k for k in range(order) if k != j]
+
+    def basis(tau):
+        value = 1.0
+        for k in others:
+            value *= (tau - t[i - k]) / (t[i - j] - t[i - k])
+        return value
+
+    return integrate.quad(basis, t[i], t[i + 1], epsrel=epsrel)[0]
+
+
+def get_ancestral_step(sigma_from, sigma_to, eta=1.0):
+    """Split the step to `sigma_to` into `(sigma_down, sigma_up)`; a falsy `eta` gives `(sigma_to, 0.0)`."""
+    if not eta:
+        return sigma_to, 0.0
+    var_from, var_to = sigma_from ** 2, sigma_to ** 2
+    noise_cap = eta * (var_to * (var_from - var_to) / var_from) ** 0.5
+    # sigma_up is capped at sigma_to
+    sigma_up = torch.minimum(sigma_to, noise_cap)
+    sigma_down = (var_to - sigma_up ** 2) ** 0.5
+    return sigma_down, sigma_up
+
+
+def to_d(x, sigma, denoised):  # difference between x and the denoised estimate, divided by sigma
+    return (x - denoised) / append_dims(sigma, x.ndim)  # sigma goes through append_dims(…, x.ndim) first
+
+
+def to_neg_log_sigma(sigma):  # maps sigma to -log(sigma)
+    return -sigma.log()
+
+
+def to_sigma(neg_log_sigma):  # maps -log(sigma) back to sigma
+    return (-neg_log_sigma).exp()
diff --git a/vista/vwm/modules/diffusionmodules/sigma_sampling.py b/vista/vwm/modules/diffusionmodules/sigma_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c79ae9881627386aafa3ae616c9b7398ed520cf
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/sigma_sampling.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+import torch
+from einops import repeat
+from vwm.util import default, instantiate_from_config
+
+
+class EDMSampling:
+    """Draw sigmas as `exp(p_mean + p_std * z)` with one normal `z` per clip, repeated for its frames."""
+    def __init__(self, p_mean=-1.2, p_std=1.2, num_frames=25):
+        self.p_mean, self.p_std, self.num_frames = p_mean, p_std, num_frames
+
+    def __call__(self, n_samples, rand=None):
+        """Return sigmas for `n_samples` frames; `rand` is routed through `default` in place of fresh draws."""
+        num_clips = n_samples // self.num_frames
+        # the normal draw is made unconditionally, even when `rand` is supplied
+        per_clip = torch.randn((num_clips,))[..., None]
+        per_frame = repeat(per_clip, "b 1 -> (b t)", t=self.num_frames)
+        noise = default(rand, per_frame)
+        return (self.p_mean + self.p_std * noise).exp()
+
+
+class DiscreteSampling:
+    """Pick sigmas from a precomputed table of `num_idx` entries, one random index per clip."""
+    def __init__(self, discretization_config, num_idx, do_append_zero=False, flip=True, num_frames=25):
+        discretization = instantiate_from_config(discretization_config)
+        self.num_idx, self.num_frames = num_idx, num_frames
+        self.sigmas = discretization(num_idx, do_append_zero=do_append_zero, flip=flip)
+
+    def idx_to_sigma(self, idx):
+        """Look up the sigma(s) stored at `idx`."""
+        return self.sigmas[idx]
+
+    def __call__(self, n_samples, rand=None):
+        """Return sigmas for `n_samples` frames; `rand` is routed through `default` in place of random indices."""
+        num_clips = n_samples // self.num_frames
+        per_clip = torch.randint(0, self.num_idx, (num_clips,))[..., None]
+        per_frame = repeat(per_clip, "b 1 -> (b t)", t=self.num_frames)
+        return self.idx_to_sigma(default(rand, per_frame))
diff --git a/vista/vwm/modules/diffusionmodules/util.py b/vista/vwm/modules/diffusionmodules/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..11eaa5e1aa07cb21bb970007cff7b391c8dfad34
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/util.py
@@ -0,0 +1,307 @@
+"""
+Partially adopted from
+https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+and
+https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+and
+https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py.
+"""
+from __future__ import annotations
+
+import math
+from typing import Iterable
+
+import torch
+import torch.fft as fft  # differentiable
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+
+
+def fourier_filter(x, scale, d_s=0.25):
+    """
+    Scale the centred 2D frequencies of `x` whose squared radius `fh**2 + fw**2` is <= `2 * d_s`.
+
+    The mask is built on `x.device`, so CPU tensors and non-default GPUs work as well.
+    :param x: tensor with four dimensions (unpacked as B, C, H, W); the last two are transformed.
+    :param scale: multiplier applied to the selected frequencies.
+    :return: the real part of the inverse transform, cast back to the dtype of `x`.
+    """
+    dtype = x.dtype
+    x = x.type(torch.float32)
+    # FFT, shifted so that the zero frequency sits in the middle of the grid
+    x_freq = fft.fftshift(fft.fftn(x, dim=(-2, -1)), dim=(-2, -1))
+
+    _, _, H, W = x_freq.shape
+    # per-axis coordinates 2 * k / N - 1, squared; float64 mirrors Python-float scalar arithmetic
+    h_sq = (2 * torch.arange(H, dtype=torch.float64, device=x.device) / H - 1) ** 2
+    w_sq = (2 * torch.arange(W, dtype=torch.float64, device=x.device) / W - 1) ** 2
+    mask = torch.ones((H, W), device=x.device)  # broadcasts over the batch and channel dims
+    mask[h_sq[:, None] + w_sq[None, :] <= 2 * d_s] = scale
+
+    # IFFT
+    x_freq = fft.ifftshift(x_freq * mask, dim=(-2, -1))
+    return fft.ifftn(x_freq, dim=(-2, -1)).real.type(dtype)
+
+
+def fourier_filter_3d(x, scale, num_frames, d_s=0.25, d_t=0.25):
+    """
+    Spatio-temporal Fourier filter for `x` laid out as `(b t) c h w` with `t == num_frames`.
+
+    Centred frequencies with `(d_s / d_t * ft)**2 + fh**2 + fw**2 <= 2 * d_s` are multiplied by
+    `scale`, where ft, fh, fw are the per-axis coordinates `2 * k / N - 1`. The mask is created
+    on `x.device` rather than on the current CUDA device. Output has the shape and dtype of `x`.
+    """
+    dtype = x.dtype
+    x = x.type(torch.float32)
+    x_ = rearrange(x, "(b t) c h w -> b c t h w", t=num_frames)
+    # FFT, shifted so that the zero frequency sits in the middle of each transformed axis
+    x_freq = fft.fftshift(fft.fftn(x_, dim=(-3, -2, -1)), dim=(-3, -2, -1))
+
+    _, _, T, H, W = x_freq.shape
+    # float64 mirrors the Python-float arithmetic of an element-by-element computation
+    opts = {"dtype": torch.float64, "device": x.device}
+    t_sq = (d_s / d_t * (2 * torch.arange(T, **opts) / T - 1)) ** 2
+    h_sq = (2 * torch.arange(H, **opts) / H - 1) ** 2
+    w_sq = (2 * torch.arange(W, **opts) / W - 1) ** 2
+    mask = torch.ones((T, H, W), device=x.device)  # broadcasts over the batch and channel dims
+    mask[t_sq[:, None, None] + h_sq[None, :, None] + w_sq[None, None, :] <= 2 * d_s] = scale
+
+    # IFFT
+    x_freq = fft.ifftshift(x_freq * mask, dim=(-3, -2, -1))
+    x_filtered = fft.ifftn(x_freq, dim=(-3, -2, -1)).real
+    x_filtered = rearrange(x_filtered, "b c t h w -> (b t) c h w")
+    return x_filtered.type(dtype)
+
+
+def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2):
+    """Return `n_timestep` float64 betas as a NumPy array for the "linear" or "scaled_linear" schedule."""
+    if schedule not in ("linear", "scaled_linear"):
+        raise NotImplementedError(f"Unknown schedule: {schedule}")
+    if schedule == "linear":
+        return torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64).numpy()
+    # "scaled_linear": interpolate between the square roots, then square
+    roots = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64)
+    return (roots ** 2).numpy()
+
+
+def checkpoint(func, inputs, params, flag):
+    """
+    Call `func(*inputs)`, optionally through `CheckpointFunction` so that intermediate
+    activations are recomputed in the backward pass instead of being kept in memory.
+
+    :param func: the function to evaluate.
+    :param inputs: the argument sequence to pass to `func`.
+    :param params: a sequence of parameters `func` depends on but does not explicitly take as arguments.
+    :param flag: if False, disable gradient checkpointing.
+
+    :return: whatever `func` returns.
+    """
+    if not flag:
+        return func(*inputs)
+    flat_args = (*inputs, *params)
+    return CheckpointFunction.apply(func, len(inputs), *flat_args)
+
+
+class CheckpointFunction(torch.autograd.Function):  # autograd function used by checkpoint()
+    @staticmethod
+    def forward(ctx, run_function, length, *args):
+        ctx.run_function = run_function
+        ctx.input_tensors = list(args[:length])  # the first `length` args are the function inputs
+        ctx.input_params = list(args[length:])  # the remaining args are the extra parameters
+        ctx.gpu_autocast_kwargs = {  # autocast state at forward time, reused in backward
+            "enabled": torch.is_autocast_enabled(),
+            "dtype": torch.get_autocast_gpu_dtype(),
+            "cache_enabled": torch.is_autocast_cache_enabled()
+        }
+        with torch.no_grad():  # the forward pass itself records no graph
+            output_tensors = ctx.run_function(*ctx.input_tensors)
+        return output_tensors
+    # backward re-runs run_function with grad enabled and differentiates that second pass
+    @staticmethod
+    def backward(ctx, *output_grads):
+        ctx.input_params = [x.requires_grad_(True) for x in ctx.input_params]
+        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+        with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
+            # Pass views instead of the detached tensors themselves: if the first op in
+            # run_function works in place, doing so on a detach()'d Tensor is not allowed.
+            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
+            output_tensors = ctx.run_function(*shallow_copies)
+        input_grads = torch.autograd.grad(
+            output_tensors,
+            ctx.input_tensors + ctx.input_params,
+            output_grads,
+            allow_unused=True
+        )
+        del ctx.input_tensors
+        del ctx.input_params
+        del output_tensors
+        return (None, None) + input_grads  # the two Nones line up with run_function and length
+
+
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+    """
+    Create sinusoidal timestep embeddings.
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :param repeat_only: if True, skip the sinusoids and repeat each timestep `dim` times.
+
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+
+    if repeat_only:
+        return repeat(timesteps, "b -> b d", d=dim)
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+    ).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat((torch.cos(args), torch.sin(args)), dim=-1)
+    if dim % 2:
+        # Pad odd sizes with one zero column. Build it from the batch size instead of slicing
+        # `embedding`, which has no columns to slice when dim == 1 (half == 0).
+        embedding = torch.cat((embedding, embedding.new_zeros(embedding.shape[0], 1)), dim=-1)
+    return embedding
+
+
+def zero_module(module):
+    """Set every parameter of `module` to zero in place and hand the module back."""
+    with torch.no_grad():
+        for weight in module.parameters():
+            weight.zero_()
+    return module
+
+
+def scale_module(module, scale):
+    """Multiply every parameter of `module` by `scale` in place and hand the module back."""
+    with torch.no_grad():
+        for weight in module.parameters():
+            weight.mul_(scale)
+    return module
+
+
+def mean_flat(tensor):
+    """Average `tensor` over every dimension except the first (batch) one."""
+    non_batch_dims = list(range(1, tensor.dim()))
+    return tensor.mean(dim=non_batch_dims)
+
+
+def normalization(channels):
+    """
+    Build the standard normalization layer: a `GroupNorm32` with 32 groups.
+
+    :param channels: number of input channels.
+
+    :return: nn.Module for normalization.
+    """
+    num_groups = 32
+    return GroupNorm32(num_groups, channels)
+
+
+class SiLU(nn.Module):
+    """SiLU activation `x * sigmoid(x)`; PyTorch 1.7 has SiLU, but we support PyTorch 1.5."""
+    def forward(self, x):
+        return torch.sigmoid(x) * x
+
+
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):  # normalise in float32, then cast back to the input dtype
+        return super().forward(x.float()).to(dtype=x.dtype)
+
+
+class CausalConv3d(nn.Conv3d):
+    """3D convolution padded only at the front of the time axis; extra `**kwargs` are accepted but not forwarded."""
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, **kwargs):
+        super().__init__(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=0)
+
+        # make causal padding
+        assert isinstance(kernel_size, Iterable) and len(kernel_size) == 3 and kernel_size[-1] == kernel_size[-2]
+        k_t, k_s = kernel_size[0], kernel_size[-1]
+        # F.pad reads pairs starting from the last dimension: four spatial values, then (front, back) in time
+        self.causal_padding = (k_s // 2,) * 4 + (k_t - 1, 0)
+
+    def forward(self, x):
+        """Pad `x` with `causal_padding`, then apply the unpadded convolution."""
+        return super().forward(F.pad(x, self.causal_padding))
+
+
+def conv_nd(dims, *args, causal=False, **kwargs):
+    """
+    Create a 1D, 2D, or 3D convolution module.
+    `causal` only has an effect for `dims == 3`, where it selects CausalConv3d.
+    """
+    if dims == 3:
+        conv_cls = CausalConv3d if causal else nn.Conv3d
+    elif dims == 2:
+        conv_cls = nn.Conv2d
+    elif dims == 1:
+        conv_cls = nn.Conv1d
+    else:
+        raise ValueError(f"Unsupported dimensions: {dims}")
+    return conv_cls(*args, **kwargs)
+
+
+def linear(*args, **kwargs):
+    """Build an `nn.Linear`, forwarding all arguments unchanged."""
+    layer = nn.Linear(*args, **kwargs)
+    return layer
+
+
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D average pooling module.
+
+    :raises ValueError: if `dims` is not 1, 2 or 3.
+    """
+    candidates = ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d))
+    for ndim, pool_cls in candidates:
+        if dims == ndim:
+            return pool_cls(*args, **kwargs)
+    raise ValueError(f"Unsupported dimensions: {dims}")
+
+
+class AlphaBlender(nn.Module):
+    """Blend a spatial and a temporal tensor as `alpha * x_spatial + (1 - alpha) * x_temporal`."""
+
+    strategies = ["learned", "fixed", "learned_with_images"]
+
+    def __init__(
+            self,
+            alpha: float,
+            merge_strategy: str,
+            rearrange_pattern: str
+    ):
+        super().__init__()
+        self.merge_strategy = merge_strategy
+        self.rearrange_pattern = rearrange_pattern
+
+        assert merge_strategy in self.strategies, f"merge_strategy needs to be in {self.strategies}"
+
+        initial = torch.Tensor([alpha])
+        if merge_strategy == "fixed":
+            # constant mix factor: stored as a buffer
+            self.register_buffer("mix_factor", initial)
+        elif merge_strategy in ("learned", "learned_with_images"):
+            self.register_parameter("mix_factor", torch.nn.Parameter(initial))
+        else:
+            raise ValueError(f"Unknown merge strategy {self.merge_strategy}")
+
+    def get_alpha(self) -> torch.Tensor:
+        """Return the mix factor: raw if fixed, sigmoid-squashed if learned (reshaped for "learned_with_images")."""
+        strategy = self.merge_strategy
+        if strategy == "fixed":
+            return self.mix_factor
+        if strategy == "learned":
+            return torch.sigmoid(self.mix_factor)
+        if strategy == "learned_with_images":
+            squashed = rearrange(torch.sigmoid(self.mix_factor), "... -> ... 1")
+            return rearrange(squashed, self.rearrange_pattern)
+        raise NotImplementedError
+
+    def forward(self, x_spatial: torch.Tensor, x_temporal: torch.Tensor) -> torch.Tensor:
+        alpha = self.get_alpha()
+        out_dtype = x_spatial.dtype
+        return alpha.to(out_dtype) * x_spatial + (1.0 - alpha).to(out_dtype) * x_temporal
diff --git a/vista/vwm/modules/diffusionmodules/video_model.py b/vista/vwm/modules/diffusionmodules/video_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..980390f715e5780f7821516b6a930d65222a8594
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/video_model.py
@@ -0,0 +1,505 @@
+from __future__ import annotations
+
+from typing import Union
+
+from ...util import default, repeat_as_img_seq
+from ..video_attention import SpatialVideoTransformer
+from .openaimodel import *
+from .util import AlphaBlender
+
+
+class VideoResBlock(ResBlock):
+    """
+    ResBlock followed by a 3D `time_stack` ResBlock; both results are blended by an AlphaBlender.
+
+    :param video_kernel_size: kernel size of the temporal block's convolutions.
+    :param merge_strategy, merge_factor: forwarded to AlphaBlender (`merge_factor` as `alpha`).
+    """
+
+    def __init__(
+            self,
+            channels: int,
+            emb_channels: int,
+            dropout: float,
+            video_kernel_size: Union[int, list[int]] = 3,
+            merge_strategy: str = "fixed",
+            merge_factor: float = 0.5,
+            out_channels: Optional[int] = None,
+            use_conv: bool = False,
+            use_scale_shift_norm: bool = False,
+            dims: int = 2,
+            use_checkpoint: bool = False,
+            up: bool = False,
+            down: bool = False
+    ):
+        super().__init__(
+            channels,
+            emb_channels,
+            dropout,
+            out_channels=out_channels,
+            use_conv=use_conv,
+            use_scale_shift_norm=use_scale_shift_norm,
+            dims=dims,
+            use_checkpoint=use_checkpoint,
+            up=up,
+            down=down
+        )
+        # `default` presumably falls back to `channels` when out_channels is None
+        temporal_channels = default(out_channels, channels)
+        self.time_stack = ResBlock(
+            temporal_channels,
+            emb_channels,
+            dropout=dropout,
+            dims=3,
+            out_channels=temporal_channels,
+            use_scale_shift_norm=False,
+            use_conv=False,
+            up=False,
+            down=False,
+            kernel_size=video_kernel_size,
+            use_checkpoint=use_checkpoint,
+            exchange_temb_dims=True,
+            causal=False
+        )
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            rearrange_pattern="b t -> b 1 t 1 1"
+        )
+
+    def forward(self, x: torch.Tensor, emb: torch.Tensor, num_frames: int) -> torch.Tensor:
+        """Apply the spatial block, then blend in the temporal block run over `num_frames` frames."""
+        spatial = super().forward(x, emb)
+        spatial = rearrange(spatial, "(b t) c h w -> b c t h w", t=num_frames)
+        emb_per_frame = rearrange(emb, "(b t) ... -> b t ...", t=num_frames)
+        temporal = self.time_stack(spatial, emb_per_frame)
+        mixed = self.time_mixer(x_spatial=spatial, x_temporal=temporal)
+        return rearrange(mixed, "b c t h w -> (b t) c h w")
+
+
+class VideoUNet(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            model_channels: int,
+            out_channels: int,
+            num_res_blocks: int,
+            attention_resolutions: int,
+            dropout: float = 0.0,
+            channel_mult: list[int] = (1, 2, 4, 8),
+            conv_resample: bool = True,
+            dims: int = 2,
+            num_classes: Optional[int] = None,
+            use_checkpoint: bool = False,
+            num_heads: int = -1,
+            num_head_channels: int = -1,
+            num_heads_upsample: int = -1,
+            use_scale_shift_norm: bool = False,
+            resblock_updown: bool = False,
+            transformer_depth: Union[list[int], int] = 1,
+            transformer_depth_middle: Optional[int] = None,
+            context_dim: Optional[int] = None,
+            time_downup: bool = False,
+            time_context_dim: Optional[int] = None,
+            extra_ff_mix_layer: bool = False,
+            use_spatial_context: bool = False,
+            merge_strategy: str = "learned_with_images",
+            merge_factor: float = 0.5,
+            spatial_transformer_attn_type: str = "softmax",
+            video_kernel_size: Union[int, list[int]] = 3,
+            use_linear_in_transformer: bool = False,
+            adm_in_channels: Optional[int] = None,
+            disable_temporal_crossattention: bool = False,
+            max_ddpm_temb_period: int = 10000,
+            add_lora: bool = False,
+            action_control: bool = False
+    ):
+        super().__init__()
+        assert context_dim is not None
+
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+
+        if num_heads == -1:
+            assert num_head_channels != -1
+
+        if num_head_channels == -1:
+            assert num_heads != -1
+
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        transformer_depth_middle = default(
+            transformer_depth_middle, transformer_depth[-1]
+        )
+
+        self.num_res_blocks = num_res_blocks
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim)
+        )
+        self.cond_time_stack_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim)
+        )
+
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = nn.Sequential(
+                    Timestep(model_channels),
+                    nn.Sequential(
+                        linear(model_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim)
+                    )
+                )
+            elif self.num_classes == "sequential":  # this way
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim)
+                    )
+                )
+            else:
+                raise ValueError
+
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+
+        def get_attention_layer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=1,
+                context_dim=None,
+                use_checkpoint=False,
+                disabled_sa=False,
+                add_lora=False,
+                action_control=False
+        ):
+            return SpatialVideoTransformer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=depth,
+                context_dim=context_dim,
+                time_context_dim=time_context_dim,
+                dropout=dropout,
+                ff_in=extra_ff_mix_layer,
+                use_spatial_context=use_spatial_context,
+                merge_strategy=merge_strategy,
+                merge_factor=merge_factor,
+                use_checkpoint=use_checkpoint,
+                use_linear=use_linear_in_transformer,
+                attn_mode=spatial_transformer_attn_type,
+                disable_self_attn=disabled_sa,
+                disable_temporal_crossattention=disable_temporal_crossattention,
+                max_time_embed_period=max_ddpm_temb_period,
+                add_lora=add_lora,
+                action_control=action_control
+            )
+
+        def get_resblock(
+                merge_factor,
+                merge_strategy,
+                video_kernel_size,
+                ch,
+                time_embed_dim,
+                dropout,
+                out_ch,
+                dims,
+                use_checkpoint,
+                use_scale_shift_norm,
+                down=False,
+                up=False
+        ):
+            return VideoResBlock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                channels=ch,
+                emb_channels=time_embed_dim,
+                dropout=dropout,
+                out_channels=out_ch,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+                down=down,
+                up=up
+            )
+
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    get_resblock(
+                        merge_factor=merge_factor,
+                        merge_strategy=merge_strategy,
+                        video_kernel_size=video_kernel_size,
+                        ch=ch,
+                        time_embed_dim=time_embed_dim,
+                        dropout=dropout,
+                        out_ch=mult * model_channels,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    layers.append(
+                        get_attention_layer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth[level],
+                            context_dim=context_dim,
+                            use_checkpoint=use_checkpoint,
+                            disabled_sa=False,
+                            add_lora=add_lora,
+                            action_control=action_control
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                ds *= 2
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        get_resblock(
+                            merge_factor=merge_factor,
+                            merge_strategy=merge_strategy,
+                            video_kernel_size=video_kernel_size,
+                            ch=ch,
+                            time_embed_dim=time_embed_dim,
+                            dropout=dropout,
+                            out_ch=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch, third_down=time_downup)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+
+                self._feature_size += ch
+
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+
+        self.middle_block = TimestepEmbedSequential(
+            get_resblock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                ch=ch,
+                time_embed_dim=time_embed_dim,
+                out_ch=None,
+                dropout=dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm
+            ),
+            get_attention_layer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=transformer_depth_middle,
+                context_dim=context_dim,
+                use_checkpoint=use_checkpoint,
+                add_lora=add_lora,
+                action_control=action_control
+            ),
+            get_resblock(
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
+                video_kernel_size=video_kernel_size,
+                ch=ch,
+                out_ch=None,
+                time_embed_dim=time_embed_dim,
+                dropout=dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm
+            )
+        )
+        self._feature_size += ch
+
+        self.output_blocks = nn.ModuleList(list())
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    get_resblock(
+                        merge_factor=merge_factor,
+                        merge_strategy=merge_strategy,
+                        video_kernel_size=video_kernel_size,
+                        ch=ch + ich,
+                        time_embed_dim=time_embed_dim,
+                        dropout=dropout,
+                        out_ch=model_channels * mult,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+
+                    layers.append(
+                        get_attention_layer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth[level],
+                            context_dim=context_dim,
+                            use_checkpoint=use_checkpoint,
+                            disabled_sa=False,
+                            add_lora=add_lora,
+                            action_control=action_control
+                        )
+                    )
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    ds //= 2
+                    layers.append(
+                        get_resblock(
+                            merge_factor=merge_factor,
+                            merge_strategy=merge_strategy,
+                            video_kernel_size=video_kernel_size,
+                            ch=ch,
+                            time_embed_dim=time_embed_dim,
+                            dropout=dropout,
+                            out_ch=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch, third_up=time_downup)
+                    )
+
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(
+                conv_nd(dims, model_channels, out_channels, 3, padding=1)
+            )
+        )
+
+    def forward(
+            self,
+            x: torch.Tensor,
+            timesteps: torch.Tensor,
+            context: Optional[torch.Tensor] = None,
+            y: Optional[torch.Tensor] = None,
+            time_context: Optional[torch.Tensor] = None,
+            cond_mask: Optional[torch.Tensor] = None,
+            num_frames: Optional[int] = None
+    ):
+        """Embed the timesteps, then run the input blocks, the middle block and the output blocks.
+
+        Args:
+            x: input tensor; skip activations are concatenated to it along dim 1.
+            timesteps: values passed to `timestep_embedding`.
+            context: optional tensor forwarded to every block. With `num_frames > 1` and a batch
+                size different from `x`, it must have `x.shape[0] // num_frames` entries and is
+                expanded with `repeat_as_img_seq`.
+            y: input of `label_emb`; required if and only if `self.num_classes` is set.
+            time_context: forwarded unchanged to every block.
+            cond_mask: if any entry is set, mixes `cond_time_stack_embed` into the time embedding.
+            num_frames: forwarded to every block; `None` skips the per-frame expansion.
+
+        Returns the output of `self.out` applied to the last activation cast to `x.dtype`.
+        """
+        is_class_cond = self.num_classes is not None
+        assert (y is not None) == is_class_cond, "Must specify y if and only if the model is class-conditional"
+        multi_frame = num_frames is not None and num_frames > 1
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        if cond_mask is not None and cond_mask.any():
+            cond_mask_ = cond_mask[..., None].float()
+            emb = self.cond_time_stack_embed(t_emb) * cond_mask_ + self.time_embed(t_emb) * (1 - cond_mask_)
+        else:
+            emb = self.time_embed(t_emb)
+
+        # `context` and `num_frames` default to None, so guard before touching `.shape` or comparing.
+        if multi_frame and context is not None and context.shape[0] != x.shape[0]:
+            assert context.shape[0] == x.shape[0] // num_frames, f"{context.shape} {x.shape}"
+            context = repeat_as_img_seq(context, num_frames)
+
+        if is_class_cond:
+            if multi_frame and y.shape[0] != x.shape[0]:
+                assert y.shape[0] == x.shape[0] // num_frames, f"{y.shape} {x.shape}"
+                y = repeat_as_img_seq(y, num_frames)
+            emb = emb + self.label_emb(y)
+
+        block_kwargs = dict(context=context, time_context=time_context, num_frames=num_frames)
+        hs = list()
+        h = x
+        for module in self.input_blocks:
+            h = module(h, emb, **block_kwargs)
+            hs.append(h)
+
+        h = self.middle_block(h, emb, **block_kwargs)
+
+        for module in self.output_blocks:
+            # Consume the stored input-block activations in reverse order as skip connections.
+            h = torch.cat((h, hs.pop()), dim=1)
+            h = module(h, emb, **block_kwargs)
+
+        h = h.type(x.dtype)
+        return self.out(h)
diff --git a/vista/vwm/modules/diffusionmodules/wrappers.py b/vista/vwm/modules/diffusionmodules/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..98d08c5953704879720f2b35a0ea2bb3fdc9d2b5
--- /dev/null
+++ b/vista/vwm/modules/diffusionmodules/wrappers.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+from packaging import version
+
+from ...util import repeat_as_img_seq
+
+OPENAIUNETWRAPPER = "vista.vwm.modules.diffusionmodules.wrappers.OpenAIWrapper"
+
+
+class IdentityWrapper(nn.Module):
+    """Hold a diffusion model, optionally wrapped by `torch.compile`, and delegate calls to it."""
+
+    def __init__(self, diffusion_model, compile_model: bool = False):
+        super().__init__()
+        # `torch.compile` is only touched when requested and torch is at least 2.0.0.
+        supports_compile = version.parse(torch.__version__) >= version.parse("2.0.0")
+        use_compile = supports_compile and compile_model
+        self.diffusion_model = torch.compile(diffusion_model) if use_compile else diffusion_model
+
+    def forward(self, *args, **kwargs):
+        return self.diffusion_model(*args, **kwargs)
+
+
+class OpenAIWrapper(IdentityWrapper):
+    """Unpack a conditioning dict into the keyword arguments of the wrapped diffusion model."""
+
+    def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, cond_mask: torch.Tensor, num_frames: int,
+                **kwargs) -> torch.Tensor:
+        """Concatenate `c["concat"]` (if present) to `x` on dim 1 and call the model; `c` is not modified."""
+        concat = c.get("concat", None)
+        if concat is not None:
+            if num_frames > 1 and concat.shape[0] != x.shape[0]:
+                assert concat.shape[0] == x.shape[0] // num_frames, f"{concat.shape} {x.shape}"
+                # Expand into a local instead of writing back, so the caller's dict is left untouched.
+                concat = repeat_as_img_seq(concat, num_frames)
+            x = torch.cat((x, concat), dim=1)
+        return self.diffusion_model(
+            x, timesteps=t, context=c.get("crossattn", None), y=c.get("vector", None),
+            cond_mask=cond_mask, num_frames=num_frames, **kwargs
+        )
diff --git a/vista/vwm/modules/distributions/distributions.py b/vista/vwm/modules/distributions/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..845ba58a8937cf36a9fc137789840897b1e5752c
--- /dev/null
+++ b/vista/vwm/modules/distributions/distributions.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import numpy as np
+import torch
+
+
+class AbstractDistribution:  # interface: both methods are placeholders that always raise
+    def sample(self):  # to be overridden; raises NotImplementedError here
+        raise NotImplementedError
+
+    def mode(self):  # to be overridden; raises NotImplementedError here
+        raise NotImplementedError
+
+
+class DiracDistribution(AbstractDistribution):  # distribution that always yields one stored value
+    def __init__(self, value):
+        self.value = value  # returned unchanged by both sample() and mode()
+
+    def sample(self):  # no randomness: always the stored value
+        return self.value
+
+    def mode(self):  # identical to sample()
+        return self.value
+
+
+class DiagonalGaussianDistribution:
+    """Diagonal Gaussian whose mean and log-variance are the two halves of `parameters` along dim 1."""
+
+    def __init__(self, parameters, deterministic=False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)  # bound logvar before exponentiating
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            # Zero spread: `sample` then returns the mean.
+            self.var = self.std = torch.zeros_like(self.mean)
+
+    def sample(self):
+        """Return `mean + std * noise`, with `torch.randn` noise moved to the parameters' device."""
+        return self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+
+    def kl(self, other=None):
+        """Return the KL divergence to `other` (standard normal if None), summed over dims 1-3."""
+        if self.deterministic:
+            # Shape-(1,) zero created on the parameters' device so callers can combine it there.
+            return torch.zeros(1, device=self.parameters.device)
+        if other is None:
+            return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3])
+        return 0.5 * torch.sum(
+            torch.pow(self.mean - other.mean, 2) / other.var
+            + self.var / other.var - 1.0 - self.logvar + other.logvar,
+            dim=[1, 2, 3]
+        )
+
+    def nll(self, sample, dims=(1, 2, 3)):
+        """Return the negative log-likelihood of `sample`, summed over `dims`."""
+        if self.deterministic:
+            return torch.zeros(1, device=self.parameters.device)
+        return 0.5 * torch.sum(
+            np.log(2.0 * np.pi) + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims
+        )
+
+    def mode(self):
+        return self.mean
diff --git a/vista/vwm/modules/ema.py b/vista/vwm/modules/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..60ff6b0d6f4548828c3ad2f5e76391c2fe127615
--- /dev/null
+++ b/vista/vwm/modules/ema.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import torch
+from torch import nn
+
+
+class LitEma(nn.Module):
+    """Keep an exponential moving average (EMA) of a model's trainable parameters in buffers."""
+
+    def __init__(self, model, decay=0.9999, use_num_upates=True):
+        """Register one shadow buffer per parameter of `model` that requires grad.
+
+        Args:
+        ----
+            model: module whose `named_parameters()` are tracked.
+            decay: EMA decay; a `ValueError` is raised unless it lies in [0, 1].
+            use_num_upates: if True, count updates and cap the decay in `forward`.
+
+        """
+        super().__init__()
+        if decay < 0.0 or decay > 1.0:
+            raise ValueError("Decay must be between 0 and 1")
+        self.m_name2s_name = dict()
+        self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32))
+        # A counter of -1 disables the update counting and decay cap in `forward`.
+        self.register_buffer("num_updates", torch.tensor(0 if use_num_upates else -1, dtype=torch.int))
+
+        for name, p in model.named_parameters():
+            if p.requires_grad:
+                # remove as '.'-character is not allowed in buffers
+                s_name = name.replace(".", "")
+                self.m_name2s_name[name] = s_name
+                self.register_buffer(s_name, p.clone().detach().data)
+
+        self.collected_params = list()
+
+    def reset_num_updates(self):
+        """Restart the update counter at 0."""
+        del self.num_updates
+        self.register_buffer("num_updates", torch.tensor(0, dtype=torch.int))
+
+    def forward(self, model):
+        """Move every shadow buffer towards the matching parameter of `model` by a factor `1 - decay`."""
+        decay = self.decay
+        if self.num_updates >= 0:
+            self.num_updates += 1
+            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
+
+        with torch.no_grad():
+            shadow_params = dict(self.named_buffers())
+            for key, param in model.named_parameters():
+                if not param.requires_grad:
+                    assert key not in self.m_name2s_name
+                    continue
+                shadow = shadow_params[self.m_name2s_name[key]]
+                # Cast the parameter to the buffer, not the reverse: casting the buffer copies it on
+                # a dtype/device mismatch, and the in-place update would then hit the copy.
+                shadow.sub_((1.0 - decay) * (shadow - param.to(shadow)))
+
+    def copy_to(self, model):
+        """Overwrite the trainable parameters of `model` with their shadow values."""
+        shadow_params = dict(self.named_buffers())
+        for key, param in model.named_parameters():
+            if param.requires_grad:
+                param.data.copy_(shadow_params[self.m_name2s_name[key]].data)
+            else:
+                assert key not in self.m_name2s_name
+
+    def store(self, parameters):
+        """Save the current parameters for restoring later.
+
+        Args:
+        ----
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be temporarily stored.
+
+        """
+        self.collected_params = [param.clone() for param in parameters]
+
+    def restore(self, parameters):
+        """Restore the parameters stored with the `store` method.
+
+        Useful to validate the model with EMA parameters without affecting the
+        original optimization process. Store the parameters before the `copy_to` method.
+        After validation (or model saving), use this to restore the former parameters.
+
+        Args:
+        ----
+            parameters: Iterable of `torch.nn.Parameter`; the parameters to be updated with the stored parameters.
+
+        """
+        for c_param, param in zip(self.collected_params, parameters):
+            param.data.copy_(c_param.data)
diff --git a/vista/vwm/modules/encoders/modules.py b/vista/vwm/modules/encoders/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8613e8eb7fcee2f2ce567190d93ee2eca83b00f
--- /dev/null
+++ b/vista/vwm/modules/encoders/modules.py
@@ -0,0 +1,517 @@
+from __future__ import annotations
+
+import math
+from contextlib import nullcontext
+from typing import Optional, Union
+
+import kornia
+import numpy as np
+import open_clip
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+from omegaconf import ListConfig
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...util import (
+    append_dims,
+    autocast,
+    count_params,
+    default,
+    disabled_train,
+    expand_dims_like,
+    instantiate_from_config,
+)
+from ..diffusionmodules.openaimodel import Timestep
+
+
+class AbstractEmbModel(nn.Module):
+    """Base class for embedders; exposes `is_trainable`, `ucg_rate` and `input_key` as properties."""
+
+    def __init__(self):
+        super().__init__()
+        self._is_trainable = self._ucg_rate = self._input_key = None
+
+    @property
+    def is_trainable(self) -> bool:
+        return self._is_trainable
+
+    @is_trainable.setter
+    def is_trainable(self, value: bool):
+        self._is_trainable = value
+
+    @is_trainable.deleter
+    def is_trainable(self):
+        del self._is_trainable
+
+    @property
+    def ucg_rate(self) -> Union[float, torch.Tensor]:
+        return self._ucg_rate
+
+    @ucg_rate.setter
+    def ucg_rate(self, value: Union[float, torch.Tensor]):
+        self._ucg_rate = value
+
+    @ucg_rate.deleter
+    def ucg_rate(self):
+        del self._ucg_rate
+
+    @property
+    def input_key(self) -> str:
+        return self._input_key
+
+    @input_key.setter
+    def input_key(self, value: str):
+        self._input_key = value
+
+    @input_key.deleter
+    def input_key(self):
+        del self._input_key
+
+
+class GeneralConditioner(nn.Module):
+    """Run a list of embedders on a batch and merge their outputs into one conditioning dict."""
+
+    # Output key selected by the rank of an embedding, and the dim used when concatenating per key.
+    OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
+    KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
+
+    def __init__(self, emb_models: Union[list, ListConfig]):
+        """Instantiate every config in `emb_models`; embedders not marked `is_trainable` are frozen."""
+        super().__init__()
+        embedders = list()
+        for n, embconfig in enumerate(emb_models):
+            embedder = instantiate_from_config(embconfig)
+            assert isinstance(
+                embedder, AbstractEmbModel
+            ), f"Embedder model {embedder.__class__.__name__} has to inherit from AbstractEmbModel"
+            embedder.is_trainable = embconfig.get("is_trainable", False)
+            embedder.ucg_rate = embconfig.get("ucg_rate", 0.0)
+            if not embedder.is_trainable:
+                embedder.train = disabled_train
+                for param in embedder.parameters():
+                    param.requires_grad = False
+                embedder.eval()
+            print(
+                f"Initialized embedder #{n}: {embedder.__class__.__name__} "
+                f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
+            )
+
+            if "input_key" in embconfig:
+                embedder.input_key = embconfig["input_key"]
+            elif "input_keys" in embconfig:
+                embedder.input_keys = embconfig["input_keys"]
+            else:
+                raise KeyError(f"Need either `input_key` or `input_keys` for embedder {embedder.__class__.__name__}")
+
+            embedder.legacy_ucg_val = embconfig.get("legacy_ucg_value", None)
+            if embedder.legacy_ucg_val is not None:
+                embedder.ucg_prng = np.random.RandomState()
+
+            embedders.append(embedder)
+        self.embedders = nn.ModuleList(embedders)
+
+    def possibly_get_ucg_val(self, embedder: AbstractEmbModel, batch: dict) -> dict:
+        """Overwrite entries of `batch[embedder.input_key]` in place, each with probability `ucg_rate`."""
+        assert embedder.legacy_ucg_val is not None
+        p = embedder.ucg_rate
+        val = embedder.legacy_ucg_val
+        for i in range(len(batch[embedder.input_key])):
+            if embedder.ucg_prng.choice(2, p=[1 - p, p]):
+                batch[embedder.input_key][i] = val
+        return batch
+
+    def forward(self, batch: dict, force_zero_embeddings: Optional[list] = None) -> dict:
+        """Embed `batch` with every embedder, merged by output key; forced input keys yield zeros."""
+        output = dict()
+        force_zero_embeddings = default(force_zero_embeddings, list())
+        for embedder in self.embedders:
+            embedding_context = nullcontext if embedder.is_trainable else torch.no_grad
+            with embedding_context():
+                if hasattr(embedder, "input_key") and embedder.input_key is not None:
+                    if embedder.legacy_ucg_val is not None:
+                        batch = self.possibly_get_ucg_val(embedder, batch)
+                    if embedder.input_key in batch:
+                        emb_out_1s = []
+                        # TODO this should be a parameter
+                        for i in range(batch[embedder.input_key].shape[0]):
+                            emb_out_1s.append(embedder(batch[embedder.input_key][i].unsqueeze(0)))
+                        emb_out = torch.concat(emb_out_1s, 0)
+                    elif embedder.add_sequence_dim:  # concatenation
+                        emb_dim = embedder.num_features * embedder.outdim
+                        emb_out = torch.zeros((batch["cond_aug"].shape[0], 1, emb_dim), device=batch["cond_aug"].device)
+                    else:  # addition
+                        continue
+                elif hasattr(embedder, "input_keys"):
+                    emb_out = embedder(*[batch[k] for k in embedder.input_keys])
+            assert isinstance(
+                emb_out, (torch.Tensor, list, tuple)
+            ), f"Encoder outputs must be tensors or a sequence, but got {type(emb_out)}"
+            if not isinstance(emb_out, (list, tuple)):
+                emb_out = [emb_out]
+            for emb in emb_out:
+                out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
+                if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
+                    # Per-sample Bernoulli keep mask with keep probability `1 - ucg_rate`.
+                    keep_prob = (1.0 - embedder.ucg_rate) * torch.ones(emb.shape[0], device=emb.device)
+                    emb = expand_dims_like(torch.bernoulli(keep_prob), emb) * emb
+                if hasattr(embedder, "input_key") and embedder.input_key in force_zero_embeddings:
+                    emb = torch.zeros_like(emb)
+                if out_key in output:
+                    if emb.shape[-1] == 768 and out_key == "vector":  # 768-wide vectors are summed, not concatenated
+                        output[out_key] += emb
+                    else:
+                        output[out_key] = torch.cat((output[out_key], emb), self.KEY2CATDIM[out_key])
+                else:
+                    output[out_key] = emb
+        return output
+
+    def get_unconditional_conditioning(
+            self,
+            batch_c: dict,
+            batch_uc: Optional[dict] = None,
+            force_cond_zero_embeddings: Optional[list[str]] = None,
+            force_uc_zero_embeddings: Optional[list[str]] = None
+    ):
+        """Return `(c, uc)` computed with every embedder's `ucg_rate` temporarily set to 0.0."""
+        ucg_rates = [embedder.ucg_rate for embedder in self.embedders]
+        for embedder in self.embedders:
+            embedder.ucg_rate = 0.0
+        try:
+            c = self(batch_c, force_cond_zero_embeddings)
+            uc = self(batch_c if batch_uc is None else batch_uc, force_uc_zero_embeddings)
+        finally:
+            # Restore the rates even if an embedder raises, so they are not left at 0.0.
+            for embedder, rate in zip(self.embedders, ucg_rates):
+                embedder.ucg_rate = rate
+        return c, uc
+
+
+class FrozenCLIPEmbedder(AbstractEmbModel):
+    """Text embedder built on the Hugging Face CLIP tokenizer and text transformer."""
+
+    def __init__(
+            self,
+            # version="path_to/openai/clip-vit-large-patch14/pytorch_model.bin",
+            version="openai/clip-vit-large-patch14",
+            device="cuda",
+            max_length=77,
+            freeze=True,
+            layer="last",
+            layer_idx=None,
+            always_return_pooled=False
+    ):  # clip-vit-base-patch32
+        """Load tokenizer and transformer from `version`; `layer` selects the output of `forward`."""
+        super().__init__()
+        assert layer in ["last", "pooled", "hidden"]
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer, self.layer_idx = layer, layer_idx
+        self.return_pooled = always_return_pooled
+        if layer == "hidden":
+            assert layer_idx is not None
+            assert 0 <= abs(layer_idx) <= 12
+
+    def freeze(self):
+        """Switch the transformer to eval mode and turn off gradients for every parameter."""
+        self.transformer = self.transformer.eval()
+        for weight in self.parameters():
+            weight.requires_grad = False
+
+    @autocast
+    def forward(self, text):
+        """Tokenize `text` (truncated/padded to `max_length`) and run the text transformer.
+
+        Returns the output selected by `self.layer`, together with the pooled output when
+        `return_pooled` is set.
+        """
+        encoded = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt"
+        )
+        token_ids = encoded["input_ids"].to(self.device)
+        outputs = self.transformer(input_ids=token_ids, output_hidden_states=self.layer == "hidden")
+        if self.layer == "last":
+            z = outputs.last_hidden_state
+        elif self.layer == "pooled":
+            z = outputs.pooler_output[:, None]
+        else:
+            z = outputs.hidden_states[self.layer_idx]
+        return (z, outputs.pooler_output) if self.return_pooled else z
+
+    def encode(self, text):
+        """Alias for calling the module on `text`."""
+        return self(text)
+
+
+class FrozenOpenCLIPImageEmbedder(AbstractEmbModel):
+    """Uses the OpenCLIP vision transformer encoder for images."""
+
+    def __init__(
+            self,
+            arch="ViT-H-14",
+            # version="path_to/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin",
+            version="laion2b_s32b_b79k",
+            device="cuda",
+            max_length=77,
+            freeze=True,
+            antialias=True,
+            ucg_rate=0.0,
+            unsqueeze_dim=False,
+            repeat_to_max_len=False,
+            num_image_crops=0,
+            output_tokens=False,
+            init_device=None
+    ):
+        super().__init__()
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device(default(init_device, "cpu")),
+            pretrained=version
+        )
+        del model.transformer
+        self.model = model
+        self.max_crops = num_image_crops
+        self.pad_to_max_len = self.max_crops > 0
+        self.repeat_to_max_len = repeat_to_max_len and (not self.pad_to_max_len)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+        self.antialias = antialias
+
+        self.register_buffer("mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer("std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+        self.ucg_rate = ucg_rate
+        self.unsqueeze_dim = unsqueeze_dim
+        self.stored_batch = None
+        self.model.visual.output_tokens = output_tokens
+        self.output_tokens = output_tokens
+
+    def preprocess(self, x):
+        # normalize to [0,1]
+        x = kornia.geometry.resize(
+            x,
+            (224, 224),
+            interpolation="bicubic",
+            align_corners=True,
+            antialias=self.antialias
+        )
+        x = (x + 1.0) / 2.0
+        # renormalize according to clip
+        x = kornia.enhance.normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, image, no_dropout=False):  # returns z alone, or a pair whose layout depends on the flags handled at the end
+        z = self.encode_with_vision_transformer(image)
+        tokens = None
+        if self.output_tokens:
+            z, tokens = z[0], z[1]  # the encoder handed back an (embedding, tokens) pair
+        z = z.to(image.dtype)
+        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):  # the max_crops > 0 case is masked inside the encoder instead
+            z = (
+                    torch.bernoulli(
+                        (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+                    )[:, None]  # one keep/drop draw per row of z; a dropped row becomes all zeros
+                    * z
+            )
+            if tokens is not None:
+                tokens = (
+                        expand_dims_like(
+                            torch.bernoulli(  # drawn separately from the mask applied to z above
+                                (1.0 - self.ucg_rate) * torch.ones(tokens.shape[0], device=tokens.device)
+                            ),
+                            tokens
+                        )
+                        * tokens
+                )
+        if self.unsqueeze_dim:
+            z = z[:, None]  # insert a length-1 axis at position 1
+        if self.output_tokens:
+            assert not self.repeat_to_max_len
+            assert not self.pad_to_max_len
+            return tokens, z  # note the order: tokens first, embedding second
+        elif self.repeat_to_max_len:
+            if z.dim() == 2:
+                z_ = z[:, None]
+            else:
+                z_ = z
+            return repeat(z_, "b 1 d -> b n d", n=self.max_length), z  # (z tiled max_length times along axis 1, z itself)
+        elif self.pad_to_max_len:
+            assert z.dim() == 3
+            z_pad = torch.cat(
+                (
+                    z,
+                    torch.zeros(z.shape[0], self.max_length - z.shape[1], z.shape[2], device=z.device)  # NOTE(review): no dtype is passed, so the padding uses torch's default dtype - confirm it matches z
+                ),
+                1
+            )
+            return z_pad, z_pad[:, 0, ...]  # (z zero-padded to max_length along axis 1, its first entry along that axis)
+        else:
+            return z
+
+    def encode_with_vision_transformer(self, img):  # returns x, or (x, tokens) when self.output_tokens is set
+        if img.dim() == 5:
+            assert self.max_crops == img.shape[1]
+            img = rearrange(img, "b n c h w -> (b n) c h w")  # fold axis 1 (n) into the batch axis
+        img = self.preprocess(img)
+        if self.output_tokens:
+            assert self.model.visual.output_tokens  # the wrapped model must be configured the same way as this wrapper
+            x, tokens = self.model.visual(img)
+        else:
+            assert not self.model.visual.output_tokens
+            x = self.model.visual(img)
+            tokens = None
+        if self.max_crops > 0:
+            x = rearrange(x, "(b n) d -> b n d", n=self.max_crops)  # split the batch axis back into (b, n)
+            # drop out between 0 and all along the sequence axis
+            x = (  # NOTE(review): this mask is applied on every call; no `no_dropout` switch reaches this method
+                    torch.bernoulli(
+                        (1.0 - self.ucg_rate) * torch.ones(x.shape[0], x.shape[1], 1, device=x.device)
+                    )  # one independent keep/drop draw per (b, n) entry, broadcast over d
+                    * x
+            )
+            if tokens is not None:
+                tokens = rearrange(tokens, "(b n) t d -> b t (n d)", n=self.max_crops)  # concatenate the n groups along the last axis
+                print(
+                    f"You are running very experimental token-concat in {self.__class__.__name__}. "
+                    f"Check what you are doing, and then remove this message"
+                )
+        if self.output_tokens:
+            return x, tokens
+        else:
+            return x
+
+    def encode(self, text):  # NOTE(review): the argument is handed unchanged to self(...); forward() names that same input `image`
+        return self(text)
+
+
+class ConcatTimestepEmbedderND(AbstractEmbModel):
+    """Embeds each dimension independently and concatenates them."""
+
+    def __init__(self, outdim, num_features=None, add_sequence_dim=False):
+        super().__init__()
+        self.timestep = Timestep(outdim)
+        self.outdim = outdim
+        self.num_features = num_features
+        self.add_sequence_dim = add_sequence_dim
+
+    def forward(self, x):
+        if x.ndim == 1:
+            x = x[:, None]
+        assert len(x.shape) == 2
+        b, dims = x.shape[0], x.shape[1]
+        assert dims == self.num_features or self.num_features is None
+        x = rearrange(x, "b d -> (b d)")
+        emb = self.timestep(x)
+        emb = rearrange(emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
+        if self.add_sequence_dim:
+            emb = emb[:, None]
+        return emb
+
+
+class VideoPredictionEmbedderWithEncoder(AbstractEmbModel):
+    def __init__(
+            self,
+            n_cond_frames: int,
+            n_copies: int,
+            encoder_config: dict,
+            sigma_sampler_config: Optional[dict] = None,
+            sigma_cond_config: Optional[dict] = None,
+            is_ae: bool = False,
+            scale_factor: float = 1.0,
+            disable_encoder_autocast: bool = False,
+            en_and_decode_n_samples_a_time: Optional[int] = None
+    ):
+        super().__init__()
+        self.n_cond_frames = n_cond_frames
+        self.n_copies = n_copies
+        self.encoder = instantiate_from_config(encoder_config)
+        self.sigma_sampler = (
+            instantiate_from_config(sigma_sampler_config)
+            if sigma_sampler_config is not None
+            else None
+        )
+        self.sigma_cond = (
+            instantiate_from_config(sigma_cond_config)
+            if sigma_cond_config is not None
+            else None
+        )
+        self.is_ae = is_ae
+        self.scale_factor = scale_factor
+        self.disable_encoder_autocast = disable_encoder_autocast
+        self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time
+        self.skip_encode = False
+
+    def forward(
+            self, vid: torch.Tensor
+    ) -> Union[
+        torch.Tensor,
+        tuple[torch.Tensor, torch.Tensor],
+        tuple[torch.Tensor, dict],
+        tuple[tuple[torch.Tensor, torch.Tensor], dict]
+    ]:
+        if self.skip_encode:
+            return vid
+        else:
+            if self.sigma_sampler is not None:
+                bs = vid.shape[0] // self.n_cond_frames
+                sigmas = self.sigma_sampler(bs).to(vid.device)
+                if self.sigma_cond is not None:
+                    sigma_cond = self.sigma_cond(sigmas)
+                    sigma_cond = repeat(sigma_cond, "b d -> (b t) d", t=self.n_copies)
+                sigmas = repeat(sigmas, "b -> (b t)", t=self.n_cond_frames)
+                noise = torch.randn_like(vid)
+                vid = vid + noise * append_dims(sigmas, vid.ndim)
+
+            with torch.autocast("cuda", enabled=not self.disable_encoder_autocast):
+                n_samples = default(self.en_and_decode_n_samples_a_time, vid.shape[0])
+                n_rounds = math.ceil(vid.shape[0] / n_samples)
+                all_out = list()
+                for n in range(n_rounds):
+                    if self.is_ae:
+                        out = self.encoder.encode(vid[n * n_samples: (n + 1) * n_samples])
+                    else:
+                        out = self.encoder(vid[n * n_samples: (n + 1) * n_samples])
+                    all_out.append(out)
+
+            vid = torch.cat(all_out, dim=0)
+            vid *= self.scale_factor
+
+            vid = rearrange(vid, "(b t) c h w -> b () (t c) h w", t=self.n_cond_frames)
+            vid = repeat(vid, "b 1 c h w -> (b t) c h w", t=self.n_copies)
+
+            if self.sigma_cond is not None:
+                return vid, sigma_cond
+            else:
+                return vid
+
+
+class FrozenOpenCLIPImagePredictionEmbedder(AbstractEmbModel):
+    def __init__(self, open_clip_embedding_config: dict, n_cond_frames: int, n_copies: int):
+        super().__init__()
+        self.n_cond_frames = n_cond_frames
+        self.n_copies = n_copies
+        self.open_clip = instantiate_from_config(open_clip_embedding_config)
+
+    def forward(self, vid):
+        vid = self.open_clip(vid)
+        vid = rearrange(vid, "(b t) d -> b t d", t=self.n_cond_frames)
+        vid = repeat(vid, "b t d -> (b s) t d", s=self.n_copies)
+        return vid
diff --git a/vista/vwm/modules/video_attention.py b/vista/vwm/modules/video_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f6191d01e6f0d2945b9322e479e579cb1b76e9
--- /dev/null
+++ b/vista/vwm/modules/video_attention.py
@@ -0,0 +1,298 @@
+from __future__ import annotations
+
+from ..modules.attention import *
+from ..modules.diffusionmodules.util import AlphaBlender, linear, timestep_embedding
+
+
+class TimeMixSequential(nn.Sequential):
+    def forward(self, x, context=None, timesteps=None):
+        for layer in self:
+            x = layer(x, context, timesteps)
+        return x
+
+
+class VideoTransformerBlock(nn.Module):  # attention + feed-forward block applied after regrouping the input along t (see _forward)
+    ATTENTION_MODES = {
+        "softmax": CrossAttention,  # vanilla attention
+        "softmax-xformers": MemoryEfficientCrossAttention  # ampere
+    }
+
+    def __init__(
+            self,
+            dim,
+            n_heads,
+            d_head,
+            dropout=0.0,
+            context_dim=None,
+            gated_ff=True,
+            use_checkpoint=False,
+            timesteps=None,
+            ff_in=False,
+            inner_dim=None,
+            attn_mode="softmax",
+            disable_self_attn=False,
+            disable_temporal_crossattention=False,
+            switch_temporal_ca_to_sa=False,
+            add_lora=False,
+            action_control=False
+    ):
+        super().__init__()
+        attn_cls = self.ATTENTION_MODES[attn_mode]  # an unknown attn_mode raises KeyError here
+
+        self.ff_in = ff_in or inner_dim is not None  # the input feed-forward is used if requested or whenever inner_dim is passed
+        if inner_dim is None:
+            inner_dim = dim
+
+        assert int(n_heads * d_head) == inner_dim
+
+        self.is_res = inner_dim == dim  # the skip additions around the feed-forwards are only done when both widths agree
+
+        if self.ff_in:
+            self.norm_in = nn.LayerNorm(dim)
+            self.ff_in = FeedForward(dim, dim_out=inner_dim, dropout=dropout, glu=gated_ff)  # replaces the boolean flag by the module itself
+
+        self.timesteps = timesteps
+        self.disable_self_attn = disable_self_attn
+        if disable_self_attn:
+            self.attn1 = attn_cls(
+                query_dim=inner_dim,
+                context_dim=context_dim,
+                heads=n_heads,
+                dim_head=d_head,
+                dropout=dropout,
+                add_lora=add_lora
+            )  # is a cross-attn
+        else:
+            self.attn1 = attn_cls(
+                query_dim=inner_dim,
+                heads=n_heads,
+                dim_head=d_head,
+                dropout=dropout,
+                causal=False,
+                add_lora=add_lora
+            )  # is a self-attn
+
+        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff)
+
+        if not disable_temporal_crossattention:  # otherwise neither `norm2` nor `attn2` exists; _forward checks with hasattr
+            self.norm2 = nn.LayerNorm(inner_dim)
+            if switch_temporal_ca_to_sa:
+                self.attn2 = attn_cls(
+                    query_dim=inner_dim,
+                    heads=n_heads,
+                    dim_head=d_head,
+                    dropout=dropout,
+                    causal=False,
+                    add_lora=add_lora
+                )  # is a self-attn
+            else:
+                self.attn2 = attn_cls(
+                    query_dim=inner_dim,
+                    context_dim=context_dim,
+                    heads=n_heads,
+                    dim_head=d_head,
+                    dropout=dropout,
+                    add_lora=add_lora,
+                    action_control=action_control
+                )  # is self-attn if context is None
+
+        self.norm1 = nn.LayerNorm(inner_dim)
+        self.norm3 = nn.LayerNorm(inner_dim)
+        self.switch_temporal_ca_to_sa = switch_temporal_ca_to_sa
+
+        self.use_checkpoint = use_checkpoint
+        if self.use_checkpoint:
+            print(f"{self.__class__.__name__} is using checkpointing")
+
+    def forward(self, x: torch.Tensor, context: torch.Tensor = None, timesteps: int = None) -> torch.Tensor:
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x, context, timesteps)  # same computation, routed through `checkpoint`
+        else:
+            return self._forward(x, context, timesteps=timesteps)
+
+    def _forward(self, x, context=None, timesteps=None):
+        assert self.timesteps or timesteps  # the frame count must come from the constructor or from the call
+        assert not (self.timesteps and timesteps) or self.timesteps == timesteps  # if both are given they must agree
+        timesteps = self.timesteps or timesteps
+        B, S, C = x.shape  # B is the folded (b t) batch size of the pattern below
+        x = rearrange(x, "(b t) s c -> (b s) t c", t=timesteps)  # t becomes axis 1, s joins the batch axis
+
+        if self.ff_in:
+            x_skip = x
+            x = self.ff_in(self.norm_in(x))
+            if self.is_res:
+                x += x_skip
+
+        if self.disable_self_attn:
+            x = self.attn1(self.norm1(x), context=context, batchify_xformers=True) + x
+        else:  # this way
+            x = self.attn1(self.norm1(x), batchify_xformers=True) + x
+
+        if hasattr(self, "attn2"):  # absent when disable_temporal_crossattention was set
+            if self.switch_temporal_ca_to_sa:
+                x = self.attn2(self.norm2(x), batchify_xformers=True) + x
+            else:  # this way
+                x = self.attn2(self.norm2(x), context=context, batchify_xformers=True) + x
+
+        x_skip = x
+        x = self.ff(self.norm3(x))
+        if self.is_res:
+            x += x_skip
+
+        x = rearrange(x, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps)  # undo the regrouping done at the top
+        return x
+
+    def get_last_layer(self):
+        return self.ff.net[-1].weight  # weight of the last entry of the output feed-forward's `net`
+
+
+class SpatialVideoTransformer(SpatialTransformer):  # pairs every inherited transformer block with a VideoTransformerBlock from `time_stack`
+    def __init__(
+            self,
+            in_channels,
+            n_heads,
+            d_head,
+            depth=1,
+            dropout=0.0,
+            use_linear=False,
+            context_dim=None,
+            use_spatial_context=False,
+            timesteps=None,
+            merge_strategy: str = "fixed",
+            merge_factor: float = 0.5,
+            time_context_dim=None,
+            ff_in=False,
+            use_checkpoint=False,
+            time_depth=1,
+            attn_mode="softmax",
+            disable_self_attn=False,
+            disable_temporal_crossattention=False,
+            max_time_embed_period=10000,
+            add_lora=False,
+            action_control=False
+    ):
+        super().__init__(
+            in_channels,
+            n_heads,
+            d_head,
+            depth=depth,
+            dropout=dropout,
+            attn_type=attn_mode,
+            use_checkpoint=use_checkpoint,
+            context_dim=context_dim,
+            use_linear=use_linear,
+            disable_self_attn=disable_self_attn,
+            add_lora=add_lora,
+            action_control=action_control
+        )
+        self.time_depth = time_depth  # stored only; the number of temporal blocks below follows `depth`
+        self.depth = depth
+        self.max_time_embed_period = max_time_embed_period
+
+        time_mix_d_head = d_head
+        n_time_mix_heads = n_heads
+
+        time_mix_inner_dim = int(time_mix_d_head * n_time_mix_heads)  # same value as `inner_dim` below, since both factors are copies
+
+        inner_dim = n_heads * d_head
+        if use_spatial_context:
+            time_context_dim = context_dim  # overrides any time_context_dim that was passed in
+
+        self.time_stack = nn.ModuleList(
+            [
+                VideoTransformerBlock(
+                    inner_dim,
+                    n_time_mix_heads,
+                    time_mix_d_head,
+                    dropout=dropout,
+                    context_dim=time_context_dim,
+                    timesteps=timesteps,
+                    use_checkpoint=use_checkpoint,
+                    ff_in=ff_in,
+                    inner_dim=time_mix_inner_dim,
+                    attn_mode=attn_mode,
+                    disable_self_attn=disable_self_attn,
+                    disable_temporal_crossattention=disable_temporal_crossattention,
+                    add_lora=add_lora,
+                    action_control=action_control
+                )
+                for _ in range(self.depth)
+            ]
+        )
+
+        assert len(self.time_stack) == len(self.transformer_blocks)  # forward() zips the two lists
+
+        self.use_spatial_context = use_spatial_context
+        self.in_channels = in_channels
+
+        time_embed_dim = in_channels * 4
+        self.time_pos_embed = nn.Sequential(
+            linear(in_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, in_channels)
+        )
+
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            rearrange_pattern="b t -> (b t) 1 1"
+        )
+
+    def forward(
+            self,
+            x: torch.Tensor,
+            context: Optional[torch.Tensor] = None,
+            time_context: Optional[torch.Tensor] = None,
+            timesteps: Optional[int] = None  # NOTE(review): defaults to None but is used as an integer below - confirm callers always pass it
+    ) -> torch.Tensor:
+        _, _, h, w = x.shape
+        x_in = x  # kept for the skip connection at the end
+        spatial_context = None
+        if exists(context):
+            spatial_context = context
+
+        if self.use_spatial_context:
+            assert context.ndim == 3, f"Dims of spatial context should be 3 but are {context.ndim}"
+
+            time_context = context  # any `time_context` argument is ignored in this mode
+            time_context_first_timestep = time_context[::timesteps]  # keep every `timesteps`-th row
+            time_context = repeat(time_context_first_timestep, "b ... -> (b n) ...", n=h * w)  # one copy per spatial position
+        elif time_context is not None and not self.use_spatial_context:  # the second condition always holds in this elif
+            time_context = repeat(time_context, "b ... -> (b n) ...", n=h * w)
+            if time_context.ndim == 2:
+                time_context = rearrange(time_context, "b c -> b 1 c")
+
+        x = self.norm(x)
+        if not self.use_linear:
+            x = self.proj_in(x)  # projection applied before flattening h, w ...
+        x = rearrange(x, "b c h w -> b (h w) c")
+        if self.use_linear:
+            x = self.proj_in(x)  # ... or after it, depending on use_linear
+
+        num_frames = torch.arange(timesteps, device=x.device)  # frame indices 0 .. timesteps - 1
+        num_frames = repeat(num_frames, "t -> (b t)", b=x.shape[0] // timesteps)
+        t_emb = timestep_embedding(
+            num_frames,
+            self.in_channels,
+            repeat_only=False,
+            max_period=self.max_time_embed_period
+        )
+        emb = self.time_pos_embed(t_emb)
+        emb = emb[:, None]  # length-1 axis so the embedding broadcasts over the (h w) axis
+
+        for block, mix_block in zip(self.transformer_blocks, self.time_stack):
+            x = block(x, context=spatial_context)
+
+            x_mix = x
+            x_mix = x_mix + emb  # only the input of the temporal block receives the frame embedding
+
+            x_mix = mix_block(x_mix, context=time_context, timesteps=timesteps)
+            x = self.time_mixer(x_spatial=x, x_temporal=x_mix)  # combine both branches via the AlphaBlender
+
+        if self.use_linear:
+            x = self.proj_out(x)
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        if not self.use_linear:
+            x = self.proj_out(x)
+        out = x + x_in
+        return out
diff --git a/vista/vwm/util.py b/vista/vwm/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a970ebef768f9215dfa430a8c2078f4191297bb
--- /dev/null
+++ b/vista/vwm/util.py
@@ -0,0 +1,199 @@
+from __future__ import annotations
+
+import functools
+import importlib
+import os
+from functools import partial
+from inspect import isfunction
+
+import fsspec
+import torch
+from einops import repeat
+
+
+def disabled_train(self, mode=True):
+    """Overwrite model.train with this function to make sure train/eval mode does not change anymore."""
+    # `mode` is never read: the call does nothing and just returns `self`
+    return self
+
+
+def get_string_from_tuple(s):
+    try:
+        # check if the string starts and ends with parentheses
+        if s[0] == "(" and s[-1] == ")":
+            # convert the string to a tuple
+            t = eval(s)
+            # check if the type of t is tuple
+            if isinstance(t, tuple):
+                return t[0]
+            else:
+                pass
+    except:
+        pass
+    return s
+
+
+def is_power_of_two(n):
+    """Return True if n is a power of 2, otherwise return False."""
+
+    if n <= 0:
+        return False
+    else:
+        return (n & (n - 1)) == 0
+
+
+def autocast(f, enabled=True):
+    def do_autocast(*args, **kwargs):
+        with torch.cuda.amp.autocast(
+                enabled=enabled,
+                dtype=torch.get_autocast_gpu_dtype(),
+                cache_enabled=torch.is_autocast_cache_enabled()
+        ):
+            return f(*args, **kwargs)
+
+    return do_autocast
+
+
+def load_partial_from_config(config):
+    return partial(get_obj_from_str(config["target"]), **config.get("params", dict()))
+
+
+def repeat_as_img_seq(x, num_frames):
+    if x is not None:
+        if isinstance(x, list):
+            new_x = list()
+            for item_x in x:
+                new_x += [item_x] * num_frames
+            return new_x
+        else:
+            x = x.unsqueeze(1)
+            x = repeat(x, "b 1 ... -> (b t) ...", t=num_frames)
+            return x
+    else:
+        return None
+
+
+def partialclass(cls, *args, **kwargs):  # returns a subclass of `cls` whose __init__ has `args` / `kwargs` pre-bound
+    class NewCls(cls):
+        __init__ = functools.partialmethod(cls.__init__, *args, **kwargs)
+
+    return NewCls
+
+
+def make_path_absolute(path):
+    fs, p = fsspec.core.url_to_fs(path)
+    if fs.protocol == "file":
+        return os.path.abspath(p)
+    else:
+        return path
+
+
+def ismap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    else:
+        return (len(x.shape) == 4) and (x.shape[1] > 3)
+
+
+def isimage(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    else:
+        return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
+
+
+def isheatmap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    else:
+        return x.ndim == 2
+
+
+def isneighbors(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    else:
+        return x.ndim == 5 and (x.shape[2] == 3 or x.shape[2] == 1)
+
+
+def exists(x):
+    return x is not None
+
+
+def expand_dims_like(x, y):
+    while x.dim() != y.dim():
+        x = x.unsqueeze(-1)
+    return x
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    else:
+        return d() if isfunction(d) else d
+
+
+def mean_flat(tensor):
+    """
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+    Take the mean over all non-batch dimensions.
+    """
+
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def count_params(model, verbose=False):
+    total_params = sum(p.numel() for p in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params")
+    return total_params
+
+
+def instantiate_from_config(config):
+    if "target" not in config:
+        if config == "__is_first_stage__":
+            return None
+        elif config == "__is_unconditional__":
+            return None
+        else:
+            raise KeyError("Expected key `target` to instantiate")
+    else:
+        return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
+
+def get_obj_from_str(string, reload=False, invalidate_cache=True):
+    module, cls = string.rsplit(".", 1)
+    if invalidate_cache:
+        importlib.invalidate_caches()
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+
+
+def append_zero(x):
+    return torch.cat((x, x.new_zeros([1])))
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(f"Input has {x.ndim} dims but target_dims is {target_dims}, which is less")
+    return x[(...,) + (None,) * dims_to_append]
+
+
+def get_configs_path() -> str:
+    """Get the `configs` directory."""
+
+    this_dir = os.path.dirname(__file__)
+    candidates = (
+        os.path.join(this_dir, "configs"),
+        os.path.join(this_dir, "..", "configs")
+    )
+    for candidate in candidates:
+        candidate = os.path.abspath(candidate)
+        if os.path.isdir(candidate):
+            return candidate
+    raise FileNotFoundError(f"Could not find configs in {candidates}")