HongyuJia committed on
Commit 3e6cbc8
0 Parent(s):

Init commit

Files changed (34)
  1. .gitattributes +38 -0
  2. README.md +233 -0
  3. ckpt/magi/24B_base/inference_weight/model-00001-of-00006.safetensors +3 -0
  4. ckpt/magi/24B_base/inference_weight/model-00002-of-00006.safetensors +3 -0
  5. ckpt/magi/24B_base/inference_weight/model-00003-of-00006.safetensors +3 -0
  6. ckpt/magi/24B_base/inference_weight/model-00004-of-00006.safetensors +3 -0
  7. ckpt/magi/24B_base/inference_weight/model-00005-of-00006.safetensors +3 -0
  8. ckpt/magi/24B_base/inference_weight/model-00006-of-00006.safetensors +3 -0
  9. ckpt/magi/24B_base/inference_weight/model.safetensors.index.json +0 -0
  10. ckpt/magi/24B_distill/inference_weight.distill/model-00001-of-00006.safetensors +3 -0
  11. ckpt/magi/24B_distill/inference_weight.distill/model-00002-of-00006.safetensors +3 -0
  12. ckpt/magi/24B_distill/inference_weight.distill/model-00003-of-00006.safetensors +3 -0
  13. ckpt/magi/24B_distill/inference_weight.distill/model-00004-of-00006.safetensors +3 -0
  14. ckpt/magi/24B_distill/inference_weight.distill/model-00005-of-00006.safetensors +3 -0
  15. ckpt/magi/24B_distill/inference_weight.distill/model-00006-of-00006.safetensors +3 -0
  16. ckpt/magi/24B_distill/inference_weight.distill/model.safetensors.index.json +0 -0
  17. ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model-00001-of-00003.safetensors +3 -0
  18. ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model-00002-of-00003.safetensors +3 -0
  19. ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model-00003-of-00003.safetensors +3 -0
  20. ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model.safetensors.index.json +0 -0
  21. ckpt/t5/t5-v1_1-xxl/config.json +31 -0
  22. ckpt/t5/t5-v1_1-xxl/pytorch_model-00001-of-00002.bin +3 -0
  23. ckpt/t5/t5-v1_1-xxl/pytorch_model-00002-of-00002.bin +3 -0
  24. ckpt/t5/t5-v1_1-xxl/pytorch_model.bin.index.json +227 -0
  25. ckpt/t5/t5-v1_1-xxl/special_tokens_map.json +1 -0
  26. ckpt/t5/t5-v1_1-xxl/spiece.model +3 -0
  27. ckpt/t5/t5-v1_1-xxl/t5-v1_1-xxl/pytorch_model-00001-of-00002.bin +3 -0
  28. ckpt/t5/t5-v1_1-xxl/tokenizer_config.json +1 -0
  29. ckpt/vae/config.json +22 -0
  30. ckpt/vae/diffusion_pytorch_model.safetensors +3 -0
  31. figures/algorithm.png +3 -0
  32. figures/dit_architecture.png +3 -0
  33. figures/inhouse_human_evaluation.png +3 -0
  34. figures/logo_black.png +0 -0
.gitattributes ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ figures/algorithm.png filter=lfs diff=lfs merge=lfs -text
+ figures/dit_architecture.png filter=lfs diff=lfs merge=lfs -text
+ figures/inhouse_human_evaluation.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,233 @@
+ ---
+ license: apache-2.0
+ language:
+ - en
+ pipeline_tag: image-to-video
+ library_name: MAGI-1
+ ---
+
+ ![magi-logo](figures/logo_black.png)
+
+
+ -----
+
+ <p align="center" style="line-height: 1;">
+ <a href="https://static.magi.world/static/files/MAGI_1.pdf" target="_blank" style="margin: 2px;">
+ <img alt="paper" src="https://img.shields.io/badge/Paper-arXiv-B31B1B?logo=arxiv" style="display: inline-block; vertical-align: middle;">
+ </a>
+ <a href="https://sand.ai" target="_blank" style="margin: 2px;">
+ <img alt="blog" src="https://img.shields.io/badge/Sand%20AI-Homepage-333333.svg?logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iODAwIiBoZWlnaHQ9IjgwMCIgdmlld0JveD0iMCAwIDgwMCA4MDAiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJNMjI3IDIyNS4wODVDMjI3IDIwMi4zMDMgMjI3IDE5MC45MTIgMjMxLjQzNyAxODIuMjExQzIzNS4zMzkgMTc0LjU1NyAyNDEuNTY2IDE2OC4zMzQgMjQ5LjIyNiAxNjQuNDM0QzI1Ny45MzMgMTYwIDI2OS4zMzIgMTYwIDI5Mi4xMjkgMTYwSDUwNy44NzFDNTA5LjI5NSAxNjAgNTEwLjY3NiAxNjAgNTEyLjAxNCAxNjAuMDAxQzUzMi4wODIgMTYwLjAxNyA1NDIuNjExIDE2MC4yNzcgNTUwLjc3NCAxNjQuNDM0QzU1OC40MzQgMTY4LjMzNCA1NjQuNjYxIDE3NC41NTcgNTY4LjU2MyAxODIuMjExQzU3MyAxOTAuOTEyIDU3MyAyMDIuMzAzIDU3MyAyMjUuMDg1VjI1Ni41NThDNTczIDI5MS4zMTkgNTczIDMwOC43IDU2NS4wMzUgMzIzLjI3OUM1NTguNzU2IDMzNC43NzIgNTQzLjU2NSAzNDYuMTEgNTIzLjA3OCAzNTkuNjA1QzUxNC42NzQgMzY1LjE0MSA1MTAuNDcyIDM2Ny45MDkgNTA1LjYzOSAzNjcuOTM2QzUwMC44MDYgMzY3Ljk2NCA0OTYuNTAzIDM2NS4yIDQ4Ny44OTYgMzU5LjY3MUw0ODcuODk2IDM1OS42N0w0NjYuNDY5IDM0NS45MDVDNDU2Ljg3NSAzMzkuNzQyIDQ1Mi4wNzggMzM2LjY2IDQ1Mi4wNzggMzMyLjIxOEM0NTIuMDc4IDMyNy43NzcgNDU2Ljg3NSAzMjQuNjk1IDQ2Ni40NjkgMzE4LjUzMUw1MjYuNzgyIDI3OS43ODVDNTM1LjI5MSAyNzQuMzE5IDU0MC40MzUgMjY0LjkwMyA1NDAuNDM1IDI1NC43OTRDNTQwLjQzNSAyMzguMzg2IDUyNy4xMjUgMjI1LjA4NSA1MTAuNzA1IDIyNS4wODVIMjg5LjI5NUMyNzIuODc1IDIyNS4wODUgMjU5LjU2NSAyMzguMzg2IDI1OS41NjUgMjU0Ljc5NEMyNTkuNTY1IDI2NC45MDMgMjY0LjcwOSAyNzQuMzE5IDI3My4yMTggMjc5Ljc4NUw1MTMuMTggNDMzLjk0MUM1NDIuNDQxIDQ1Mi43MzggNTU3LjA3MSA0NjIuMTM3IDU2NS4wMzUgNDc2LjcxNkM1NzMgNDkxLjI5NCA1NzMgNTA4LjY3NSA1NzMgNTQzLjQzNlY1NzQuOTE1QzU3MyA1OTcuNjk3IDU3MyA2MDkuMDg4IDU2OC41NjMgNjE3Ljc4OUM1NjQuNjYxIDYyNS40NDQgNTU4LjQzNCA2MzEuNjY2IDU1MC43NzQgNjM1LjU2NkM1NDIuMDY3IDY0MCA1MzAuNjY4IDY0MCA1MDcuODcxIDY0MEgyOTIuMTI5QzI2OS4zMzIgNjQwIDI1Ny45MzMgNjQwIDI0OS4yMjYgNjM1LjU2NkMyNDEuNTY2IDYzMS42NjYgMjM1LjMzOSA2MjUuNDQ0IDIzMS40MzcgNjE3Ljc4OUMyMjcgNjA5LjA4OCAyMjcgNTk3LjY5NyAyMjcgNTc0LjkxNVY1NDMuNDM2QzIyNyA1MDguNjc1IDIyNyA0OTEuMjk0IDIzNC45NjUgNDc2LjcxNkMyNDEuMjQ0IDQ2NS4yMjIgMjU2LjQzMyA0NTMuODg2IDI3Ni45MTggNDQwLjM5MkMyODUuMzIyIDQzNC44NTYgMjg5LjUyNSA0MzIuMDg4IDI5NC4zNTcgNDMyLjA2QzI5OS4xOSA0MzIuMDMyIDMwMy40OTQgNDM0Ljc5NyAzMTIuMSA0NDAuMzI2TDMzMy41MjcgNDU0LjA5MUMzNDMuMTIyIDQ2MC4yNTQgMzQ3LjkxOSA0NjMuMzM2IDM0Ny45MTkgNDY3Ljc3OEMzNDcuOTE5IDQ3Mi4yMiAzNDMuMTIyIDQ3NS4zMDEgMzMzLjUyOCA0ODEuNDY1TDMzMy41MjcgNDgxLjQ2NUwyNzMuMjIgNTIwLjIwOEMyNjQuNzA5IDUyNS42NzUgMjU5LjU2NSA1MzUuMDkxIDI1OS41NjUgNTQ1LjIwMkMyNTkuNTY1IDU2MS42MTIgMjcyLjg3NyA1NzQuOTE1IDI4OS4yOTkgNTc0LjkxNUg1MTAuNzAxQzUyNy4xMjMgNTc0LjkxNSA1NDAuNDM1IDU2MS42MTIgNTQwLjQzNSA1NDUuMjAyQzU0MC40MzUgNTM1LjA5MSA1MzUuMjkxIDUyNS42NzUgNTI2Ljc4IDUyMC4yMDhMMjg2LjgyIDM2Ni4wNTNDMjU3LjU2IDM0Ny4yNTYgMjQyLjkyOSAzMzcuODU3IDIzNC45NjUgMzIzLjI3OUMyMjcgMzA4LjcgMjI3IDI5MS4zMTkgMjI3IDI1Ni41NThWMjI1LjA4NVoiIGZpbGw9IiNGRkZGRkYiLz4KPC9zdmc+Cg==" style="display: inline-block; vertical-align: middle;">
+ </a>
+ <a href="https://magi.sand.ai" target="_blank" style="margin: 2px;">
+ <img alt="product" src="https://img.shields.io/badge/Magi-Product-logo.svg?logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iODAwIiBoZWlnaHQ9IjgwMCIgdmlld0JveD0iMCAwIDgwMCA4MDAiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJNNDY5LjAyNyA1MDcuOTUxVjE4MC4zNjRDNDY5LjAyNyAxNjguNDE2IDQ2OS4wMjcgMTYyLjQ0MiA0NjUuMjQ0IDE2MC41MTlDNDYxLjQ2MSAxNTguNTk2IDQ1Ni42NTkgMTYyLjEzIDQ0Ny4wNTYgMTY5LjE5OEwzNjEuMDQ4IDIzMi40OTZDMzQ2LjI5NiAyNDMuMzUzIDMzOC45MjEgMjQ4Ljc4MSAzMzQuOTQ3IDI1Ni42NUMzMzAuOTczIDI2NC41MTggMzMwLjk3MyAyNzMuNjk1IDMzMC45NzMgMjkyLjA0OVY2MTkuNjM2QzMzMC45NzMgNjMxLjU4NCAzMzAuOTczIDYzNy41NTggMzM0Ljc1NiA2MzkuNDgxQzMzOC41MzkgNjQxLjQwNCAzNDMuMzQxIDYzNy44NyAzNTIuOTQ0IDYzMC44MDJMNDM4Ljk1MiA1NjcuNTA0QzQ1My43MDQgNTU2LjY0OCA0NjEuMDggNTUxLjIxOSA0NjUuMDUzIDU0My4zNUM0NjkuMDI3IDUzNS40ODIgNDY5LjAyNyA1MjYuMzA1IDQ2OS4wMjcgNTA3Ljk1MVpNMjg3LjkwNyA0OTQuMTU1VjIyMS45M0MyODcuOTA3IDIxNC4wMDIgMjg3LjkwNyAyMTAuMDM5IDI4NS4zOTQgMjA4Ljc1NEMyODIuODgxIDIwNy40NyAyNzkuNjg0IDIwOS44MDEgMjczLjI5MiAyMTQuNDYyTDIwOS40MjEgMjYxLjAzMkMxOTguMjYyIDI2OS4xNjggMTkyLjY4MyAyNzMuMjM2IDE4OS42NzUgMjc5LjE2QzE4Ni42NjcgMjg1LjA4NCAxODYuNjY3IDI5Mi4wMDMgMTg2LjY2NyAzMDUuODQxVjU3OC4wNjdDMTg2LjY2NyA1ODUuOTk0IDE4Ni42NjcgNTg5Ljk1OCAxODkuMTggNTkxLjI0MkMxOTEuNjkzIDU5Mi41MjYgMTk0Ljg4OSA1OTAuMTk2IDIwMS4yODIgNTg1LjUzNUwyNjUuMTUyIDUzOC45NjVDMjc2LjMxMSA1MzAuODI5IDI4MS44OSA1MjYuNzYxIDI4NC44OTkgNTIwLjgzN0MyODcuOTA3IDUxNC45MTMgMjg3LjkwNyA1MDcuOTk0IDI4Ny45MDcgNDk0LjE1NVpNNjEzLjMzMyAyMjEuOTNWNDk0LjE1NUM2MTMuMzMzIDUwNy45OTQgNjEzLjMzMyA1MTQuOTEzIDYxMC4zMjUgNTIwLjgzN0M2MDcuMzE3IDUyNi43NjEgNjAxLjczOCA1MzAuODI5IDU5MC41NzkgNTM4Ljk2NUw1MjYuNzA4IDU4NS41MzVDNTIwLjMxNiA1OTAuMTk2IDUxNy4xMTkgNTkyLjUyNiA1MTQuNjA2IDU5MS4yNDJDNTEyLjA5MyA1ODkuOTU4IDUxMi4wOTMgNTg1Ljk5NCA1MTIuMDkzIDU3OC4wNjdWMzA1Ljg0MUM1MTIuMDkzIDI5Mi4wMDMgNTEyLjA5MyAyODUuMDg0IDUxNS4xMDIgMjc5LjE2QzUxOC4xMSAyNzMuMjM2IDUyMy42ODkgMjY5LjE2OCA1MzQuODQ4IDI2MS4wMzJMNTk4LjcxOSAyMTQuNDYyQzYwNS4xMTEgMjA5LjgwMSA2MDguMzA3IDIwNy40NyA2MTAuODIgMjA4Ljc1NEM2MTMuMzMzIDIxMC4wMzkgNjEzLjMzMyAyMTQuMDAyIDYxMy4zMzMgMjIxLjkzWiIgZmlsbD0iI0ZGRkZGRiIgc2hhcGUtcmVuZGVyaW5nPSJjcmlzcEVkZ2VzIi8+Cjwvc3ZnPgo=&color=DCBE7E" style="display: inline-block; vertical-align: middle;">
+ </a>
+ <a href="https://huggingface.co/sand-ai" target="_blank" style="margin: 2px;">
+ <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Sand AI-ffc107?color=ffc107&logoColor=white" style="display: inline-block; vertical-align: middle;">
+ </a>
+ <a href="https://x.com/SandAI_HQ" target="_blank" style="margin: 2px;">
+ <img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-Sand%20AI-white?logo=x&logoColor=white" style="display: inline-block; vertical-align: middle;">
+ </a>
+ <a href="https://discord.gg/hgaZ86D7Wv" target="_blank" style="margin: 2px;">
+ <img alt="Discord" src="https://img.shields.io/badge/Discord-Sand%20AI-7289da?logo=discord&logoColor=white&color=7289da" style="display: inline-block; vertical-align: middle;">
+ </a>
+ <a href="https://github.com/SandAI-org/Magi/LICENSE" target="_blank" style="margin: 2px;">
+ <img alt="license" src="https://img.shields.io/badge/License-Apache2.0-green?logo=Apache" style="display: inline-block; vertical-align: middle;">
+ </a>
+ </p>
+
+ # MAGI-1: Autoregressive Video Generation at Scale
+
+ This repository contains the pre-trained weights and inference code for the MAGI-1 model. You can find more information in our [technical report](https://static.magi.world/static/files/MAGI_1.pdf) or directly create magic with MAGI-1 [here](http://sand.ai). 🚀✨
+
+
+ ## 🔥🔥🔥 Latest News
+
+ - Apr 21, 2025: MAGI-1 is here 🎉. We've released the model weights and inference code — check it out!
+
+
+ ## 1. About
+
+ We present MAGI-1, a world model that generates videos by ***autoregressively*** predicting a sequence of video chunks, defined as fixed-length segments of consecutive frames. Trained to denoise per-chunk noise that increases monotonically over time, MAGI-1 enables causal temporal modeling and naturally supports streaming generation. It achieves strong performance on image-to-video (I2V) tasks conditioned on text instructions, providing high temporal consistency and scalability, which are made possible by several algorithmic innovations and a dedicated infrastructure stack. MAGI-1 further supports controllable generation via chunk-wise prompting, enabling smooth scene transitions, long-horizon synthesis, and fine-grained text-driven control. We believe MAGI-1 offers a promising direction for unifying high-fidelity video generation with flexible instruction control and real-time deployment.
+
+
+ ## 2. Model Summary
+
+ ### Transformer-based VAE
+
+ - Variational autoencoder (VAE) with a transformer-based architecture, providing 8x spatial and 4x temporal compression (see the latent-shape sketch after this list).
+ - Fastest average decoding time and highly competitive reconstruction quality.
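+
+ As a quick orientation, the sketch below works out the latent grid implied by the compression factors above. It is illustrative only: the `(T, C, H, W)` layout and the helper function are assumptions, while the 16 latent channels match `z_chans` in `ckpt/vae/config.json` from this commit.
+
+ ```python
+ # Latent-shape arithmetic implied by 8x spatial / 4x temporal compression.
+ def latent_shape(num_frames: int, height: int, width: int, z_channels: int = 16):
+     t = num_frames // 4   # 4x temporal compression
+     h = height // 8       # 8x spatial compression
+     w = width // 8
+     return (t, z_channels, h, w)
+
+ print(latent_shape(num_frames=96, height=720, width=1280))  # -> (24, 16, 90, 160)
+ ```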
+
+ ### Auto-Regressive Denoising Algorithm
+
+ MAGI-1 is an autoregressive denoising video generation model that generates videos chunk by chunk instead of as a whole. Each chunk (24 frames) is denoised holistically, and the generation of the next chunk begins as soon as the current one reaches a certain level of denoising. This pipeline design enables concurrent processing of up to four chunks for efficient video generation.
+
+ ![auto-regressive denoising algorithm](figures/algorithm.png)
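+
+ To make the pipelining concrete, here is a toy scheduler that mimics the behaviour described above: every in-flight chunk takes one denoising step per iteration, a new chunk is admitted once the newest one is partially denoised, and at most four chunks are processed concurrently. This is an illustrative sketch under assumed constants (`TOTAL_STEPS`, `ADMIT_AFTER`) and a placeholder `denoise_step` callback, not the repository's implementation.
+
+ ```python
+ MAX_ACTIVE = 4     # up to four chunks in flight (as stated above)
+ TOTAL_STEPS = 64   # denoising steps per chunk (placeholder)
+ ADMIT_AFTER = 16   # steps the newest chunk must finish before the next one starts
+
+ def generate(num_chunks: int, denoise_step) -> None:
+     progress = {0: 0}                    # chunk index -> denoising steps completed
+     finished = 0
+     while finished < num_chunks:
+         for chunk, steps in list(progress.items()):
+             denoise_step(chunk=chunk, step=steps + 1)   # one step for every in-flight chunk
+             progress[chunk] = steps + 1
+         newest = max(progress)
+         if (len(progress) < MAX_ACTIVE and newest + 1 < num_chunks
+                 and progress[newest] >= ADMIT_AFTER):
+             progress[newest + 1] = 0     # newest chunk is clean enough; start the next one
+         oldest = min(progress)
+         if progress[oldest] >= TOTAL_STEPS:
+             del progress[oldest]         # oldest chunk fully denoised; emit/decode it
+             finished += 1
+             if not progress and finished < num_chunks:
+                 progress[oldest + 1] = 0  # keep the pipeline from stalling
+ ```
+
+ With these placeholder constants, chunk starts are staggered by 16 steps, so four chunks are being denoised simultaneously at steady state.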
+
+ ### Diffusion Model Architecture
+
+ MAGI-1 is built upon the Diffusion Transformer, incorporating several key innovations to enhance training efficiency and stability at scale. These advancements include Block-Causal Attention, Parallel Attention Block, QK-Norm and GQA, Sandwich Normalization in FFN, SwiGLU, and Softcap Modulation. For more details, please refer to the [technical report](https://static.magi.world/static/files/MAGI_1.pdf).
+ <div align="center">
+ <img src="figures/dit_architecture.png" alt="diffusion model architecture" width="500" />
+ </div>
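+
+ As one concrete example of these components, the snippet below builds a block-causal attention mask in the sense suggested by the chunk-wise design above: tokens attend to everything in their own chunk and in earlier chunks, but not to later chunks. This is an assumed reading for illustration, not the repository's attention code; a boolean mask of this form can be passed to `torch.nn.functional.scaled_dot_product_attention` as `attn_mask` (True means the position may be attended to).
+
+ ```python
+ import torch
+
+ def block_causal_mask(num_tokens: int, tokens_per_chunk: int) -> torch.Tensor:
+     chunk_id = torch.arange(num_tokens) // tokens_per_chunk
+     # mask[i, j] is True where token i may attend to token j,
+     # i.e. j belongs to the same chunk as i or to an earlier one.
+     return chunk_id[None, :] <= chunk_id[:, None]
+
+ mask = block_causal_mask(num_tokens=8, tokens_per_chunk=4)
+ # The first 4 tokens see only chunk 0; the last 4 see chunks 0 and 1.
+ ```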
+
+ ### Distillation Algorithm
+
+ We adopt a shortcut distillation approach that trains a single velocity-based model to support variable inference budgets. By enforcing a self-consistency constraint—equating one large step with two smaller steps—the model learns to approximate flow-matching trajectories across multiple step sizes. During training, step sizes are cyclically sampled from {64, 32, 16, 8}, and classifier-free guidance distillation is incorporated to preserve conditional alignment. This enables efficient inference with minimal loss in fidelity.
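+
+ The self-consistency constraint can be written down compactly. The sketch below assumes a velocity model `v(x, t, d)` conditioned on step size `d`, and trains one Euler step of size `2*d` to match the average of two consecutive steps of size `d`. The function names, the squared-error form, and reading the sampled set {64, 32, 16, 8} as step counts (i.e. d = 1/64, ..., 1/8) are assumptions; classifier-free guidance distillation is omitted.
+
+ ```python
+ import torch
+
+ def shortcut_consistency_loss(v, x, t, d):
+     # Teacher target: two small steps of size d, evaluated without gradients.
+     with torch.no_grad():
+         v1 = v(x, t, d)
+         x_mid = x + d * v1                 # Euler step to t + d
+         v2 = v(x_mid, t + d, d)
+         target = (v1 + v2) / 2             # average velocity over the 2*d interval
+     # Student prediction: one "shortcut" step of size 2*d from the same state.
+     pred = v(x, t, 2 * d)
+     return torch.mean((pred - target) ** 2)
+ ```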
+
+
+ ## 3. Model Zoo
+
+ We provide the pre-trained weights for MAGI-1, including the 24B and 4.5B models, as well as the corresponding distill and distill+quant models. The weight links are listed in the table below, followed by a short download sketch.
+
+ | Model | Link | Recommended Machine |
+ | ----------------------------- | ------------------------------------------------------------ | ------------------------------- |
+ | T5 | [T5](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/t5) | - |
+ | MAGI-1-VAE | [MAGI-1-VAE](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/vae) | - |
+ | MAGI-1-24B | [MAGI-1-24B](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/magi/24B_base) | H100/H800 \* 8 |
+ | MAGI-1-24B-distill | [MAGI-1-24B-distill](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/magi/24B_distill) | H100/H800 \* 8 |
+ | MAGI-1-24B-distill+fp8_quant | [MAGI-1-24B-distill+quant](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/magi/24B_distill_quant) | H100/H800 \* 4 or RTX 4090 \* 8 |
+ | MAGI-1-4.5B | MAGI-1-4.5B | RTX 4090 \* 1 |
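+
+ If you only need part of the repository, the weights can also be fetched programmatically with `huggingface_hub` (illustrative usage; the `allow_patterns` below simply mirror the `ckpt/` folder layout of this commit, so adjust them to the variant you want):
+
+ ```python
+ from huggingface_hub import snapshot_download
+
+ # Download the T5 encoder, the VAE, and the distilled 24B DiT weights only.
+ snapshot_download(
+     repo_id="sand-ai/MAGI-1",
+     local_dir="MAGI-1",
+     allow_patterns=[
+         "ckpt/t5/*",
+         "ckpt/vae/*",
+         "ckpt/magi/24B_distill/*",
+     ],
+ )
+ ```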
+
+ ## 4. Evaluation
+
+ ### In-house Human Evaluation
+
+ MAGI-1 achieves state-of-the-art performance among open-source models (surpassing Wan-2.1 and significantly outperforming Hailuo and HunyuanVideo), particularly excelling in instruction following and motion quality. This positions it as a strong potential competitor to closed-source commercial models such as Kling.
+
+ ![inhouse human evaluation](figures/inhouse_human_evaluation.png)
+
+ ### Physical Evaluation
+
+ Thanks to the natural advantages of its autoregressive architecture, MAGI-1 achieves far superior precision in predicting physical behavior through video continuation, significantly outperforming all existing models.
+
+ | Model | Phys. IQ Score ↑ | Spatial IoU ↑ | Spatiotemporal IoU ↑ | Weighted Spatial IoU ↑ | MSE ↓ |
+ |----------------|------------------|---------------|-------------------|-------------------------|--------|
+ | **V2V Models** | | | | | |
+ | **Magi (V2V)** | **56.02** | **0.367** | **0.270** | **0.304** | **0.005** |
+ | VideoPoet (V2V)| 29.50 | 0.204 | 0.164 | 0.137 | 0.010 |
+ | **I2V Models** | | | | | |
+ | **Magi (I2V)** | **30.23** | **0.203** | **0.151** | **0.154** | **0.012** |
+ | Kling1.6 (I2V) | 23.64 | 0.197 | 0.086 | 0.144 | 0.025 |
+ | VideoPoet (I2V)| 20.30 | 0.141 | 0.126 | 0.087 | 0.012 |
+ | Gen 3 (I2V) | 22.80 | 0.201 | 0.115 | 0.116 | 0.015 |
+ | Wan2.1 (I2V) | 20.89 | 0.153 | 0.100 | 0.112 | 0.023 |
+ | Sora (I2V) | 10.00 | 0.138 | 0.047 | 0.063 | 0.030 |
+ | **GroundTruth**| **100.0** | **0.678** | **0.535** | **0.577** | **0.002** |
+
+
+ ## 5. How to Run
+
+ ### Environment Preparation
+
+ We provide two ways to run MAGI-1, with the Docker environment being the recommended option.
+
+ **Run with Docker Environment (Recommended)**
+
+ ```bash
+ docker pull sandai/magi:latest
+
+ docker run -it --gpus all --privileged --shm-size=32g --name magi --net=host --ipc=host --ulimit memlock=-1 --ulimit stack=6710886 sandai/magi:latest /bin/bash
+ ```
+
+ **Run with Source Code**
+
+ ```bash
+ # Create a new environment
+ conda create -n magi python==3.10.12
+
+ # Install pytorch
+ conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.4 -c pytorch -c nvidia
+
+ # Install other dependencies
+ pip install -r requirements.txt
+
+ # Install ffmpeg
+ conda install -c conda-forge ffmpeg=4.4
+
+ # Install MagiAttention; for more information, please refer to https://github.com/SandAI-org/MagiAttention
+ git clone git@github.com:SandAI-org/MagiAttention.git
+ cd MagiAttention
+ git submodule update --init --recursive
+ pip install --no-build-isolation .
+ ```
+
+
+ ### Inference Command
+
+ To run the `MagiPipeline`, you can control the input and output by modifying the parameters in the `example/24B/run.sh` or `example/4.5B/run.sh` script. Below is an explanation of the key parameters:
+
+ #### Parameter Descriptions
+
+ - `--config_file`: Specifies the path to the configuration file, which contains model configuration parameters, e.g., `example/24B/24B_config.json`.
+ - `--mode`: Specifies the mode of operation. Available options are:
+   - `t2v`: Text to Video
+   - `i2v`: Image to Video
+   - `v2v`: Video to Video
+ - `--prompt`: The text prompt used for video generation, e.g., `"Good Boy"`.
+ - `--image_path`: Path to the image file, used only in `i2v` mode.
+ - `--prefix_video_path`: Path to the prefix video file, used only in `v2v` mode.
+ - `--output_path`: Path where the generated video file will be saved.
+
+ #### Bash Script
+
+ ```bash
+ #!/bin/bash
+ # Run 24B MAGI-1 model
+ bash example/24B/run.sh
+
+ # Run 4.5B MAGI-1 model
+ bash example/4.5B/run.sh
+ ```
+
+ #### Customizing Parameters
+
+ You can modify the parameters in `run.sh` as needed. For example:
+
+ - To use the Image to Video mode (`i2v`), set `--mode` to `i2v` and provide `--image_path`:
+   ```bash
+   --mode i2v \
+   --image_path example/assets/image.jpeg \
+   ```
+
+ - To use the Video to Video mode (`v2v`), set `--mode` to `v2v` and provide `--prefix_video_path`:
+   ```bash
+   --mode v2v \
+   --prefix_video_path example/assets/prefix_video.mp4 \
+   ```
+
+ By adjusting these parameters, you can flexibly control the input and output to meet different requirements.
+
+ ### Some Useful Configs (for config.json)
+
+ | Config | Help |
+ | -------------- | ------------------------------------------------------------ |
+ | seed | Random seed used for video generation |
+ | video_size_h | Height of the generated video |
+ | video_size_w | Width of the generated video |
+ | num_frames | Controls the duration of the generated video |
+ | fps | Frames per second; 4 video frames correspond to 1 latent_frame |
+ | cfg_number | The base model uses cfg_number=2; the distill and quant models use cfg_number=1 |
+ | load | Directory containing a model checkpoint |
+ | t5_pretrained | Path to the pretrained T5 model |
+ | vae_pretrained | Path to the pretrained VAE model |
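+
+ For reference, the snippet below assembles these keys into a config fragment and writes it to disk. The values and the output filename are placeholders (the configuration actually shipped with the examples is `example/24B/24B_config.json`, which may contain additional fields); only the key names come from the table above.
+
+ ```python
+ import json
+
+ config = {
+     "seed": 1234,
+     "video_size_h": 720,
+     "video_size_w": 1280,
+     "num_frames": 96,   # 96 video frames -> 24 latent frames (4 video frames per latent_frame)
+     "fps": 24,
+     "cfg_number": 1,    # 1 for the distill / quant weights, 2 for the base model
+     "load": "ckpt/magi/24B_distill/inference_weight.distill",
+     "t5_pretrained": "ckpt/t5/t5-v1_1-xxl",
+     "vae_pretrained": "ckpt/vae",
+ }
+
+ with open("my_config.json", "w") as f:
+     json.dump(config, f, indent=2)
+ ```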
+
+
+ ## 6. License
+
+ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
+
+ ## 7. Citation
+
+ If you find our code or model useful in your research, please cite:
+
+ ```bibtex
+ @misc{magi1,
+   title={MAGI-1: Autoregressive Video Generation at Scale},
+   author={Sand-AI},
+   year={2025},
+   url={https://static.magi.world/static/files/MAGI_1.pdf},
+ }
+ ```
+
+ ## 8. Contact
+
+ If you have any questions, please feel free to raise an issue or contact us at [[email protected]]([email protected]).
ckpt/magi/24B_base/inference_weight/model-00001-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e55c7996a8f517349a9c11b702f4217d27b8dc2bb4bbb99ab097dd66872623c
+ size 4988160184
ckpt/magi/24B_base/inference_weight/model-00002-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7abb141cd258434d91ed02c4a95076da7262ce3fe9f3a49004e457818d0de1a0
+ size 7247764000
ckpt/magi/24B_base/inference_weight/model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cefe5c35cbf553d714c04f3e4b017ccf93a472c9dfa35763e09f3626ca7822b
+ size 19327358992
ckpt/magi/24B_base/inference_weight/model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c063bb74aae9a4982f69a7c72dd4bcf84d6ecece1fa74657adab47a7bef63081
+ size 9663682528
ckpt/magi/24B_base/inference_weight/model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:151c2f417ae3ce95e9f73604313c830f2035de69d131cb39ccb4b5a188415568
+ size 3623890200
ckpt/magi/24B_base/inference_weight/model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f83d1a2cf25923f6f38aad7f4a611dc33c972c8b0cb3b2151e0aeebed363d89a
+ size 3028420248
ckpt/magi/24B_base/inference_weight/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/magi/24B_distill/inference_weight.distill/model-00001-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:351f466d82f932b95d56f2c81e5a3310a45ce90f0d862b65c7a3ed74678133a6
+ size 4988160184
ckpt/magi/24B_distill/inference_weight.distill/model-00002-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d68ac40a3236ad1145f05116734af056ce9fc1dc93f1f4e7d530a67af1bad9c
+ size 7247764000
ckpt/magi/24B_distill/inference_weight.distill/model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f19da7622e331aeb937da8c388d55ac4aa518ab0d91ce4fc7091fb2b5787187
+ size 19327358992
ckpt/magi/24B_distill/inference_weight.distill/model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49304615535a77c1ebeb8fb0468578f710886eae6591ca0e70e50516d8813233
+ size 9663682528
ckpt/magi/24B_distill/inference_weight.distill/model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2b97e49e10350d0080861d2b2e394f555ba7ea9f00a208b8d8aa2104739bdae
+ size 4831856536
ckpt/magi/24B_distill/inference_weight.distill/model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c6a7239aebb1dd5213c1d6183830fc4c46b0630411ededa21092fe32ac609e2
+ size 1820453696
ckpt/magi/24B_distill/inference_weight.distill/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ffecb6e1191507fbe9d57d06cc274b3c50590c548f7a453bcdd69172275062c
+ size 9836542920
ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f7038c048711a35071b110c46dbb1ed55a9937213c5712171423cff490b9286
+ size 9973482972
ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d16af830c3357dc84e22ee00e8b898049ecdfbb8c2e6d7eb9adcd5e3f3c6eae9
+ size 6664159332
ckpt/magi/24B_distill_quant/inference_weight.fp8.distill/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpt/t5/t5-v1_1-xxl/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "google/t5-v1_1-xxl",
+   "architectures": [
+     "T5EncoderModel"
+   ],
+   "d_ff": 10240,
+   "d_kv": 64,
+   "d_model": 4096,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 24,
+   "num_heads": 64,
+   "num_layers": 24,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.21.1",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
ckpt/t5/t5-v1_1-xxl/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f71ad0624095dae788b1023081dda1b4040bd24f7244a5b5b46eebc09825839
+ size 9452285635
ckpt/t5/t5-v1_1-xxl/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f68f80678299ac59f69b3550ebd47b966571920d8f9e71f42ab61fabaaed868
+ size 9597031749
ckpt/t5/t5-v1_1-xxl/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,227 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 19575627776
4
+ },
5
+ "weight_map": {
6
+ "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
7
+ "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
8
+ "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
9
+ "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
10
+ "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
11
+ "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
12
+ "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
13
+ "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
14
+ "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
15
+ "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
16
+ "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
17
+ "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
18
+ "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
19
+ "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
20
+ "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
21
+ "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
22
+ "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
23
+ "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
24
+ "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
25
+ "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
26
+ "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
27
+ "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
28
+ "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
29
+ "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
30
+ "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
31
+ "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
32
+ "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
33
+ "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
34
+ "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
35
+ "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
36
+ "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
37
+ "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
38
+ "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
39
+ "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
40
+ "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
41
+ "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
42
+ "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
43
+ "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
44
+ "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
45
+ "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
46
+ "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
47
+ "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
48
+ "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
49
+ "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
50
+ "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
51
+ "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
52
+ "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
53
+ "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
54
+ "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
55
+ "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
56
+ "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
57
+ "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
58
+ "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
59
+ "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
60
+ "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
61
+ "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
62
+ "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
63
+ "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
64
+ "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
65
+ "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
66
+ "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
67
+ "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
68
+ "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
69
+ "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
70
+ "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
71
+ "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
72
+ "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
73
+ "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
74
+ "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
75
+ "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
76
+ "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
77
+ "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
78
+ "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
79
+ "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
80
+ "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
81
+ "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
82
+ "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
83
+ "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
84
+ "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
85
+ "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
86
+ "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
87
+ "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
88
+ "encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
89
+ "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
90
+ "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
91
+ "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
92
+ "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
93
+ "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
94
+ "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
95
+ "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
96
+ "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
97
+ "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
98
+ "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
99
+ "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
100
+ "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
101
+ "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
102
+ "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
103
+ "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
104
+ "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
105
+ "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
106
+ "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
107
+ "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
108
+ "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
109
+ "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
110
+ "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
111
+ "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
112
+ "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
113
+ "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
114
+ "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
115
+ "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
116
+ "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
117
+ "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
118
+ "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
119
+ "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
120
+ "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
121
+ "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
122
+ "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
123
+ "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
124
+ "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
125
+ "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
126
+ "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
127
+ "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
128
+ "encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
129
+ "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
130
+ "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
131
+ "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
132
+ "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
133
+ "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
134
+ "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
135
+ "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
136
+ "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
137
+ "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
138
+ "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
139
+ "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
140
+ "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
141
+ "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
142
+ "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
143
+ "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
144
+ "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
145
+ "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
146
+ "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
147
+ "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
148
+ "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
149
+ "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
150
+ "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
151
+ "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
152
+ "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
153
+ "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
154
+ "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
155
+ "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
156
+ "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
157
+ "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
158
+ "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
159
+ "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
160
+ "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
161
+ "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
162
+ "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
163
+ "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
164
+ "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
165
+ "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
166
+ "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
167
+ "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
168
+ "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
169
+ "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
170
+ "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
171
+ "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
172
+ "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
173
+ "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
174
+ "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
175
+ "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
176
+ "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
177
+ "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
178
+ "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
179
+ "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
180
+ "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
181
+ "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
182
+ "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
183
+ "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
184
+ "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
185
+ "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
186
+ "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
187
+ "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
188
+ "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
189
+ "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
190
+ "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
191
+ "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
192
+ "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
193
+ "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
194
+ "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
195
+ "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
196
+ "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
197
+ "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
198
+ "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
199
+ "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
200
+ "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
201
+ "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
202
+ "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
203
+ "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
204
+ "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
205
+ "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
206
+ "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
207
+ "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
208
+ "encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
209
+ "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
210
+ "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
211
+ "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
212
+ "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
213
+ "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
214
+ "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
215
+ "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
216
+ "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
217
+ "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
218
+ "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
219
+ "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
220
+ "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
221
+ "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
222
+ "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
223
+ "encoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
224
+ "encoder.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
225
+ "shared.weight": "pytorch_model-00001-of-00002.bin"
226
+ }
227
+ }
ckpt/t5/t5-v1_1-xxl/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
ckpt/t5/t5-v1_1-xxl/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+ size 791656
ckpt/t5/t5-v1_1-xxl/t5-v1_1-xxl/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe6002bb05bfd4d2e2dc79468b6320aff4ae6798589bb88129f64294a883b558
+ size 5019090944
ckpt/t5/t5-v1_1-xxl/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "name_or_path": "t5-small"}
ckpt/vae/config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "_class_name": "ViTVAE",
+   "_diffusers_version": "0.28.2",
+   "ddconfig": {
+     "conv_last_layer": true,
+     "depth": 24,
+     "double_z": true,
+     "embed_dim": 1024,
+     "in_chans": 3,
+     "ln_in_attn": true,
+     "mlp_ratio": 4,
+     "norm_code": false,
+     "num_heads": 16,
+     "patch_length": 4,
+     "patch_size": 8,
+     "qkv_bias": true,
+     "video_length": 16,
+     "video_size": 256,
+     "z_chans": 16
+   },
+   "model_type": "vit"
+ }
ckpt/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5092a7bcd112b7a743235bddad17d30b497da48b70eae51c5340bae8294b761
+ size 2455072868
figures/algorithm.png ADDED

Git LFS Details

  • SHA256: 7a44e9b01116d3207d8e190119464e1e49cf62d4ad67acd30767bc6984724e95
  • Pointer size: 132 Bytes
  • Size of remote file: 3.56 MB
figures/dit_architecture.png ADDED

Git LFS Details

  • SHA256: 1acbcc40f77b3167246ed1a734c9a3aa8566d7035765ab1bede654044443bd61
  • Pointer size: 131 Bytes
  • Size of remote file: 248 kB
figures/inhouse_human_evaluation.png ADDED

Git LFS Details

  • SHA256: 657aa4a189f7db325a5acc967fad6b40ad22d55855ecbe038f27235abf9be3aa
  • Pointer size: 131 Bytes
  • Size of remote file: 304 kB
figures/logo_black.png ADDED