Spaces:

openfree
/

ginigen-sora

Paused

App Files Files Community

Sapir committed on Oct 8, 2024

Commit

d699d2b

1 Parent(s): bebbcd0

CausalVideoAutoencoder: made neater load_ckpt.

Browse files

Files changed (3) hide show

xora/examples/image_to_video.py +4 -4
xora/examples/text_to_video.py +5 -5
xora/models/autoencoders/causal_video_autoencoder.py +22 -31

xora/examples/image_to_video.py CHANGED Viewed

@@ -19,12 +19,12 @@ vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
 vae_config_path = vae_dir / "config.json"
 with open(vae_config_path, 'r') as f:
     vae_config = json.load(f)
 vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-vae = CausalVideoAutoencoder.from_pretrained_conf(
-    config=vae_config,
     state_dict=vae_state_dict,
-    torch_dtype=torch.bfloat16
-).cuda()
 # Load UNet (Transformer) from separate mode
 unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"

 vae_config_path = vae_dir / "config.json"
 with open(vae_config_path, 'r') as f:
     vae_config = json.load(f)
+vae = CausalVideoAutoencoder.from_config(vae_config)
 vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+vae.load_state_dict(
     state_dict=vae_state_dict,
+)
+vae = vae.cuda().to(torch.bfloat16)
 # Load UNet (Transformer) from separate mode
 unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"

xora/examples/text_to_video.py CHANGED Viewed

@@ -10,7 +10,7 @@ import safetensors.torch
 import json
 # Paths for the separate mode directories
-separate_dir = Path("/opt/models/xora-txt2video")
 unet_dir = separate_dir / 'unet'
 vae_dir = separate_dir / 'vae'
 scheduler_dir = separate_dir / 'scheduler'
@@ -20,12 +20,12 @@ vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
 vae_config_path = vae_dir / "config.json"
 with open(vae_config_path, 'r') as f:
     vae_config = json.load(f)
 vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-vae = CausalVideoAutoencoder.from_pretrained_conf(
-    config=vae_config,
     state_dict=vae_state_dict,
-    torch_dtype=torch.bfloat16
-).cuda()
 # Load UNet (Transformer) from separate mode
 unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"

 import json
 # Paths for the separate mode directories
+separate_dir = Path("/opt/models/xora-img2video")
 unet_dir = separate_dir / 'unet'
 vae_dir = separate_dir / 'vae'
 scheduler_dir = separate_dir / 'scheduler'
 vae_config_path = vae_dir / "config.json"
 with open(vae_config_path, 'r') as f:
     vae_config = json.load(f)
+vae = CausalVideoAutoencoder.from_config(vae_config)
 vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+vae.load_state_dict(
     state_dict=vae_state_dict,
+)
+vae = vae.cuda().to(torch.bfloat16)
 # Load UNet (Transformer) from separate mode
 unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"

xora/models/autoencoders/causal_video_autoencoder.py CHANGED Viewed

@@ -41,35 +41,6 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
         return video_vae
-    @classmethod
-    def from_pretrained_conf(cls, config, state_dict, torch_dtype=torch.float32):
-        video_vae = cls.from_config(config)
-        video_vae.to(torch_dtype)
-        per_channel_statistics_prefix = "per_channel_statistics."
-        ckpt_state_dict = {
-            key: value
-            for key, value in state_dict.items()
-            if not key.startswith(per_channel_statistics_prefix)
-        }
-        video_vae.load_state_dict(ckpt_state_dict)
-        data_dict = {
-            key.removeprefix(per_channel_statistics_prefix): value
-            for key, value in state_dict.items()
-            if key.startswith(per_channel_statistics_prefix)
-        }
-        if len(data_dict) > 0:
-            video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
-            video_vae.register_buffer(
-                "mean_of_means",
-                data_dict.get(
-                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
-                ),
-            )
-        return video_vae
     @staticmethod
     def from_config(config):
         assert config["_class_name"] == "CausalVideoAutoencoder", "config must have _class_name=CausalVideoAutoencoder"
@@ -155,6 +126,13 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
         return json.dumps(self.config.__dict__)
     def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
         model_keys = set(name for name, _ in self.named_parameters())
         key_mapping = {
@@ -162,9 +140,8 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             "downsamplers.0": "downsample",
             "upsamplers.0": "upsample",
         }
         converted_state_dict = {}
-        for key, value in state_dict.items():
             for k, v in key_mapping.items():
                 key = key.replace(k, v)
@@ -176,6 +153,20 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
         super().load_state_dict(converted_state_dict, strict=strict)
     def last_layer(self):
         if hasattr(self.decoder, "conv_out"):
             if isinstance(self.decoder.conv_out, nn.Sequential):

         return video_vae
     @staticmethod
     def from_config(config):
         assert config["_class_name"] == "CausalVideoAutoencoder", "config must have _class_name=CausalVideoAutoencoder"
         return json.dumps(self.config.__dict__)
     def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
+        per_channel_statistics_prefix = "per_channel_statistics."
+        ckpt_state_dict = {
+            key: value
+            for key, value in state_dict.items()
+            if not key.startswith(per_channel_statistics_prefix)
+        }
         model_keys = set(name for name, _ in self.named_parameters())
         key_mapping = {
             "downsamplers.0": "downsample",
             "upsamplers.0": "upsample",
         }
         converted_state_dict = {}
+        for key, value in ckpt_state_dict.items():
             for k, v in key_mapping.items():
                 key = key.replace(k, v)
         super().load_state_dict(converted_state_dict, strict=strict)
+        data_dict = {
+            key.removeprefix(per_channel_statistics_prefix): value
+            for key, value in state_dict.items()
+            if key.startswith(per_channel_statistics_prefix)
+        }
+        if len(data_dict) > 0:
+            self.register_buffer("std_of_means", data_dict["std-of-means"])
+            self.register_buffer(
+                "mean_of_means",
+                data_dict.get(
+                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
+                ),
+            )
     def last_layer(self):
         if hasattr(self.decoder, "conv_out"):
             if isinstance(self.decoder.conv_out, nn.Sequential):