Text2Video-ZeroTrkn

Runtime error

App Files Community

lev1 committed on Apr 6, 2023

Commit

687b293

1 Parent(s): 9adc565

Enabling Token Merging for fast inference

Browse files

Files changed (10) hide show

app.py +4 -3
app_canny.py +6 -1
app_canny_db.py +6 -1
app_pix2pix_video.py +9 -3
app_pose.py +6 -1
app_text_to_video.py +32 -9
gradio_utils.py +7 -7
model.py +30 -7
requirements.txt +1 -0
text_to_video_pipeline.py +15 -60

app.py CHANGED Viewed

@@ -23,7 +23,7 @@ with gr.Blocks(css='style.css') as demo:
         """
         <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
         <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
-            Text2Video-Zero
         </h1>
         <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
         Levon Khachatryan<sup>1*</sup>, Andranik Movsisyan<sup>1*</sup>, Vahram Tadevosyan<sup>1*</sup>, Roberto Henschel<sup>1*</sup>, Zhangyang Wang<sup>1,2</sup>, Shant Navasardyan<sup>1</sup>
@@ -62,7 +62,8 @@ with gr.Blocks(css='style.css') as demo:
         create_demo_canny(model)
     with gr.Tab('Edge Conditional and Dreambooth Specialized'):
         create_demo_canny_db(model)
     gr.HTML(
         """
         <div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
@@ -90,5 +91,5 @@ if on_huggingspace:
     demo.launch(debug=True)
 else:
     _, _, link = demo.queue(api_open=False).launch(
-        file_directories=['temporal'], share=args.public_access or on_huggingspace)
     print(link)

         """
         <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
         <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
+            <a href="https://github.com/Picsart-AI-Research/Text2Video-Zero" style="color:blue;">Text2Video-Zero</a>
         </h1>
         <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
         Levon Khachatryan<sup>1*</sup>, Andranik Movsisyan<sup>1*</sup>, Vahram Tadevosyan<sup>1*</sup>, Roberto Henschel<sup>1*</sup>, Zhangyang Wang<sup>1,2</sup>, Shant Navasardyan<sup>1</sup>
         create_demo_canny(model)
     with gr.Tab('Edge Conditional and Dreambooth Specialized'):
         create_demo_canny_db(model)
+    '''
+    '''
     gr.HTML(
         """
         <div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
     demo.launch(debug=True)
 else:
     _, _, link = demo.queue(api_open=False).launch(
+        file_directories=['temporal'], share=args.public_access)
     print(link)

app_canny.py CHANGED Viewed

@@ -47,7 +47,11 @@ def create_demo(model: Model):
                     watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
                                          "None"], label="Watermark", value='Picsart AI Research')
                     chunk_size = gr.Slider(
-                        label="Chunk size", minimum=2, maximum=16, value=12 if on_huggingspace else 8, step=1, visible=not on_huggingspace)
             with gr.Column():
                 result = gr.Video(label="Generated Video").style(height="auto")
@@ -56,6 +60,7 @@ def create_demo(model: Model):
             prompt,
             chunk_size,
             watermark,
         ]
         gr.Examples(examples=examples,

                     watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
                                          "None"], label="Watermark", value='Picsart AI Research')
                     chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage.")
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference).")
             with gr.Column():
                 result = gr.Video(label="Generated Video").style(height="auto")
             prompt,
             chunk_size,
             watermark,
+            merging_ratio,
         ]
         gr.Examples(examples=examples,

app_canny_db.py CHANGED Viewed

@@ -51,7 +51,11 @@ def create_demo(model: Model):
                     watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
                                          "None"], label="Watermark", value='Picsart AI Research')
                     chunk_size = gr.Slider(
-                        label="Chunk size", minimum=2, maximum=16, value=12 if on_huggingspace else 8, step=1, visible=not on_huggingspace)
             with gr.Column():
                 result = gr.Image(label="Generated Video").style(height=400)
@@ -79,6 +83,7 @@ def create_demo(model: Model):
             prompt,
             chunk_size,
             watermark,
         ]
         gr.Examples(examples=examples,

                     watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
                                          "None"], label="Watermark", value='Picsart AI Research')
                     chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage.")
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference).")
             with gr.Column():
                 result = gr.Image(label="Generated Video").style(height=400)
             prompt,
             chunk_size,
             watermark,
+            merging_ratio,
         ]
         gr.Examples(examples=examples,

app_pix2pix_video.py CHANGED Viewed

@@ -48,9 +48,10 @@ def create_demo(model: Model):
                                                  value=512,
                                                  step=64)
                     seed = gr.Slider(label='Seed',
-                                     minimum=0,
                                      maximum=65536,
                                      value=0,
                                      step=1)
                     image_guidance = gr.Slider(label='Image guidance scale',
                                                minimum=0.5,
@@ -73,7 +74,11 @@ def create_demo(model: Model):
                                         value=-1,
                                         step=1)
                     chunk_size = gr.Slider(
-                        label="Chunk size", minimum=2, maximum=16, value=12 if on_huggingspace else 8, step=1, visible=not on_huggingspace)
             with gr.Column():
                 result = gr.Video(label='Output', show_label=True)
         inputs = [
@@ -86,7 +91,8 @@ def create_demo(model: Model):
             end_t,
             out_fps,
             chunk_size,
-            watermark
         ]
         gr.Examples(examples=examples,

                                                  value=512,
                                                  step=64)
                     seed = gr.Slider(label='Seed',
+                                     minimum=-1,
                                      maximum=65536,
                                      value=0,
+                                     info="-1 for random seed on each run. Otherwise the seed will be fixed",
                                      step=1)
                     image_guidance = gr.Slider(label='Image guidance scale',
                                                minimum=0.5,
                                         value=-1,
                                         step=1)
                     chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage.")
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference).")
             with gr.Column():
                 result = gr.Video(label='Output', show_label=True)
         inputs = [
             end_t,
             out_fps,
             chunk_size,
+            watermark,
+            merging_ratio
         ]
         gr.Examples(examples=examples,

app_pose.py CHANGED Viewed

@@ -35,7 +35,11 @@ def create_demo(model: Model):
                     watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
                                          "None"], label="Watermark", value='Picsart AI Research')
                     chunk_size = gr.Slider(
-                        label="Chunk size", minimum=2, maximum=16, value=12 if on_huggingspace else 8, step=1, visible=not on_huggingspace)
             with gr.Column():
                 result = gr.Image(label="Generated Video")
@@ -48,6 +52,7 @@ def create_demo(model: Model):
             prompt,
             chunk_size,
             watermark,
         ]
         gr.Examples(examples=examples,

                     watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
                                          "None"], label="Watermark", value='Picsart AI Research')
                     chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage.")
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference).")
             with gr.Column():
                 result = gr.Image(label="Generated Video")
             prompt,
             chunk_size,
             watermark,
+            merging_ratio,
         ]
         gr.Examples(examples=examples,

app_text_to_video.py CHANGED Viewed

@@ -39,6 +39,7 @@ def create_demo(model: Model):
                     label="Model",
                     choices=get_model_list(),
                     value="dreamlike-art/dreamlike-photoreal-2.0",
                 )
                 prompt = gr.Textbox(label='Prompt')
                 run_button = gr.Button(label='Run')
@@ -52,21 +53,41 @@ def create_demo(model: Model):
                     else:
                         video_length = gr.Number(
                             label="Video length", value=8, precision=0)
-                    chunk_size = gr.Slider(
-                        label="Chunk size", minimum=2, maximum=16, value=12 if on_huggingspace else 8, step=1, visible=not on_huggingspace)
                     motion_field_strength_x = gr.Slider(
-                        label='Global Translation $\delta_{x}$', minimum=-20, maximum=20, value=12, step=1)
                     motion_field_strength_y = gr.Slider(
-                        label='Global Translation $\delta_{y}$', minimum=-20, maximum=20, value=12, step=1)
                     t0 = gr.Slider(label="Timestep t0", minimum=0,
-                                   maximum=49, value=44, step=1)
-                    t1 = gr.Slider(label="Timestep t1", minimum=0,
-                                   maximum=49, value=47, step=1)
-                    n_prompt = gr.Textbox(
-                        label="Optional Negative Prompt", value='')
             with gr.Column():
                 result = gr.Video(label="Generated Video")
@@ -81,6 +102,8 @@ def create_demo(model: Model):
             chunk_size,
             video_length,
             watermark,
         ]
         gr.Examples(examples=examples,

                     label="Model",
                     choices=get_model_list(),
                     value="dreamlike-art/dreamlike-photoreal-2.0",
                 )
                 prompt = gr.Textbox(label='Prompt')
                 run_button = gr.Button(label='Run')
                     else:
                         video_length = gr.Number(
                             label="Video length", value=8, precision=0)
+                    n_prompt = gr.Textbox(
+                        label="Optional Negative Prompt", value='')
+                    seed = gr.Slider(label='Seed',
+                                     info="-1 for random seed on each run. Otherwise, the seed will be fixed.",
+                                     minimum=-1,
+                                     maximum=65536,
+                                     value=0,
+                                     step=1)
                     motion_field_strength_x = gr.Slider(
+                        label='Global Translation $\\delta_{x}$', minimum=-20, maximum=20,
+                        value=12,
+                        step=1)
                     motion_field_strength_y = gr.Slider(
+                        label='Global Translation $\\delta_{y}$', minimum=-20, maximum=20,
+                        value=12,
+                        step=1)
                     t0 = gr.Slider(label="Timestep t0", minimum=0,
+                                   maximum=47, value=44, step=1,
+                                   info="Perform DDPM steps from t0 to t1. The larger the gap between t0 and t1, the more variance between the frames. Ensure t0 < t1 ",
+                                   )
+                    t1 = gr.Slider(label="Timestep t1", minimum=1,
+                                   info="Perform DDPM steps from t0 to t1. The larger the gap between t0 and t1, the more variance between the frames. Ensure t0 < t1",
+                                   maximum=48, value=47, step=1)
+                    chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage."
+                    )
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference)."
+                    )
             with gr.Column():
                 result = gr.Video(label="Generated Video")
             chunk_size,
             video_length,
             watermark,
+            merging_ratio,
+            seed,
         ]
         gr.Examples(examples=examples,

gradio_utils.py CHANGED Viewed

@@ -8,19 +8,19 @@ def edge_path_to_video_path(edge_path):
     vid_name = edge_path.split("/")[-1]
     if vid_name == "butterfly.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/butterfly.mp4"
     elif vid_name == "deer.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/deer.mp4"
     elif vid_name == "fox.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/fox.mp4"
     elif vid_name == "girl_dancing.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/girl_dancing.mp4"
     elif vid_name == "girl_turning.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/girl_turning.mp4"
     elif vid_name == "halloween.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/halloween.mp4"
     elif vid_name == "santa.mp4":
-        video_path = "__assets__/canny_videos_mp4_2fps/santa.mp4"
     assert os.path.isfile(video_path)
     return video_path

     vid_name = edge_path.split("/")[-1]
     if vid_name == "butterfly.mp4":
+        video_path = "__assets__/canny_videos_mp4/butterfly.mp4"
     elif vid_name == "deer.mp4":
+        video_path = "__assets__/canny_videos_mp4/deer.mp4"
     elif vid_name == "fox.mp4":
+        video_path = "__assets__/canny_videos_mp4/fox.mp4"
     elif vid_name == "girl_dancing.mp4":
+        video_path = "__assets__/canny_videos_mp4/girl_dancing.mp4"
     elif vid_name == "girl_turning.mp4":
+        video_path = "__assets__/canny_videos_mp4/girl_turning.mp4"
     elif vid_name == "halloween.mp4":
+        video_path = "__assets__/canny_videos_mp4/halloween.mp4"
     elif vid_name == "santa.mp4":
+        video_path = "__assets__/canny_videos_mp4/santa.mp4"
     assert os.path.isfile(video_path)
     return video_path

model.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from enum import Enum
 import gc
 import numpy as np
 import torch
 from diffusers import StableDiffusionInstructPix2PixPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UNet2DConditionModel
@@ -45,6 +45,7 @@ class Model:
         self.model_type = None
         self.states = {}
     def set_model(self, model_type: ModelType, model_id: str, **kwargs):
         if self.pipe is not None:
@@ -55,6 +56,7 @@ class Model:
         self.pipe = self.pipe_dict[model_type].from_pretrained(
             model_id, safety_checker=safety_checker, **kwargs).to(self.device).to(self.dtype)
         self.model_type = model_type
     def inference_chunk(self, frame_ids, **kwargs):
         if self.pipe is None:
@@ -80,6 +82,13 @@ class Model:
     def inference(self, split_to_chunks=False, chunk_size=8, **kwargs):
         if self.pipe is None:
             return
         seed = kwargs.pop('seed', 0)
         if seed < 0:
             seed = self.generator.seed()
@@ -116,6 +125,7 @@ class Model:
             result = np.concatenate(result)
             return result
         else:
             return self.pipe(prompt=prompt, negative_prompt=negative_prompt, generator=self.generator, **kwargs).images
     def process_controlnet_canny(self,
@@ -123,6 +133,7 @@ class Model:
                                  prompt,
                                  chunk_size=8,
                                  watermark='Picsart AI Research',
                                  num_inference_steps=20,
                                  controlnet_conditioning_scale=1.0,
                                  guidance_scale=9.0,
@@ -133,6 +144,7 @@ class Model:
                                  resolution=512,
                                  use_cf_attn=True,
                                  save_path=None):
         video_path = gradio_utils.edge_path_to_video_path(video_path)
         if self.model_type != ModelType.ControlNetCanny:
             controlnet = ControlNetModel.from_pretrained(
@@ -173,6 +185,7 @@ class Model:
                                 output_type='numpy',
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
                                 )
         return utils.create_video(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
@@ -181,6 +194,7 @@ class Model:
                                 prompt,
                                 chunk_size=8,
                                 watermark='Picsart AI Research',
                                 num_inference_steps=20,
                                 controlnet_conditioning_scale=1.0,
                                 guidance_scale=9.0,
@@ -189,6 +203,7 @@ class Model:
                                 resolution=512,
                                 use_cf_attn=True,
                                 save_path=None):
         video_path = gradio_utils.motion_to_video_path(video_path)
         if self.model_type != ModelType.ControlNetPose:
             controlnet = ControlNetModel.from_pretrained(
@@ -232,6 +247,7 @@ class Model:
                                 output_type='numpy',
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
                                 )
         return utils.create_gif(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
@@ -241,6 +257,7 @@ class Model:
                                     prompt,
                                     chunk_size=8,
                                     watermark='Picsart AI Research',
                                     num_inference_steps=20,
                                     controlnet_conditioning_scale=1.0,
                                     guidance_scale=9.0,
@@ -251,6 +268,7 @@ class Model:
                                     resolution=512,
                                     use_cf_attn=True,
                                     save_path=None):
         db_path = gradio_utils.get_model_from_db_selection(db_path)
         video_path = gradio_utils.get_video_from_canny_selection(video_path)
         # Load db and controlnet weights
@@ -295,6 +313,7 @@ class Model:
                                 output_type='numpy',
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
                                 )
         return utils.create_gif(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
@@ -309,8 +328,10 @@ class Model:
                         out_fps=-1,
                         chunk_size=8,
                         watermark='Picsart AI Research',
                         use_cf_attn=True,
                         save_path=None,):
         if self.model_type != ModelType.Pix2Pix_Video:
             self.set_model(ModelType.Pix2Pix_Video,
                            model_id="timbrooks/instruct-pix2pix")
@@ -330,6 +351,7 @@ class Model:
                                 image_guidance_scale=image_guidance_scale,
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
                                 )
         return utils.create_video(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
@@ -344,17 +366,18 @@ class Model:
                            chunk_size=8,
                            video_length=8,
                            watermark='Picsart AI Research',
-                           inject_noise_to_warp=False,
                            resolution=512,
-                           seed=-1,
                            fps=2,
                            use_cf_attn=True,
                            use_motion_field=True,
                            smooth_bg=False,
                            smooth_bg_strength=0.4,
                            path=None):
-        if self.model_type != ModelType.Text2Video:
             unet = UNet2DConditionModel.from_pretrained(
                 model_name, subfolder="unet")
             self.set_model(ModelType.Text2Video,
@@ -364,7 +387,7 @@ class Model:
             if use_cf_attn:
                 self.pipe.unet.set_attn_processor(
                     processor=self.text2video_attn_proc)
-            self.generator.manual_seed(seed)
         added_prompt = "high quality, HD, 8K, trending on artstation, high focus, dramatic lighting"
         negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic'
@@ -396,7 +419,7 @@ class Model:
                                 seed=seed,
                                 output_type='numpy',
                                 negative_prompt=negative_prompt,
-                                inject_noise_to_warp=inject_noise_to_warp,
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
                                 )

 from enum import Enum
 import gc
 import numpy as np
+import tomesd
 import torch
 from diffusers import StableDiffusionInstructPix2PixPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UNet2DConditionModel
         self.model_type = None
         self.states = {}
+        self.model_name = ""
     def set_model(self, model_type: ModelType, model_id: str, **kwargs):
         if self.pipe is not None:
         self.pipe = self.pipe_dict[model_type].from_pretrained(
             model_id, safety_checker=safety_checker, **kwargs).to(self.device).to(self.dtype)
         self.model_type = model_type
+        self.model_name = model_id
     def inference_chunk(self, frame_ids, **kwargs):
         if self.pipe is None:
     def inference(self, split_to_chunks=False, chunk_size=8, **kwargs):
         if self.pipe is None:
             return
+        tomesd.remove_patch(self.pipe)
+        if "merging_ratio" in kwargs:
+            merging_ratio = kwargs.pop("merging_ratio")
+            if merging_ratio > 0:
+                tomesd.apply_patch(self.pipe, ratio=merging_ratio)
         seed = kwargs.pop('seed', 0)
         if seed < 0:
             seed = self.generator.seed()
             result = np.concatenate(result)
             return result
         else:
+            self.generator.manual_seed(seed)
             return self.pipe(prompt=prompt, negative_prompt=negative_prompt, generator=self.generator, **kwargs).images
     def process_controlnet_canny(self,
                                  prompt,
                                  chunk_size=8,
                                  watermark='Picsart AI Research',
+                                 merging_ratio=0.0,
                                  num_inference_steps=20,
                                  controlnet_conditioning_scale=1.0,
                                  guidance_scale=9.0,
                                  resolution=512,
                                  use_cf_attn=True,
                                  save_path=None):
+        print("Processing Canny")
         video_path = gradio_utils.edge_path_to_video_path(video_path)
         if self.model_type != ModelType.ControlNetCanny:
             controlnet = ControlNetModel.from_pretrained(
                                 output_type='numpy',
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
+                                merging_ratio=merging_ratio,
                                 )
         return utils.create_video(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
                                 prompt,
                                 chunk_size=8,
                                 watermark='Picsart AI Research',
+                                merging_ratio=0.0,
                                 num_inference_steps=20,
                                 controlnet_conditioning_scale=1.0,
                                 guidance_scale=9.0,
                                 resolution=512,
                                 use_cf_attn=True,
                                 save_path=None):
+        print("Processing Pose")
         video_path = gradio_utils.motion_to_video_path(video_path)
         if self.model_type != ModelType.ControlNetPose:
             controlnet = ControlNetModel.from_pretrained(
                                 output_type='numpy',
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
+                                merging_ratio=merging_ratio,
                                 )
         return utils.create_gif(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
                                     prompt,
                                     chunk_size=8,
                                     watermark='Picsart AI Research',
+                                    merging_ratio=0.0,
                                     num_inference_steps=20,
                                     controlnet_conditioning_scale=1.0,
                                     guidance_scale=9.0,
                                     resolution=512,
                                     use_cf_attn=True,
                                     save_path=None):
+        print("Processing Canny_DB")
         db_path = gradio_utils.get_model_from_db_selection(db_path)
         video_path = gradio_utils.get_video_from_canny_selection(video_path)
         # Load db and controlnet weights
                                 output_type='numpy',
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
+                                merging_ratio=merging_ratio,
                                 )
         return utils.create_gif(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
                         out_fps=-1,
                         chunk_size=8,
                         watermark='Picsart AI Research',
+                        merging_ratio=0.0,
                         use_cf_attn=True,
                         save_path=None,):
+        print("Processing Pix2Pix")
         if self.model_type != ModelType.Pix2Pix_Video:
             self.set_model(ModelType.Pix2Pix_Video,
                            model_id="timbrooks/instruct-pix2pix")
                                 image_guidance_scale=image_guidance_scale,
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
+                                merging_ratio=merging_ratio
                                 )
         return utils.create_video(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
                            chunk_size=8,
                            video_length=8,
                            watermark='Picsart AI Research',
+                           merging_ratio=0.0,
+                           seed=0,
                            resolution=512,
                            fps=2,
                            use_cf_attn=True,
                            use_motion_field=True,
                            smooth_bg=False,
                            smooth_bg_strength=0.4,
                            path=None):
+        print("Processing Text2Video")
+        if self.model_type != ModelType.Text2Video or model_name != self.model_name:
+            print("Model update")
             unet = UNet2DConditionModel.from_pretrained(
                 model_name, subfolder="unet")
             self.set_model(ModelType.Text2Video,
             if use_cf_attn:
                 self.pipe.unet.set_attn_processor(
                     processor=self.text2video_attn_proc)
+        self.generator.manual_seed(seed)
         added_prompt = "high quality, HD, 8K, trending on artstation, high focus, dramatic lighting"
         negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic'
                                 seed=seed,
                                 output_type='numpy',
                                 negative_prompt=negative_prompt,
+                                merging_ratio=merging_ratio,
                                 split_to_chunks=True,
                                 chunk_size=chunk_size,
                                 )

requirements.txt CHANGED Viewed

@@ -34,3 +34,4 @@ yapf==0.32.0
 safetensors==0.2.7
 beautifulsoup4
 bs4

 safetensors==0.2.7
 beautifulsoup4
 bs4
+tomesd

text_to_video_pipeline.py CHANGED Viewed

@@ -53,8 +53,10 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         if x0 is None:
             return torch.randn(shape, generator=generator, device=rand_device, dtype=text_embeddings.dtype).to(device)
         else:
-            eps = torch.randn_like(x0, dtype=text_embeddings.dtype).to(device)
             alpha_vec = torch.prod(self.scheduler.alphas[t0:tMax])
             xt = torch.sqrt(alpha_vec) * x0 + \
                 torch.sqrt(1-alpha_vec) * eps
             return xt
@@ -89,7 +91,7 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         latents = latents * self.scheduler.init_noise_sigma
         return latents
-    def warp_latents_independently(self, latents, reference_flow, inject_noise=False):
         _, _, H, W = reference_flow.size()
         b, _, f, h, w = latents.size()
         assert b == 1
@@ -109,15 +111,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         warped = grid_sample(latents_0, coords_t0,
                              mode='nearest', padding_mode='reflection')
-        if inject_noise:
-            idx = torch.logical_or(coords_t0 >= 1, coords_t0 < -1)
-            reset_noise = torch.randn(idx.shape)
-            idx = torch.logical_or(idx[:, :, :, 0], idx[:, :, :, 1])
-            idx = repeat(idx, "f w h -> f c w h", c=warped.shape[1])
-            reset_noise = torch.randn(
-                size=warped.shape, dtype=warped.dtype, device=warped.device)
-            warped[idx] = reset_noise[idx]
         warped = rearrange(warped, '(b f) c h w -> b c f h w', f=f)
         return warped
@@ -212,20 +205,20 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         reference_flow = torch.zeros(
             (video_length-1, 2, 512, 512), device=latents.device, dtype=latents.dtype)
-        for fr_idx in range(video_length-1):
             reference_flow[fr_idx, 0, :,
-                           :] = motion_field_strength_x*(frame_ids[fr_idx]+1)
             reference_flow[fr_idx, 1, :,
-                           :] = motion_field_strength_y*(frame_ids[fr_idx]+1)
         return reference_flow
-    def create_motion_field_and_warp_latents(self, motion_field_strength_x, motion_field_strength_y, frame_ids, video_length, inject_noise_to_warp, latents):
         motion_field = self.create_motion_field(motion_field_strength_x=motion_field_strength_x,
                                                 motion_field_strength_y=motion_field_strength_y, latents=latents, video_length=video_length, frame_ids=frame_ids)
         for idx, latent in enumerate(latents):
             latents[idx] = self.warp_latents_independently(
-                latent[None], motion_field, inject_noise=inject_noise_to_warp)
         return motion_field, latents
     @torch.no_grad()
@@ -255,13 +248,12 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         use_motion_field: bool = True,
         smooth_bg: bool = False,
         smooth_bg_strength: float = 0.4,
-        inject_noise_to_warp: bool = False,
         t0: int = 44,
         t1: int = 47,
         **kwargs,
     ):
         frame_ids = kwargs.pop("frame_ids", list(range(video_length)))
         assert num_videos_per_prompt == 1
         assert isinstance(prompt, list) and len(prompt) > 0
         assert isinstance(negative_prompt, list) or negative_prompt is None
@@ -280,11 +272,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         prompt = prompt_types[0]
         negative_prompt = prompt_types[1]
-        print(
-            f" Motion field strength x = {motion_field_strength_x}, y = {motion_field_strength_y}")
-        print(f" Use: Motion field = {use_motion_field}")
-        print(f" Use: Background smoothing = {smooth_bg}")
-        print(f"Inject noise to warp =  {inject_noise_to_warp}")
         # Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -355,6 +342,7 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         print(f"t0 = {t0} t1 = {t1}")
         x_t1_1 = None
@@ -366,14 +354,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         shape = (batch_size, num_channels_latents, 1, height //
                  self.vae_scale_factor, width // self.vae_scale_factor)
-        if inject_noise_to_warp and use_motion_field:
-            # if we inject to noise to warp function, we do it for timesteps T = 1000
-            x_t0_k = xT[:, :, :1, :, :].repeat(1, 1, video_length-1, 1, 1)
-            # reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y,
-            #                                                                   frame_ids=frame_ids,video_length=video_length,inject_noise_to_warp=inject_noise_to_warp,latents = x_t0_k)
-            # xT =torch.cat([xT, x_t0_k], dim=2).clone().detach()
         ddim_res = self.DDIM_backward(num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=1000, t0=t0, t1=t1, do_classifier_free_guidance=do_classifier_free_guidance,
                                       null_embs=null_embs, text_embeddings=text_embeddings, latents_local=xT, latents_dtype=dtype, guidance_scale=guidance_scale, guidance_stop_step=guidance_stop_step,
@@ -387,37 +367,13 @@ class TextToVideoPipeline(StableDiffusionPipeline):
             x_t1_1 = ddim_res["x_t1_1"].detach()
         del ddim_res
         del xT
-        if inject_noise_to_warp and use_motion_field:
-            # DDPM forward to allow for more motion
-            if t1 > t0:
-                x_t1_k = self.DDPM_forward(
-                    x0=x_t0_1, t0=t0, tMax=t1, device=device, shape=shape, text_embeddings=text_embeddings, generator=generator)
-            else:
-                x_t1_k = x_t0_k
-            if x_t1_1 is None:
-                raise Exception
-            x_t1 = x_t1_k.clone().detach()
-            ddim_res = self.DDIM_backward(num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=t1, t0=-1, t1=-1, do_classifier_free_guidance=do_classifier_free_guidance,
-                                          null_embs=null_embs, text_embeddings=text_embeddings, latents_local=x_t1, latents_dtype=dtype, guidance_scale=guidance_scale, guidance_stop_step=guidance_stop_step,
-                                          callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, num_warmup_steps=num_warmup_steps)
-            x0 = ddim_res["x0"].detach()
-            del ddim_res
-            del x_t1
-            del x_t1_k
-        if use_motion_field and not inject_noise_to_warp:
             del x0
             x_t0_k = x_t0_1[:, :, :1, :, :].repeat(1, 1, video_length-1, 1, 1)
             reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(
-                motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, latents=x_t0_k, video_length=video_length,
-                inject_noise_to_warp=inject_noise_to_warp, frame_ids=frame_ids)
             # NOTE: when t0 = 1000 it is assumed that t1 = 1000 as well (t0 == t1), so the `t1 > t0` branch below is skipped
             if t1 > t0:
@@ -440,7 +396,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
             del x_t1
             del x_t1_1
             del x_t1_k
         else:
             x_t1 = x_t1_1.clone()
             x_t1_1 = x_t1_1[:, :, :1, :, :].clone()
@@ -481,7 +436,7 @@ class TextToVideoPipeline(StableDiffusionPipeline):
                 if use_motion_field:
                     x_t1_fg_masked_b = x_t1_fg_masked_b[None]
                     x_t1_fg_masked_b = self.warp_latents_independently(
-                        x_t1_fg_masked_b, reference_flow, inject_noise=False)
                 else:
                     x_t1_fg_masked_b = x_t1_fg_masked_b[None]
@@ -499,7 +454,7 @@ class TextToVideoPipeline(StableDiffusionPipeline):
                 m_fg_b = m_fg_1_b.repeat(1, 1, video_length-1, 1, 1)
                 if use_motion_field:
                     m_fg_b = self.warp_latents_independently(
-                        m_fg_b.clone(), reference_flow, inject_noise=False)
                 M_FG_warped.append(
                     torch.cat([m_fg_1_b[:1, 0], m_fg_b[:1, 0]], dim=1))

         if x0 is None:
             return torch.randn(shape, generator=generator, device=rand_device, dtype=text_embeddings.dtype).to(device)
         else:
+            eps = torch.randn(x0.shape, dtype=text_embeddings.dtype, generator=generator,
+                              device=rand_device)
             alpha_vec = torch.prod(self.scheduler.alphas[t0:tMax])
             xt = torch.sqrt(alpha_vec) * x0 + \
                 torch.sqrt(1-alpha_vec) * eps
             return xt
         latents = latents * self.scheduler.init_noise_sigma
         return latents
+    def warp_latents_independently(self, latents, reference_flow):
         _, _, H, W = reference_flow.size()
         b, _, f, h, w = latents.size()
         assert b == 1
         warped = grid_sample(latents_0, coords_t0,
                              mode='nearest', padding_mode='reflection')
         warped = rearrange(warped, '(b f) c h w -> b c f h w', f=f)
         return warped
         reference_flow = torch.zeros(
             (video_length-1, 2, 512, 512), device=latents.device, dtype=latents.dtype)
+        for fr_idx, frame_id in enumerate(frame_ids):
             reference_flow[fr_idx, 0, :,
+                           :] = motion_field_strength_x*(frame_id)
             reference_flow[fr_idx, 1, :,
+                           :] = motion_field_strength_y*(frame_id)
         return reference_flow
+    def create_motion_field_and_warp_latents(self, motion_field_strength_x, motion_field_strength_y, frame_ids, video_length, latents):
         motion_field = self.create_motion_field(motion_field_strength_x=motion_field_strength_x,
                                                 motion_field_strength_y=motion_field_strength_y, latents=latents, video_length=video_length, frame_ids=frame_ids)
         for idx, latent in enumerate(latents):
             latents[idx] = self.warp_latents_independently(
+                latent[None], motion_field)
         return motion_field, latents
     @torch.no_grad()
         use_motion_field: bool = True,
         smooth_bg: bool = False,
         smooth_bg_strength: float = 0.4,
         t0: int = 44,
         t1: int = 47,
         **kwargs,
     ):
         frame_ids = kwargs.pop("frame_ids", list(range(video_length)))
+        assert t0 < t1
         assert num_videos_per_prompt == 1
         assert isinstance(prompt, list) and len(prompt) > 0
         assert isinstance(negative_prompt, list) or negative_prompt is None
         prompt = prompt_types[0]
         negative_prompt = prompt_types[1]
         # Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         print(f"t0 = {t0} t1 = {t1}")
         x_t1_1 = None
         shape = (batch_size, num_channels_latents, 1, height //
                  self.vae_scale_factor, width // self.vae_scale_factor)
         ddim_res = self.DDIM_backward(num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=1000, t0=t0, t1=t1, do_classifier_free_guidance=do_classifier_free_guidance,
                                       null_embs=null_embs, text_embeddings=text_embeddings, latents_local=xT, latents_dtype=dtype, guidance_scale=guidance_scale, guidance_stop_step=guidance_stop_step,
             x_t1_1 = ddim_res["x_t1_1"].detach()
         del ddim_res
         del xT
+        if use_motion_field:
             del x0
             x_t0_k = x_t0_1[:, :, :1, :, :].repeat(1, 1, video_length-1, 1, 1)
             reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(
+                motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, latents=x_t0_k, video_length=video_length, frame_ids=frame_ids[1:])
             # NOTE: when t0 = 1000 it is assumed that t1 = 1000 as well (t0 == t1), so the `t1 > t0` branch below is skipped
             if t1 > t0:
             del x_t1
             del x_t1_1
             del x_t1_k
         else:
             x_t1 = x_t1_1.clone()
             x_t1_1 = x_t1_1[:, :, :1, :, :].clone()
                 if use_motion_field:
                     x_t1_fg_masked_b = x_t1_fg_masked_b[None]
                     x_t1_fg_masked_b = self.warp_latents_independently(
+                        x_t1_fg_masked_b, reference_flow)
                 else:
                     x_t1_fg_masked_b = x_t1_fg_masked_b[None]
                 m_fg_b = m_fg_1_b.repeat(1, 1, video_length-1, 1, 1)
                 if use_motion_field:
                     m_fg_b = self.warp_latents_independently(
+                        m_fg_b.clone(), reference_flow)
                 M_FG_warped.append(
                     torch.cat([m_fg_1_b[:1, 0], m_fg_b[:1, 0]], dim=1))