Commit · 74308ee
1 Parent(s): 4953ce6

make chunking size as a function argument & add a slider to control it
app.py  CHANGED

@@ -296,7 +296,7 @@ def generate_video(
     negative_prompt="The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.", # noqa: E501
     seed=42,
     randomize_seed=False,
-    chunking=
+    chunking=None,
     progress=gr.Progress(track_tqdm=True),
 ):
     _dt = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=8))).strftime("%Y-%m-%d_%H.%M.%S")
@@ -338,6 +338,8 @@ def generate_video(
     watcher = watch_gpu_memory(10, lambda x: log.debug(f"GPU memory usage: {x} (MiB)"))

     # start inference
+    if chunking <= 0:
+        chunking = None
     videos, prompts = inference(args, control_inputs, chunking)

     # print the generation time
@@ -386,7 +388,7 @@ with gr.Blocks() as demo:
             randomize_seed_checkbox = gr.Checkbox(label="Randomize Seed", value=False)
             seed_input = gr.Slider(minimum=0, maximum=1000000, value=1, step=1, label="Seed")

-
+            chunking_input = gr.Slider(minimum=0, maximum=121, value=4, step=1, label="Chunking size")
             generate_button = gr.Button("Generate Image")

         with gr.Column():
@@ -403,7 +405,7 @@ with gr.Blocks() as demo:
             negative_prompt_input,
             seed_input,
             randomize_seed_checkbox,
-
+            chunking_input,
         ],
         outputs=[output_video, output_file, seed_input],
     )
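The new "Chunking size" slider passes its integer value straight through to generate_video, and the added guard maps any non-positive value to None, so a slider setting of 0 disables chunking downstream. A minimal sketch of that mapping (the helper name is invented here for illustration; the commit itself inlines the check inside generate_video):

from typing import Optional

def normalize_chunking(slider_value: int) -> Optional[int]:
    # Mirror the guard added in app.py: 0 (the slider minimum) means
    # "chunking disabled"; any positive value is used as the chunk size.
    return None if slider_value <= 0 else int(slider_value)

assert normalize_chunking(0) is None   # slider at 0 -> chunking disabled
assert normalize_chunking(4) == 4      # slider default -> chunks of 4 frames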
cosmos_transfer1/diffusion/inference/inference_utils.py  CHANGED

@@ -710,7 +710,7 @@ def generate_world_from_control(
     x_sigma_max=None,
     augment_sigma=None,
     use_batch_processing: bool = True,
-    chunking:
+    chunking: Optional[int] = None,
 ) -> Tuple[np.array, list, list]:
     """Generate video using a conditioning video/image input.

@@ -724,7 +724,7 @@ def generate_world_from_control(
         seed (int): Random seed for generation
         condition_latent (torch.Tensor): Latent tensor from conditioning video/image file
         num_input_frames (int): Number of input frames
-        chunking:
+        chunking: Chunking size, if None, chunking is disabled

     Returns:
         np.array: Generated video frames in shape [T,H,W,C], range [0,255]
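The only change to generate_world_from_control is the new chunking keyword, an Optional[int] where None keeps the previous single-pass behaviour. A toy illustration of that convention (this function is not part of the repository; it only demonstrates the None-disables-chunking semantics the docstring describes):

from typing import List, Optional

def split_frame_indices(total_frames: int, chunking: Optional[int] = None) -> List[List[int]]:
    # chunking=None: treat the whole clip as one block.
    if chunking is None:
        return [list(range(total_frames))]
    # Otherwise split the frame indices into blocks of at most `chunking` frames.
    return [list(range(s, min(s + chunking, total_frames))) for s in range(0, total_frames, chunking)]

print(split_frame_indices(10))     # [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
print(split_frame_indices(10, 4))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]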
cosmos_transfer1/diffusion/inference/world_generation_pipeline.py  CHANGED

@@ -151,7 +151,7 @@ class DiffusionControl2WorldGenerationPipeline(BaseWorldGenerationPipeline):
         regional_prompts: List[str] = None,
         region_definitions: Union[List[List[float]], torch.Tensor] = None,
         waymo_example: bool = False,
-        chunking:
+        chunking: Optional[int] = None,
     ):
         """Initialize diffusion world generation pipeline.

@@ -179,7 +179,7 @@ class DiffusionControl2WorldGenerationPipeline(BaseWorldGenerationPipeline):
             offload_prompt_upsampler: Whether to offload prompt upsampler after use
             process_group: Process group for distributed training
             waymo_example: Whether to use the waymo example post-training checkpoint
-            chunking:
+            chunking: Chunking size, if None, chunking is disabled
         """
         self.num_input_frames = num_input_frames
         self.control_inputs = control_inputs
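The pipeline constructor gains the same Optional[int] chunking option. How the pipeline stores and forwards it is not shown in this diff; the sketch below is a hypothetical illustration (class and attribute names invented) of the usual pattern of keeping the option on the instance and passing it on to the model-level sampler:

from typing import Callable, Optional

class ToyControl2WorldPipeline:
    def __init__(self, num_input_frames: int, chunking: Optional[int] = None):
        self.num_input_frames = num_input_frames
        # None disables chunking, per the docstring added in this commit.
        self.chunking = chunking

    def generate(self, sampler: Callable, latent):
        # Forward the stored option to the sampling call.
        return sampler(latent, chunking=self.chunking)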
cosmos_transfer1/diffusion/model/model_v2w.py  CHANGED

@@ -168,19 +168,18 @@ class DiffusionV2WModel(DiffusionT2WModel):
             x0_pred_replaced=x0_pred_replaced,
         )

-    CHUNKING_SIZE = 4
     CHUNKING_MODE = "rand_order"  # ["shuffle", "in_order", "rand_order"]
     IS_STAGGERED = True

-    def get_chunks_indices(self, total_flen) -> List[torch.Tensor]:
+    def get_chunks_indices(self, total_flen, chunking_size) -> List[torch.Tensor]:
         chunks_indices = []
         if self.CHUNKING_MODE == "shuffle":
-            for index in torch.arange(0, total_flen, 1).split(
+            for index in torch.arange(0, total_flen, 1).split(chunking_size):
                 chunks_indices.append(index)
             np.random.shuffle(chunks_indices)
         else:
             first_chunk_end = (
-                int(torch.randint(low=0, high=
+                int(torch.randint(low=0, high=chunking_size, size=(1,)) + 1) if self.IS_STAGGERED else chunking_size
             )

             if first_chunk_end >= total_flen:
@@ -188,7 +187,7 @@ class DiffusionV2WModel(DiffusionT2WModel):
             else:
                 chunks_indices.append(torch.arange(first_chunk_end))

-            for index in torch.arange(first_chunk_end, total_flen, 1).split(
+            for index in torch.arange(first_chunk_end, total_flen, 1).split(chunking_size):
                 chunks_indices.append(index)

         if self.CHUNKING_MODE == "in_order":
@@ -216,7 +215,7 @@ class DiffusionV2WModel(DiffusionT2WModel):
         add_input_frames_guidance: bool = False,
         x_sigma_max: Optional[torch.Tensor] = None,
         sigma_max: Optional[float] = None,
-        chunking:
+        chunking: Optional[int] = None,
         **kwargs,
     ) -> Tensor:
         """Generates video samples conditioned on input frames.
@@ -234,7 +233,7 @@ class DiffusionV2WModel(DiffusionT2WModel):
             condition_video_augment_sigma_in_inference: Noise level for condition augmentation
             add_input_frames_guidance: Whether to apply guidance to input frames
             x_sigma_max: Maximum noise level tensor
-            chunking:
+            chunking: Chunking size, if None, chunking is disabled

         Returns:
             Generated video samples tensor
@@ -294,7 +293,7 @@ class DiffusionV2WModel(DiffusionT2WModel):
         condition_video_augment_sigma_in_inference: float = None,
         add_input_frames_guidance: bool = False,
         seed: int = 1,
-        chunking:
+        chunking: Optional[int] = None,
     ) -> Callable:
         """Creates denoising function for conditional video generation.

@@ -307,12 +306,12 @@ class DiffusionV2WModel(DiffusionT2WModel):
             condition_video_augment_sigma_in_inference: Noise level for condition augmentation
             add_input_frames_guidance: Whether to apply guidance to input frames
             seed: Random seed for reproducibility
-            chunking:
+            chunking: Chunking size, if None, chunking is disabled

         Returns:
             Function that takes noisy input and noise level and returns denoised prediction
         """
-        if
+        if chunking is None:
             if is_negative_prompt:
                 condition, uncondition = self.conditioner.get_condition_with_negative_prompt(data_batch)
             else:
@@ -348,8 +347,6 @@ class DiffusionV2WModel(DiffusionT2WModel):

             return x0_fn
         else:
-            log.critical("GO CHUNKING !!!")
-
             def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
                 if is_negative_prompt:
                     condition, uncondition = self.conditioner.get_condition_with_negative_prompt(data_batch)
@@ -358,7 +355,7 @@ class DiffusionV2WModel(DiffusionT2WModel):

                 noises = torch.zeros_like(condition_latent)
                 T = condition_latent.shape[2]
-                for chunk_idx in self.get_chunks_indices(T):
+                for chunk_idx in self.get_chunks_indices(T, chunking):
                     latents_ = condition_latent[:, :, chunk_idx, :, :]
                     log.info(f"chunk_idx: {chunk_idx}, chunk shape: {latents_.shape}")
                     # controlnet_cond_ = self.controlnet_data[:, chunk_idx]