sayakpaul (HF Staff) committed
Commit 10bbb52 · 1 Parent(s): 5460f04
Files changed (5):
  1. check.py +82 -0
  2. optimization.py +21 -14
  3. optimization_utils.py +43 -4
  4. reproduce.py +117 -0
  5. requirements.txt +0 -11
check.py ADDED
@@ -0,0 +1,82 @@
+from diffusers import LTXConditionPipeline
+from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
+import numpy as np
+from PIL import Image
+import torch
+from diffusers.utils import load_image, load_video, export_to_video
+from optimization import optimize_pipeline_
+
+MODEL_ID = "Lightricks/LTX-Video-0.9.8-13B-distilled"
+
+LANDSCAPE_WIDTH = 480
+LANDSCAPE_HEIGHT = 832
+MAX_SEED = np.iinfo(np.int32).max
+
+FIXED_FPS = 24
+MIN_FRAMES_MODEL = 8
+MAX_FRAMES_MODEL = 96
+
+MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
+MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)
+
+def resize_image(image: Image.Image) -> Image.Image:
+    if image.height > image.width:
+        transposed = image.transpose(Image.Transpose.ROTATE_90)
+        resized = resize_image_landscape(transposed)
+        return resized.transpose(Image.Transpose.ROTATE_270)
+    return resize_image_landscape(image)
+
+
+def resize_image_landscape(image: Image.Image) -> Image.Image:
+    target_aspect = LANDSCAPE_WIDTH / LANDSCAPE_HEIGHT
+    width, height = image.size
+    in_aspect = width / height
+    if in_aspect > target_aspect:
+        new_width = round(height * target_aspect)
+        left = (width - new_width) // 2
+        image = image.crop((left, 0, left + new_width, height))
+    else:
+        new_height = round(width / target_aspect)
+        top = (height - new_height) // 2
+        image = image.crop((0, top, width, top + new_height))
+    return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)
+
+
+pipe = LTXConditionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
+dummy_image = Image.new("RGB", (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT))
+video = load_video(export_to_video([dummy_image]))
+condition1 = LTXVideoCondition(video=video, frame_index=0)
+optimize_pipeline_(
+    pipe,
+    conditions=[condition1],
+    prompt="prompt",
+    height=LANDSCAPE_HEIGHT,
+    width=LANDSCAPE_WIDTH,
+    num_frames=MAX_FRAMES_MODEL,
+    num_inference_steps=2
+)
+
+default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
+default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
+input_image = load_image("peng.png")
+duration_seconds = MAX_DURATION
+guidance_scale = 1.0
+num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
+current_seed = 42
+resized_image = resize_image(input_image)
+steps = 8
+
+video = load_video(export_to_video([resized_image]))
+condition1 = LTXVideoCondition(video=video, frame_index=0)
+
+output_frames_list = pipe(
+    conditions=[condition1],
+    prompt=default_prompt_i2v,
+    negative_prompt=default_negative_prompt,
+    height=resized_image.height,
+    width=resized_image.width,
+    num_frames=num_frames,
+    guidance_scale=float(guidance_scale),
+    num_inference_steps=int(steps),
+    generator=torch.Generator(device="cuda").manual_seed(current_seed),
+).frames[0]
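
Note: a quick check of the crop math in resize_image_landscape, using an assumed 1280x720 input (illustrative values only; the script itself loads "peng.png"):

    # hypothetical input size for illustration
    width, height = 1280, 720
    target_aspect = 480 / 832                    # ~0.577
    in_aspect = width / height                   # ~1.778 > target_aspect
    new_width = round(height * target_aspect)    # 415
    left = (width - new_width) // 2              # 432 -> crop box (432, 0, 847, 720)
    # the centered crop is then resized to (480, 832) with Image.LANCZOS
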
optimization.py CHANGED
@@ -9,11 +9,9 @@ from typing import ParamSpec
 import spaces
 import torch
 from torch.utils._pytree import tree_map_only
-from torchao.quantization import quantize_
-from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
+from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
 from diffusers import LTXConditionPipeline
-from optimization_utils import capture_component_call
-from optimization_utils import aoti_compile
+from optimization_utils import capture_component_call, aoti_compile, cudagraph
 
 
 P = ParamSpec("P")
@@ -33,7 +31,8 @@ INDUCTOR_CONFIGS = {
     "epilogue_fusion": False,
     "coordinate_descent_tuning": True,
     "coordinate_descent_check_all_directions": True,
-    "max_autotune": True,
+    # "max_autotune": True,
+    "max_autotune": False,
     "triton.cudagraphs": True,
 }
 TRANSFORMER_SPATIAL_PATCH_SIZE = 1
@@ -50,15 +49,15 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
     latent_height = height // VAE_SPATIAL_COMPRESSION_RATIO
     latent_width = width // VAE_SPATIAL_COMPRESSION_RATIO
 
+    with capture_component_call(pipeline, "transformer") as call:
+        pipeline(*args, **kwargs)
+
     @spaces.GPU(duration=1500)
     def compile_transformer():
-        with capture_component_call(pipeline, "transformer") as call:
-            pipeline(*args, **kwargs)
-
-        dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
+        # dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
         # dynamic_shapes |= TRANSFORMER_DYNAMIC_SHAPES
 
-        quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
+        quantize_(pipeline.transformer, float8_dynamic_activation_float8_weight())
 
         hidden_states: torch.Tensor = call.kwargs["hidden_states"]
         unpacked_hidden_states = LTXConditionPipeline._unpack_latents(
@@ -88,14 +87,13 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
             mod=pipeline.transformer,
             args=call.args,
             kwargs=call.kwargs | {"hidden_states": hidden_states_landscape},
-            dynamic_shapes=dynamic_shapes,
+            # dynamic_shapes=dynamic_shapes,
         )
-
         exported_portrait = torch.export.export(
             mod=pipeline.transformer,
             args=call.args,
             kwargs=call.kwargs | {"hidden_states": hidden_states_portrait},
-            dynamic_shapes=dynamic_shapes,
+            # dynamic_shapes=dynamic_shapes,
         )
 
         compiled_landscape = aoti_compile(exported_landscape, INDUCTOR_CONFIGS)
@@ -108,6 +106,7 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
 
     compiled_landscape, compiled_portrait = compile_transformer()
 
+    @torch.no_grad()
     def combined_transformer(*args, **kwargs):
         hidden_states: torch.Tensor = kwargs["hidden_states"]
         unpacked_hidden_states = LTXConditionPipeline._unpack_latents(
@@ -126,7 +125,15 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
     transformer_config = pipeline.transformer.config
     transformer_dtype = pipeline.transformer.dtype
     cache_context = pipeline.transformer.cache_context
-    pipeline.transformer = combined_transformer
+
+    with torch.no_grad():
+        combined_transformer(**call.kwargs)
+
+    pipeline.transformer = cudagraph(combined_transformer)
+
+    with torch.no_grad():
+        pipeline.transformer(**call.kwargs)
+
     pipeline.transformer.config = transformer_config  # pyright: ignore[reportAttributeAccessIssue]
     pipeline.transformer.dtype = transformer_dtype  # pyright: ignore[reportAttributeAccessIssue]
     pipeline.transformer.cache_context = cache_context
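
Note: taken together, these hunks move the capture of a real transformer call out of compile_transformer, switch to the float8_dynamic_activation_float8_weight quantization API, drop the dynamic_shapes arguments from both torch.export.export calls, and replace the direct assignment of combined_transformer with a warmup plus CUDA-graph wrap. A condensed sketch of the new flow (context lines assumed from the surrounding optimize_pipeline_ body, not shown in this diff):

    with capture_component_call(pipeline, "transformer") as call:
        pipeline(*args, **kwargs)                  # record one real transformer call

    compiled_landscape, compiled_portrait = compile_transformer()
    # inside compile_transformer: quantize_(pipeline.transformer, float8_dynamic_activation_float8_weight()),
    # then torch.export.export + aoti_compile for the landscape and portrait shapes (no dynamic_shapes)

    with torch.no_grad():
        combined_transformer(**call.kwargs)        # eager warmup of the shape-dispatching wrapper
    pipeline.transformer = cudagraph(combined_transformer)
    with torch.no_grad():
        pipeline.transformer(**call.kwargs)        # records the CUDA graph for this shape
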
optimization_utils.py CHANGED
@@ -5,12 +5,13 @@ Taken from https://huggingface.co/spaces/cbensimon/wan2-1-fast/
 import contextlib
 from contextvars import ContextVar
 from io import BytesIO
-from typing import Any
-from typing import cast
+from typing import Any, cast
 from unittest.mock import patch
 
 import torch
+from torch.utils._pytree import tree_map_only
 from torch._inductor.package.package import package_aoti
+from torch._inductor.package import load_package
 from torch.export.pt2_archive._package import AOTICompiledModel
 from torch.export.pt2_archive._package_weights import Weights
 
@@ -45,7 +46,9 @@ class ZeroGPUCompiledModel:
 
     def __call__(self, *args, **kwargs):
         if (compiled_model := self.compiled_model.get()) is None:
-            compiled_model = cast(AOTICompiledModel, torch._inductor.aoti_load_package(self.archive_file))
+            # compiled_model = cast(AOTICompiledModel, torch._inductor.aoti_load_package(self.archive_file))
+            # compiled_model = torch._inductor.aoti_load_package(self.archive_file, run_single_threaded=True)
+            compiled_model = load_package(self.archive_file, run_single_threaded=True)
             compiled_model.load_constants(self.weights.constants_map, check_full_update=True, user_managed=True)
             self.compiled_model.set(compiled_model)
         return compiled_model(*args, **kwargs)
@@ -67,7 +70,7 @@ def aoti_compile(
     files: list[str | Weights] = [file for file in artifacts if isinstance(file, str)]
     package_aoti(archive_file, files)
     (weights,) = (artifact for artifact in artifacts if isinstance(artifact, Weights))
-    zerogpu_weights = ZeroGPUWeights({name: weights.get_weight(name)[0] for name in weights})
+    zerogpu_weights = ZeroGPUWeights({name: weights.get_weight(name)[0] for name in weights}, to_cuda=True)
     return ZeroGPUCompiledModel(archive_file, zerogpu_weights)
 
 
@@ -100,3 +103,39 @@ def capture_component_call(
         except CapturedCallException as e:
             captured_call.args = e.args
             captured_call.kwargs = e.kwargs
+
+
+# Taken from
+# https://github.com/huggingface/flux-fast/blob/5027798d7f69a8e0e478df92f48663c40727f8ea/utils/pipeline_utils.py#L198C1-L231C14
+def cudagraph(f):
+    _graphs = {}
+    def f_(*args, **kwargs):
+        key = hash(tuple(tuple(kwargs[a].shape) for a in sorted(kwargs.keys())
+                         if isinstance(kwargs[a], torch.Tensor)))
+        if key in _graphs:
+            # use the cached wrapper if one exists. this will perform CUDAGraph replay
+            wrapped, *_ = _graphs[key]
+            return wrapped(*args, **kwargs)
+
+        # record a new CUDAGraph and cache it for future use
+        g = torch.cuda.CUDAGraph()
+        in_args, in_kwargs = tree_map_only(torch.Tensor, lambda t: t.clone(), (args, kwargs))
+        f(*in_args, **in_kwargs)  # stream warmup
+        with torch.cuda.graph(g):
+            out_tensors = f(*in_args, **in_kwargs)
+        def wrapped(*args, **kwargs):
+            # note that CUDAGraphs require inputs / outputs to be in fixed memory locations.
+            # inputs must be copied into the fixed input memory locations.
+            [a.copy_(b) for a, b in zip(in_args, args) if isinstance(a, torch.Tensor)]
+            for key in kwargs:
+                if isinstance(kwargs[key], torch.Tensor):
+                    in_kwargs[key].copy_(kwargs[key])
+            g.replay()
+            # clone() outputs on the way out to disconnect them from the fixed output memory
+            # locations. this allows for CUDAGraph reuse without accidentally overwriting memory
+            return [o.clone() for o in out_tensors]
+
+        # cache function that does CUDAGraph replay
+        _graphs[key] = (wrapped, g, in_args, in_kwargs, out_tensors)
+        return wrapped(*args, **kwargs)
+    return f_
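
Note: the cudagraph helper records one torch.cuda.CUDAGraph per distinct set of tensor-kwarg shapes and replays it on later calls, copying fresh inputs into the captured buffers and cloning the outputs. A minimal standalone sketch (hypothetical toy module and names; CUDA required, tensor inputs passed as keywords, outputs returned as a list):

    import torch
    from optimization_utils import cudagraph

    lin = torch.nn.Linear(64, 64, device="cuda", dtype=torch.float16)

    @cudagraph
    def step(*, x: torch.Tensor):
        return [lin(x)]               # the wrapper clones each output tensor on replay

    with torch.no_grad():
        x = torch.randn(8, 64, device="cuda", dtype=torch.float16)
        out1 = step(x=x)              # first call with this shape: warmup, then graph capture
        out2 = step(x=x * 2)          # same shape: CUDA graph replay with copied-in inputs
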
reproduce.py ADDED
@@ -0,0 +1,117 @@
+from diffusers import LTXConditionPipeline
+from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
+from diffusers.utils import load_video, export_to_video
+from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
+from io import BytesIO
+import contextlib
+from typing import Any, cast
+from unittest.mock import patch
+import torch
+from torch._inductor.package.package import package_aoti
+from torch._inductor.package import load_package
+from PIL import Image
+
+MODEL_ID = "Lightricks/LTX-Video-0.9.8-13B-distilled"
+LANDSCAPE_WIDTH = 480
+LANDSCAPE_HEIGHT = 832
+MAX_FRAMES_MODEL = 96
+INDUCTOR_CONFIGS = {
+    "conv_1x1_as_mm": True,
+    "epilogue_fusion": False,
+    "coordinate_descent_tuning": True,
+    "coordinate_descent_check_all_directions": True,
+    "max_autotune": False,
+    "triton.cudagraphs": True,
+}
+INDUCTOR_CONFIGS_OVERRIDES = {
+    "aot_inductor.package_constants_in_so": False,
+    "aot_inductor.package_constants_on_disk": True,
+    "aot_inductor.package": True,
+}
+
+
+@contextlib.contextmanager
+def capture_component_call(
+    pipeline: LTXConditionPipeline,
+    component_name: str,
+    component_method="forward",
+):
+    class CapturedCallException(Exception):
+        def __init__(self, *args, **kwargs):
+            super().__init__()
+            self.args = args
+            self.kwargs = kwargs
+
+    class CapturedCall:
+        def __init__(self):
+            self.args: tuple[Any, ...] = ()
+            self.kwargs: dict[str, Any] = {}
+
+    component = getattr(pipeline, component_name)
+    captured_call = CapturedCall()
+
+    def capture_call(*args, **kwargs):
+        raise CapturedCallException(*args, **kwargs)
+
+    with patch.object(component, component_method, new=capture_call):
+        try:
+            yield captured_call
+        except CapturedCallException as e:
+            captured_call.args = e.args
+            captured_call.kwargs = e.kwargs
+
+
+pipe = LTXConditionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
+quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
+
+resized_image = Image.new("RGB", (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT))
+video = load_video(export_to_video([resized_image]))
+condition1 = LTXVideoCondition(video=video, frame_index=0)
+
+with capture_component_call(pipe, "transformer") as call:
+    pipe(
+        conditions=[condition1],
+        prompt="prompt",
+        height=LANDSCAPE_HEIGHT,
+        width=LANDSCAPE_WIDTH,
+        num_frames=MAX_FRAMES_MODEL,
+        num_inference_steps=2
+    )
+
+hidden_states: torch.Tensor = call.kwargs["hidden_states"]
+exported = torch.export.export(
+    mod=pipe.transformer,
+    args=call.args,
+    kwargs=call.kwargs | {"hidden_states": hidden_states},
+)
+
+assert exported.example_inputs is not None
+args, kwargs = exported.example_inputs
+gm = cast(torch.fx.GraphModule, exported.module())
+artifacts = torch._inductor.aot_compile(
+    gm, args, kwargs, options=INDUCTOR_CONFIGS | INDUCTOR_CONFIGS_OVERRIDES
+)
+archive_file = BytesIO()
+files = [file for file in artifacts if isinstance(file, str)]
+package_aoti(archive_file, files)
+compiled_model = load_package(archive_file, run_single_threaded=True)
+print("Package loaded.")
+
+transformer_config = pipe.transformer.config
+transformer_dtype = pipe.transformer.dtype
+cache_context = pipe.transformer.cache_context
+pipe.transformer = compiled_model
+pipe.transformer.config = transformer_config
+pipe.transformer.dtype = transformer_dtype
+pipe.transformer.cache_context = cache_context
+print("Configs done.")
+
+pipe(
+    conditions=[condition1],
+    prompt="prompt",
+    height=LANDSCAPE_HEIGHT,
+    width=LANDSCAPE_WIDTH,
+    num_frames=MAX_FRAMES_MODEL,
+    num_inference_steps=2
+)
+print("Okay")
requirements.txt DELETED
@@ -1,11 +0,0 @@
-git+https://github.com/huggingface/diffusers.git
-transformers
-accelerate
-safetensors
-sentencepiece
-peft
-ftfy
-imageio
-imageio-ffmpeg
-opencv-python
-torchao==0.11.0