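# Image-to-video generation with LTX-Video (13B distilled) via diffusers'
# LTXConditionPipeline: a single input image is wrapped as a one-frame
# conditioning video and animated according to a text prompt.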
from diffusers import LTXConditionPipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
import numpy as np
from PIL import Image
import torch
from diffusers.utils import load_image, load_video, export_to_video
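# optimize_pipeline_ is assumed to be a project-local helper (optimization.py),
# not part of diffusers; it warms up / compiles the pipeline with sample inputs.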
from optimization import optimize_pipeline_

MODEL_ID = "Lightricks/LTX-Video-0.9.8-13B-distilled"

# Target resolution, landscape orientation (width > height).
LANDSCAPE_WIDTH = 832
LANDSCAPE_HEIGHT = 480
MAX_SEED = np.iinfo(np.int32).max

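# Frame-count limits supported by the model; the duration bounds (in seconds)
# are derived from them at the fixed output frame rate.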
FIXED_FPS = 24
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 96

MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)

def resize_image(image: Image.Image) -> Image.Image:
    if image.height > image.width:
        transposed = image.transpose(Image.Transpose.ROTATE_90)
        resized = resize_image_landscape(transposed)
        return resized.transpose(Image.Transpose.ROTATE_270)
    return resize_image_landscape(image)


def resize_image_landscape(image: Image.Image) -> Image.Image:
    target_aspect = LANDSCAPE_WIDTH / LANDSCAPE_HEIGHT
    width, height = image.size
    in_aspect = width / height
    if in_aspect > target_aspect:
        new_width = round(height * target_aspect)
        left = (width - new_width) // 2
        image = image.crop((left, 0, left + new_width, height))
    else:
        new_height = round(width / target_aspect)
        top = (height - new_height) // 2
        image = image.crop((0, top, width, top + new_height))
    return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)


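# Load the distilled pipeline in bfloat16 and run a minimal warm-up pass
# (one-frame dummy condition, 2 steps) so optimize_pipeline_ can trace it;
# the prompt strings here are placeholders and do not affect the real run.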
pipe = LTXConditionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
dummy_image = Image.new("RGB", (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT))
video = load_video(export_to_video([dummy_image]))
condition1 = LTXVideoCondition(video=video, frame_index=0)
optimize_pipeline_(
    pipe,
    conditions=[condition1],
    prompt="prompt",
    negative_prompt="prompt",
    height=LANDSCAPE_HEIGHT,
    width=LANDSCAPE_WIDTH,
    num_frames=MAX_FRAMES_MODEL,
    num_inference_steps=2,
    guidance_scale=1.0,
)

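# Inference settings. The distilled checkpoint targets few-step sampling with
# guidance effectively disabled, hence guidance_scale=1.0 and only 8 steps.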
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
input_image = load_image("peng.png")
duration_seconds = MAX_DURATION
guidance_scale = 1.0
num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
current_seed = 42
resized_image = resize_image(input_image)
steps = 8

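# LTXConditionPipeline conditions on video, so the input image is exported to
# a one-frame video file and loaded back as the conditioning clip.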
video = load_video(export_to_video([resized_image]))
condition1 = LTXVideoCondition(video=video, frame_index=0)

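# Generate; .frames[0] is the list of PIL frames for the first (only) output video.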
output_frames_list = pipe(
    conditions=[condition1],
    prompt=default_prompt_i2v,
    negative_prompt=default_negative_prompt,
    height=resized_image.height,
    width=resized_image.width,
    num_frames=num_frames,
    guidance_scale=float(guidance_scale),
    num_inference_steps=int(steps),
    generator=torch.Generator(device="cuda").manual_seed(current_seed),
).frames[0]
export_to_video(output_frames_list, "output_original.mp4", fps=FIXED_FPS)