import numpy as np
import torch
from PIL import Image

from diffusers import LTXConditionPipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_image, load_video

from optimization import optimize_pipeline_
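
# Note: `optimization` is a local module bundled with the Space, not part of
# diffusers; optimize_pipeline_ is assumed to compile/warm up the pipeline for
# the argument shapes passed to it.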

MODEL_ID = "Lightricks/LTX-Video-0.9.8-13B-distilled"

# Target landscape resolution. The original values (480x832) were swapped
# relative to the names (a landscape frame is wider than it is tall), so they
# are corrected here; both dimensions remain multiples of 32 as LTX expects.
LANDSCAPE_WIDTH = 832
LANDSCAPE_HEIGHT = 480

MAX_SEED = np.iinfo(np.int32).max

FIXED_FPS = 24
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 96
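
# Duration bounds in seconds, derived from the frame limits at the fixed FPS:
# round(8 / 24, 1) = 0.3 s minimum, round(96 / 24, 1) = 4.0 s maximum.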
MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)
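

# Crop-and-resize helpers: portrait inputs are rotated to landscape,
# center-cropped to the target aspect ratio, resized, and rotated back.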
def resize_image(image: Image.Image) -> Image.Image:
    if image.height > image.width:
        transposed = image.transpose(Image.Transpose.ROTATE_90)
        resized = resize_image_landscape(transposed)
        return resized.transpose(Image.Transpose.ROTATE_270)
    return resize_image_landscape(image)


def resize_image_landscape(image: Image.Image) -> Image.Image:
    target_aspect = LANDSCAPE_WIDTH / LANDSCAPE_HEIGHT
    width, height = image.size
    in_aspect = width / height
    if in_aspect > target_aspect:
        # Input is too wide: crop the left and right edges.
        new_width = round(height * target_aspect)
        left = (width - new_width) // 2
        image = image.crop((left, 0, left + new_width, height))
    else:
        # Input is too tall: crop the top and bottom edges.
        new_height = round(width / target_aspect)
        top = (height - new_height) // 2
        image = image.crop((0, top, width, top + new_height))
    return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)
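

# Load the distilled LTX-Video pipeline in bfloat16 on the GPU.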
pipe = LTXConditionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
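
# Warm-up pass: a dummy single-frame condition at the maximum frame count lets
# optimize_pipeline_ do its compilation once up front, presumably so the real
# generation call below reuses the already-optimized shapes.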
dummy_image = Image.new("RGB", (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT))
# Round-trip through a temporary video file to get the list-of-frames format
# that LTXVideoCondition expects.
video = load_video(export_to_video([dummy_image]))
condition1 = LTXVideoCondition(video=video, frame_index=0)

optimize_pipeline_(
    pipe,
    conditions=[condition1],
    prompt="prompt",
    negative_prompt="prompt",
    height=LANDSCAPE_HEIGHT,
    width=LANDSCAPE_WIDTH,
    num_frames=MAX_FRAMES_MODEL,
    num_inference_steps=2,
    guidance_scale=1.0,
)
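
# Default prompts: the negative prompt is the usual quality/artifact blacklist
# seen in video-generation demos.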
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
default_negative_prompt = (
    "Bright tones, overexposed, static, blurred details, subtitles, style, works, "
    "paintings, images, static, overall gray, worst quality, low quality, JPEG "
    "compression residue, ugly, incomplete, extra fingers, poorly drawn hands, "
    "poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, "
    "still picture, messy background, three legs, many people in the background, "
    "walking backwards, watermark, text, signature"
)
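
# Per-generation settings: the requested duration (seconds) is converted to a
# frame count and clamped to the model's supported range.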
input_image = load_image("peng.png")
duration_seconds = MAX_DURATION
guidance_scale = 1.0
num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
current_seed = 42
resized_image = resize_image(input_image)
steps = 8
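
# Wrap the resized still image as a one-frame conditioning video at frame 0.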
video = load_video(export_to_video([resized_image]))
condition1 = LTXVideoCondition(video=video, frame_index=0)
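
# Image-to-video generation. guidance_scale=1.0 disables classifier-free
# guidance, which matches how the distilled checkpoint is meant to be run.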
output_frames_list = pipe(
    conditions=[condition1],
    prompt=default_prompt_i2v,
    negative_prompt=default_negative_prompt,
    height=resized_image.height,
    width=resized_image.width,
    num_frames=num_frames,
    guidance_scale=float(guidance_scale),
    num_inference_steps=int(steps),
    generator=torch.Generator(device="cuda").manual_seed(current_seed),
).frames[0]

export_to_video(output_frames_list, "output_original.mp4", fps=FIXED_FPS)
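
# With duration_seconds = MAX_DURATION this clamps to 96 frames, i.e. a
# four-second clip at 24 fps animating the input image.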