multimodalart (HF Staff) committed · verified
Commit 4d68dfd · 1 Parent(s): 6ad4062

Update app.py

Files changed (1): app.py (+122, -29)
app.py CHANGED
@@ -11,7 +11,6 @@ import tempfile
 from PIL import Image
 from huggingface_hub import hf_hub_download
 import shutil
-import math # For math.round, though built-in round works for floats
 
 from inference import (
     create_ltx_video_pipeline,
@@ -89,13 +88,56 @@ if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
 target_inference_device = "cuda"
 print(f"Target inference device: {target_inference_device}")
 pipeline_instance.to(target_inference_device)
-if latent_upsampler_instance: # Check if it was created before moving
+if latent_upsampler_instance:
     latent_upsampler_instance.to(target_inference_device)
 
+
+# --- Helper function for dimension calculation ---
+MIN_DIM_SLIDER = 256 # As defined in the sliders minimum attribute
+TARGET_FIXED_SIDE = 512 # Desired fixed side length as per requirement
+
+def calculate_new_dimensions(orig_w, orig_h):
+    """
+    Calculates new dimensions for height and width sliders based on original media dimensions.
+    Ensures one side is TARGET_FIXED_SIDE, the other is scaled proportionally,
+    both are multiples of 32, and within [MIN_DIM_SLIDER, MAX_IMAGE_SIZE].
+    """
+    if orig_w == 0 or orig_h == 0:
+        # Default to TARGET_FIXED_SIDE square if original dimensions are invalid
+        return int(TARGET_FIXED_SIDE), int(TARGET_FIXED_SIDE)
+
+    if orig_w >= orig_h: # Landscape or square
+        new_h = TARGET_FIXED_SIDE
+        aspect_ratio = orig_w / orig_h
+        new_w_ideal = new_h * aspect_ratio
+
+        # Round to nearest multiple of 32
+        new_w = round(new_w_ideal / 32) * 32
+
+        # Clamp to [MIN_DIM_SLIDER, MAX_IMAGE_SIZE]
+        new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
+        # Ensure new_h is also clamped (TARGET_FIXED_SIDE should be within these bounds if configured correctly)
+        new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
+    else: # Portrait
+        new_w = TARGET_FIXED_SIDE
+        aspect_ratio = orig_h / orig_w # Use H/W ratio for portrait scaling
+        new_h_ideal = new_w * aspect_ratio
+
+        # Round to nearest multiple of 32
+        new_h = round(new_h_ideal / 32) * 32
+
+        # Clamp to [MIN_DIM_SLIDER, MAX_IMAGE_SIZE]
+        new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
+        # Ensure new_w is also clamped
+        new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
+
+    return int(new_h), int(new_w)
+
+
 @spaces.GPU
 def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
              height_ui, width_ui, mode,
-             ui_steps, duration_ui, # << CHANGED from num_frames_ui
+             ui_steps, duration_ui,
              ui_frames_to_use,
             seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
             progress=gr.Progress(track_tqdm=True)):
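The `calculate_new_dimensions` helper added above fixes the shorter side of the uploaded media at `TARGET_FIXED_SIDE` (512), scales the other side by the original aspect ratio, snaps it to a multiple of 32, and clamps both sides to the slider range. A quick sanity check, meant to be run in app.py's module scope; the expected values assume `MAX_IMAGE_SIZE` is at least 896 so no clamping applies:

assert calculate_new_dimensions(1920, 1080) == (512, 896)   # landscape: height fixed at 512, 512*(1920/1080) ~ 910 -> 896
assert calculate_new_dimensions(1080, 1920) == (896, 512)   # portrait: width fixed at 512
assert calculate_new_dimensions(1024, 1024) == (512, 512)   # square input takes the landscape branch
assert calculate_new_dimensions(0, 0) == (512, 512)         # invalid input falls back to a 512x512 default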
@@ -104,33 +146,25 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
         seed_ui = random.randint(0, 2**32 - 1)
     seed_everething(int(seed_ui))
 
-    # Convert duration_ui (seconds) to actual_num_frames (N*8+1 format)
     target_frames_ideal = duration_ui * FPS
     target_frames_rounded = round(target_frames_ideal)
-    if target_frames_rounded < 1: # ensure positive for calculation
+    if target_frames_rounded < 1:
         target_frames_rounded = 1
 
-    # Calculate N for N*8+1, ensuring it's rounded to the nearest integer
-    # (target_frames_rounded - 1) could be float if target_frames_rounded is float
     n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
     actual_num_frames = int(n_val * 8 + 1)
 
-    # Clamp to the allowed min (9) and max (MAX_NUM_FRAMES) N*8+1 values
     actual_num_frames = max(9, actual_num_frames)
     actual_num_frames = min(MAX_NUM_FRAMES, actual_num_frames)
 
     actual_height = int(height_ui)
     actual_width = int(width_ui)
-    # actual_num_frames is now calculated above
 
     height_padded = ((actual_height - 1) // 32 + 1) * 32
     width_padded = ((actual_width - 1) // 32 + 1) * 32
-    # This padding ensures the model gets a frame count that is N*8+1
-    # Since actual_num_frames is already N*8+1, this should preserve it.
     num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
     if num_frames_padded != actual_num_frames:
         print(f"Warning: actual_num_frames ({actual_num_frames}) and num_frames_padded ({num_frames_padded}) differ. Using num_frames_padded for pipeline.")
-    # This case should ideally not happen if actual_num_frames is correctly N*8+1 and >= 9.
 
     padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
 
@@ -139,7 +173,7 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
         "negative_prompt": negative_prompt,
         "height": height_padded,
         "width": width_padded,
-        "num_frames": num_frames_padded, # Use the padded value for the model
+        "num_frames": num_frames_padded,
         "frame_rate": int(FPS),
         "generator": torch.Generator(device=target_inference_device).manual_seed(int(seed_ui)),
         "output_type": "pt",
@@ -184,7 +218,7 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
                 media_path=input_video_filepath,
                 height=actual_height,
                 width=actual_width,
-                max_frames=int(ui_frames_to_use), # This is from a separate slider for V2V
+                max_frames=int(ui_frames_to_use),
                 padding=padding_values
             ).to(target_inference_device)
         except Exception as e:
@@ -192,15 +226,10 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
             raise gr.Error(f"Could not load video: {e}")
 
     print(f"Moving models to {target_inference_device} for inference (if not already there)...")
-    # Models are moved globally once, no need to move per call unless strategy changes.
-    # pipeline_instance.to(target_inference_device)
-    # if latent_upsampler_instance:
-    #     latent_upsampler_instance.to(target_inference_device)
 
     active_latent_upsampler = None
     if improve_texture_flag and latent_upsampler_instance:
         active_latent_upsampler = latent_upsampler_instance
-    #print("Models moved.")
 
     result_images_tensor = None
     if improve_texture_flag:
@@ -230,7 +259,6 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
         single_pass_call_kwargs = call_kwargs.copy()
         single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale)
         single_pass_call_kwargs["num_inference_steps"] = int(ui_steps)
-        # These keys might not exist if improve_texture_flag is false from the start of call_kwargs
         single_pass_call_kwargs.pop("first_pass", None)
         single_pass_call_kwargs.pop("second_pass", None)
         single_pass_call_kwargs.pop("downscale_factor", None)
@@ -245,7 +273,6 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
     slice_h_end = -pad_bottom if pad_bottom > 0 else None
     slice_w_end = -pad_right if pad_right > 0 else None
 
-    # Crop to actual_num_frames, which is the desired output length
     result_images_tensor = result_images_tensor[
         :, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end
     ]
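The slicing above crops the generated tensor back to the requested size: height and width were rounded up to multiples of 32 before inference, and the pad is removed afterwards, with a zero pad mapped to None because `tensor[..., :-0]` would yield an empty slice. A minimal round-trip sketch; the symmetric top/bottom and left/right split below is a hypothetical stand-in for whatever `calculate_padding` returns in app.py:

import torch
import torch.nn.functional as F

h, w = 510, 700                                  # requested output size
h_pad = ((h - 1) // 32 + 1) * 32                 # 512
w_pad = ((w - 1) // 32 + 1) * 32                 # 704
pad_top = (h_pad - h) // 2                       # hypothetical symmetric split
pad_bottom = (h_pad - h) - pad_top
pad_left = (w_pad - w) // 2
pad_right = (w_pad - w) - pad_left

frames = torch.zeros(1, 3, 9, h, w)              # (batch, channels, frames, H, W)
padded = F.pad(frames, (pad_left, pad_right, pad_top, pad_bottom))
assert padded.shape[-2:] == (h_pad, w_pad)

# crop back, mirroring the slicing in the hunk above
slice_h_end = -pad_bottom if pad_bottom > 0 else None
slice_w_end = -pad_right if pad_right > 0 else None
cropped = padded[:, :, :9, pad_top:slice_h_end, pad_left:slice_w_end]
assert cropped.shape == frames.shape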
@@ -297,6 +324,7 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
 
     return output_video_path
 
+
 # --- Gradio UI Definition ---
 css="""
 #col-container {
@@ -308,6 +336,7 @@ css="""
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# LTX Video 0.9.7 Distilled")
     gr.Markdown("Fast high quality video generation. [Model](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-2b-0.9.6-distilled-04-25.safetensors) [GitHub](https://github.com/Lightricks/LTX-Video) [Diffusers](#)")
+
     with gr.Row():
         with gr.Column():
             with gr.Tab("image-to-video") as image_tab:
@@ -322,7 +351,7 @@ with gr.Blocks(css=css) as demo:
                 t2v_button = gr.Button("Generate Text-to-Video", variant="primary")
             with gr.Tab("video-to-video") as video_tab:
                 image_v_hidden = gr.Textbox(label="image_v", visible=False, value=None)
-                video_v2v = gr.Video(label="Input Video", sources=["upload", "webcam"])
+                video_v2v = gr.Video(label="Input Video", sources=["upload", "webcam"]) # type defaults to filepath
                 frames_to_use = gr.Slider(label="Frames to use from input video", minimum=9, maximum=MAX_NUM_FRAMES, value=9, step=8, info="Number of initial frames to use for conditioning/transformation. Must be N*8+1.")
                 v2v_prompt = gr.Textbox(label="Prompt", value="Change the style to cinematic anime", lines=3)
                 v2v_button = gr.Button("Generate Video-to-Video", variant="primary")
@@ -347,26 +376,90 @@ with gr.Blocks(css=css) as demo:
            randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=False)
        with gr.Row():
            guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
-           default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7)) # Default to 7 if not found
+           default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7))
            steps_input = gr.Slider(label="Inference Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=default_steps, step=1, info="Number of denoising steps. More steps can improve quality but increase time. If YAML defines 'timesteps' for a pass, this UI value is ignored for that pass.")
        with gr.Row():
-           height_input = gr.Slider(label="Height", value=512, step=32, minimum=256, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
-           width_input = gr.Slider(label="Width", value=704, step=32, minimum=256, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
+           height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
+           width_input = gr.Slider(label="Width", value=704, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
+
+
+    # --- Event handlers for updating dimensions on upload ---
+    def handle_image_upload_for_dims(image_filepath, current_h, current_w):
+        if not image_filepath: # Image cleared or no image initially
+            # Keep current slider values if image is cleared or no input
+            return gr.update(value=current_h), gr.update(value=current_w)
+        try:
+            img = Image.open(image_filepath)
+            orig_w, orig_h = img.size
+            new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
+            return gr.update(value=new_h), gr.update(value=new_w)
+        except Exception as e:
+            print(f"Error processing image for dimension update: {e}")
+            # Keep current slider values on error
+            return gr.update(value=current_h), gr.update(value=current_w)
+
+    def handle_video_upload_for_dims(video_filepath, current_h, current_w):
+        if not video_filepath: # Video cleared or no video initially
+            return gr.update(value=current_h), gr.update(value=current_w)
+        try:
+            # Ensure video_filepath is a string for os.path.exists and imageio
+            video_filepath_str = str(video_filepath)
+            if not os.path.exists(video_filepath_str):
+                print(f"Video file path does not exist for dimension update: {video_filepath_str}")
+                return gr.update(value=current_h), gr.update(value=current_w)
+
+            orig_w, orig_h = -1, -1
+            with imageio.get_reader(video_filepath_str) as reader:
+                meta = reader.get_meta_data()
+                if 'size' in meta:
+                    orig_w, orig_h = meta['size']
+                else:
+                    # Fallback: read first frame if 'size' not in metadata
+                    try:
+                        first_frame = reader.get_data(0)
+                        # Shape is (h, w, c) for frames
+                        orig_h, orig_w = first_frame.shape[0], first_frame.shape[1]
+                    except Exception as e_frame:
+                        print(f"Could not get video size from metadata or first frame: {e_frame}")
+                        return gr.update(value=current_h), gr.update(value=current_w)
+
+            if orig_w == -1 or orig_h == -1: # If dimensions couldn't be determined
+                print(f"Could not determine dimensions for video: {video_filepath_str}")
+                return gr.update(value=current_h), gr.update(value=current_w)
+
+            new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
+            return gr.update(value=new_h), gr.update(value=new_w)
+        except Exception as e:
+            # Log type of video_filepath for debugging if it's not a path-like string
+            print(f"Error processing video for dimension update: {e} (Path: {video_filepath}, Type: {type(video_filepath)})")
+            return gr.update(value=current_h), gr.update(value=current_w)
+
+    # Attach upload handlers
+    image_i2v.upload(
+        fn=handle_image_upload_for_dims,
+        inputs=[image_i2v, height_input, width_input],
+        outputs=[height_input, width_input]
+    )
+    video_v2v.upload(
+        fn=handle_video_upload_for_dims,
+        inputs=[video_v2v, height_input, width_input],
+        outputs=[height_input, width_input]
+    )
 
-    # --- UPDATED INPUT LISTS ---
+    # --- INPUT LISTS (remain the same structurally) ---
    t2v_inputs = [t2v_prompt, negative_prompt_input, image_n_hidden, video_n_hidden,
                  height_input, width_input, gr.State("text-to-video"),
-                 steps_input, duration_input, gr.State(0), # Replaced num_frames_input with duration_input
+                 steps_input, duration_input, gr.State(0),
                  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
    i2v_inputs = [i2v_prompt, negative_prompt_input, image_i2v, video_i_hidden,
                  height_input, width_input, gr.State("image-to-video"),
-                 steps_input, duration_input, gr.State(0), # Replaced num_frames_input with duration_input
+                 steps_input, duration_input, gr.State(0),
                  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
    v2v_inputs = [v2v_prompt, negative_prompt_input, image_v_hidden, video_v2v,
                  height_input, width_input, gr.State("video-to-video"),
-                 steps_input, duration_input, frames_to_use, # Replaced num_frames_input with duration_input
+                 steps_input, duration_input, frames_to_use,
                  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
    t2v_button.click(fn=generate, inputs=t2v_inputs, outputs=[output_video], api_name="text_to_video")
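The upload handlers in the last hunk follow a common Gradio pattern: a component's `.upload()` event reads the native size of the uploaded media and pushes new values into the height/width sliders via `gr.update`. A stripped-down, self-contained sketch of the same wiring for the image case; the component names, slider bounds, and the 512-pixel target below are illustrative, not the ones used in app.py:

import gradio as gr
from PIL import Image

def snap_to_32(value, lo=256, hi=1280):
    # round to the nearest multiple of 32 and clamp to the slider bounds (illustrative bounds)
    return max(lo, min(hi, round(value / 32) * 32))

def on_upload(image_path, cur_h, cur_w):
    if not image_path:                             # cleared input: keep current slider values
        return gr.update(value=cur_h), gr.update(value=cur_w)
    w, h = Image.open(image_path).size
    scale = 512 / min(w, h)                        # fix the shorter side at 512
    return gr.update(value=snap_to_32(h * scale)), gr.update(value=snap_to_32(w * scale))

with gr.Blocks() as demo:
    image_in = gr.Image(type="filepath", label="Input Image")
    height_sl = gr.Slider(256, 1280, value=512, step=32, label="Height")
    width_sl = gr.Slider(256, 1280, value=704, step=32, label="Width")
    image_in.upload(on_upload, inputs=[image_in, height_sl, width_sl],
                    outputs=[height_sl, width_sl])

demo.launch()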