Fix `improve_texture` and hide video-to-video
app.py CHANGED
@@ -131,7 +131,7 @@ def calculate_new_dimensions(orig_w, orig_h):
 
 def get_duration(prompt, negative_prompt, input_image_filepath, input_video_filepath,
                  height_ui, width_ui, mode,
-
+                 duration_ui, # Removed ui_steps
                  ui_frames_to_use,
                  seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
                  progress):
@@ -143,7 +143,7 @@ def get_duration(prompt, negative_prompt, input_image_filepath, input_video_file
 @spaces.GPU(duration=get_duration)
 def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
              height_ui, width_ui, mode,
-
+             duration_ui, # Removed ui_steps
              ui_frames_to_use,
              seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
              progress=gr.Progress(track_tqdm=True)):
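Note on the signature change: on ZeroGPU Spaces, the `duration` argument of `@spaces.GPU` may be a callable that is invoked with the same arguments as the decorated function and returns the number of GPU seconds to request, which is why `get_duration` and `generate` must keep identical parameter lists (here both gain `duration_ui`). A minimal sketch of that pattern, with illustrative toy parameters rather than the app's full signature:

```python
import spaces  # ZeroGPU helper package, available on Hugging Face ZeroGPU Spaces

def get_duration(prompt, ui_frames_to_use, progress=None):
    # Called by the decorator with the same arguments as `generate`;
    # the return value is the number of GPU seconds to request.
    return 60 + ui_frames_to_use // 8

@spaces.GPU(duration=get_duration)
def generate(prompt, ui_frames_to_use, progress=None):
    # Heavy inference runs here on the GPU allocated for the duration above.
    return f"{ui_frames_to_use} frames for: {prompt}"
```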
@@ -245,12 +245,15 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
         multi_scale_pipeline_obj = LTXMultiScalePipeline(pipeline_instance, active_latent_upsampler)
 
         first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
-        first_pass_args["guidance_scale"] = float(ui_guidance_scale)
-
-
+        first_pass_args["guidance_scale"] = float(ui_guidance_scale) # UI overrides YAML
+        # num_inference_steps will be derived from len(timesteps) in the pipeline
+        first_pass_args.pop("num_inference_steps", None)
+
 
         second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
-        second_pass_args["guidance_scale"] = float(ui_guidance_scale)
+        second_pass_args["guidance_scale"] = float(ui_guidance_scale) # UI overrides YAML
+        # num_inference_steps will be derived from len(timesteps) in the pipeline
+        second_pass_args.pop("num_inference_steps", None)
 
         multi_scale_call_kwargs = call_kwargs.copy()
         multi_scale_call_kwargs.update({
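The fix above follows a copy, override, drop pattern: each pass starts from the YAML defaults, the UI guidance scale overrides the YAML value, and `num_inference_steps` is removed so the step count comes from the explicit `timesteps` list instead. A standalone sketch of that dictionary handling, using made-up config values rather than the Space's actual YAML:

```python
# Illustrative stand-in for PIPELINE_CONFIG_YAML["first_pass"] (values are made up)
first_pass_yaml = {
    "guidance_scale": 1.0,
    "num_inference_steps": 7,
    "timesteps": [1000, 993, 987, 981, 975, 909, 725],
}

ui_guidance_scale = 3.0

first_pass_args = first_pass_yaml.copy()
first_pass_args["guidance_scale"] = float(ui_guidance_scale)  # UI wins over YAML
first_pass_args.pop("num_inference_steps", None)              # step count now implied by timesteps

print(len(first_pass_args["timesteps"]))   # -> 7 steps, derived from the list
print(first_pass_args["guidance_scale"])   # -> 3.0, the UI value
```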
@@ -263,8 +266,16 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
         result_images_tensor = multi_scale_pipeline_obj(**multi_scale_call_kwargs).images
     else:
         single_pass_call_kwargs = call_kwargs.copy()
-
-
+        first_pass_config_from_yaml = PIPELINE_CONFIG_YAML.get("first_pass", {})
+
+        single_pass_call_kwargs["timesteps"] = first_pass_config_from_yaml.get("timesteps")
+        single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale) # UI overrides YAML
+        single_pass_call_kwargs["stg_scale"] = first_pass_config_from_yaml.get("stg_scale")
+        single_pass_call_kwargs["rescaling_scale"] = first_pass_config_from_yaml.get("rescaling_scale")
+        single_pass_call_kwargs["skip_block_list"] = first_pass_config_from_yaml.get("skip_block_list")
+
+        # Remove keys that might conflict or are not used in single pass / handled by above
+        single_pass_call_kwargs.pop("num_inference_steps", None)
         single_pass_call_kwargs.pop("first_pass", None)
         single_pass_call_kwargs.pop("second_pass", None)
         single_pass_call_kwargs.pop("downscale_factor", None)
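For the single-pass branch, the `pop(..., None)` calls matter because the kwargs are later splatted into the pipeline call, and any leftover multi-scale-only key would raise `TypeError: unexpected keyword argument`. A hedged sketch of that behaviour with a stand-in function (not the real LTX pipeline API):

```python
def fake_single_pass_pipeline(prompt, guidance_scale=1.0, timesteps=None, stg_scale=None):
    # Stand-in for the real pipeline call; unknown keyword args would raise TypeError.
    steps = len(timesteps or [])
    return f"{prompt!r}: cfg={guidance_scale}, steps={steps}, stg={stg_scale}"

call_kwargs = {
    "prompt": "a majestic dragon",
    "guidance_scale": 3.0,
    "timesteps": [1000, 900, 800],          # made-up values
    "stg_scale": 1.0,                        # made-up value
    # multi-scale-only keys that must not reach the single-pass call:
    "first_pass": {"guidance_scale": 1.0},
    "second_pass": {"guidance_scale": 1.0},
    "downscale_factor": 0.6666,
}

single_pass_call_kwargs = call_kwargs.copy()
for key in ("num_inference_steps", "first_pass", "second_pass", "downscale_factor"):
    single_pass_call_kwargs.pop(key, None)  # drop keys the single-pass call does not accept

print(fake_single_pass_pipeline(**single_pass_call_kwargs))
```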
@@ -335,7 +346,7 @@ with gr.Blocks(css=css) as demo:
                 video_n_hidden = gr.Textbox(label="video_n", visible=False, value=None)
                 t2v_prompt = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
                 t2v_button = gr.Button("Generate Text-to-Video", variant="primary")
-            with gr.Tab("video-to-video") as video_tab:
+            with gr.Tab("video-to-video", visible=False) as video_tab:
                 image_v_hidden = gr.Textbox(label="image_v", visible=False, value=None)
                 video_v2v = gr.Video(label="Input Video", sources=["upload", "webcam"]) # type defaults to filepath
                 frames_to_use = gr.Slider(label="Frames to use from input video", minimum=9, maximum=MAX_NUM_FRAMES, value=9, step=8, info="Number of initial frames to use for conditioning/transformation. Must be N*8+1.")
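Hiding the tab with `visible=False` keeps `video_v2v`, `frames_to_use`, and the `video_to_video` endpoint defined while removing the tab from the rendered UI. A minimal sketch of the same idea in isolation (Gradio 4.x assumed, where `gr.Tab` accepts `visible`):

```python
import gradio as gr  # Gradio 4.x assumed

with gr.Blocks() as demo:
    with gr.Tab("text-to-video"):
        gr.Textbox(label="Prompt")
    # The tab and its components stay defined (so event wiring and API
    # endpoints keep working), but the tab is not shown to users.
    with gr.Tab("video-to-video", visible=False):
        gr.Video(label="Input Video")

if __name__ == "__main__":
    demo.launch()
```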
@@ -363,8 +374,9 @@ with gr.Blocks(css=css) as demo:
                     randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=False)
                 with gr.Row():
                     guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
-
-
+                    # Removed steps_input slider
+                    # default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7))
+                    # steps_input = gr.Slider(label="Inference Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=default_steps, step=1, info="Number of denoising steps. More steps can improve quality but increase time. If YAML defines 'timesteps' for a pass, this UI value is ignored for that pass.")
                 with gr.Row():
                     height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
                     width_input = gr.Slider(label="Width", value=704, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
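The slider default above is read with chained `.get()` calls, so a YAML without a `first_pass` section (or without a `guidance_scale` key) quietly falls back to 1.0 instead of raising. A two-line illustration with made-up values:

```python
PIPELINE_CONFIG_YAML = {"first_pass": {"guidance_scale": 3.0}}  # illustrative values
print(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0))  # -> 3.0 from the config
print({}.get("first_pass", {}).get("guidance_scale", 1.0))                    # -> 1.0 fallback
```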
@@ -436,17 +448,17 @@ with gr.Blocks(css=css) as demo:
     # --- INPUT LISTS (remain the same structurally) ---
     t2v_inputs = [t2v_prompt, negative_prompt_input, image_n_hidden, video_n_hidden,
                   height_input, width_input, gr.State("text-to-video"),
-
+                  duration_input, gr.State(0), # Removed steps_input
                   seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
     i2v_inputs = [i2v_prompt, negative_prompt_input, image_i2v, video_i_hidden,
                   height_input, width_input, gr.State("image-to-video"),
-
+                  duration_input, gr.State(0), # Removed steps_input
                   seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
     v2v_inputs = [v2v_prompt, negative_prompt_input, image_v_hidden, video_v2v,
                   height_input, width_input, gr.State("video-to-video"),
-
+                  duration_input, frames_to_use, # Removed steps_input
                   seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
     t2v_button.click(fn=generate, inputs=t2v_inputs, outputs=[output_video], api_name="text_to_video")
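In the input lists, `gr.State("text-to-video")` and `gr.State(0)` act as constant inputs: each button passes a fixed mode string, plus a placeholder frame count where the tab has no frames slider, into the one shared `generate` function. A minimal sketch of that wiring pattern (the handler below is illustrative):

```python
import gradio as gr

def describe(mode, frames_to_use):
    # Shared handler: the mode string and frame count arrive via the inputs list.
    return f"mode={mode}, frames={frames_to_use}"

with gr.Blocks() as demo:
    out = gr.Textbox(label="Result")
    frames = gr.Slider(9, 257, value=9, step=8, label="Frames to use")
    t2v_btn = gr.Button("Text-to-Video")
    v2v_btn = gr.Button("Video-to-Video")

    # gr.State(...) acts as a constant input where a tab has no matching widget.
    t2v_btn.click(fn=describe, inputs=[gr.State("text-to-video"), gr.State(0)], outputs=[out])
    v2v_btn.click(fn=describe, inputs=[gr.State("video-to-video"), frames], outputs=[out])

if __name__ == "__main__":
    demo.launch()
```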