multimodalart (HF Staff) committed
Commit b4e4e06 · verified · 1 parent: 05ab75c

Fix `improve_texture` and hide video-to-video

Files changed (1): app.py (+26 -14)
app.py CHANGED
@@ -131,7 +131,7 @@ def calculate_new_dimensions(orig_w, orig_h):
 
 def get_duration(prompt, negative_prompt, input_image_filepath, input_video_filepath,
                  height_ui, width_ui, mode,
-                 ui_steps, duration_ui,
+                 duration_ui,  # Removed ui_steps
                  ui_frames_to_use,
                  seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
                  progress):
@@ -143,7 +143,7 @@ def get_duration(prompt, negative_prompt, input_image_filepath, input_video_filepath,
 @spaces.GPU(duration=get_duration)
 def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
              height_ui, width_ui, mode,
-             ui_steps, duration_ui,
+             duration_ui,  # Removed ui_steps
              ui_frames_to_use,
              seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
              progress=gr.Progress(track_tqdm=True)):
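Note that get_duration is passed to @spaces.GPU(duration=...) and is defined with exactly the same parameter list as generate, which is why ui_steps has to be dropped from both signatures at once: a callable duration is evaluated against the same call arguments as the decorated function. A minimal, self-contained sketch of that pattern (the argument subset and the duration formula below are placeholders, not the app's actual logic):

import spaces  # Hugging Face ZeroGPU helper, available on Spaces

def get_duration(prompt, duration_ui, progress):
    # Placeholder estimate: budget more GPU seconds for longer requested clips.
    return 20 + 10 * int(duration_ui)

@spaces.GPU(duration=get_duration)  # the duration callback mirrors generate's signature
def generate(prompt, duration_ui, progress=None):
    ...  # generation logic would go here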
@@ -245,12 +245,15 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
         multi_scale_pipeline_obj = LTXMultiScalePipeline(pipeline_instance, active_latent_upsampler)
 
         first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
-        first_pass_args["guidance_scale"] = float(ui_guidance_scale)
-        if "timesteps" not in first_pass_args:
-            first_pass_args["num_inference_steps"] = int(ui_steps)
+        first_pass_args["guidance_scale"] = float(ui_guidance_scale)  # UI overrides YAML
+        # num_inference_steps will be derived from len(timesteps) in the pipeline
+        first_pass_args.pop("num_inference_steps", None)
+
 
         second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
-        second_pass_args["guidance_scale"] = float(ui_guidance_scale)
+        second_pass_args["guidance_scale"] = float(ui_guidance_scale)  # UI overrides YAML
+        # num_inference_steps will be derived from len(timesteps) in the pipeline
+        second_pass_args.pop("num_inference_steps", None)
 
         multi_scale_call_kwargs = call_kwargs.copy()
         multi_scale_call_kwargs.update({
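With the steps slider gone, neither pass receives an explicit num_inference_steps any more; both rely on the timesteps list carried in the YAML config, so the effective step count is simply the length of that schedule. A small illustration of the idea (the timestep values are invented for the example; the real ones come from PIPELINE_CONFIG_YAML):

first_pass_args = {"timesteps": [1000, 909, 725, 421, 0], "guidance_scale": 1.0}
first_pass_args.pop("num_inference_steps", None)   # drop any count that could conflict
num_steps = len(first_pass_args["timesteps"])      # 5 -- derived from the schedule, not the UI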
@@ -263,8 +266,16 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
         result_images_tensor = multi_scale_pipeline_obj(**multi_scale_call_kwargs).images
     else:
         single_pass_call_kwargs = call_kwargs.copy()
-        single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale)
-        single_pass_call_kwargs["num_inference_steps"] = int(ui_steps)
+        first_pass_config_from_yaml = PIPELINE_CONFIG_YAML.get("first_pass", {})
+
+        single_pass_call_kwargs["timesteps"] = first_pass_config_from_yaml.get("timesteps")
+        single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale)  # UI overrides YAML
+        single_pass_call_kwargs["stg_scale"] = first_pass_config_from_yaml.get("stg_scale")
+        single_pass_call_kwargs["rescaling_scale"] = first_pass_config_from_yaml.get("rescaling_scale")
+        single_pass_call_kwargs["skip_block_list"] = first_pass_config_from_yaml.get("skip_block_list")
+
+        # Remove keys that might conflict or are not used in single pass / handled by above
+        single_pass_call_kwargs.pop("num_inference_steps", None)
         single_pass_call_kwargs.pop("first_pass", None)
         single_pass_call_kwargs.pop("second_pass", None)
         single_pass_call_kwargs.pop("downscale_factor", None)
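For the single-pass branch (used when improve_texture is off), the same first-pass section of the YAML now supplies the schedule and guidance-related settings directly instead of the removed slider. Roughly, the dictionary being read looks like the sketch below; the key names come from the diff, while every value is a placeholder:

# Hypothetical contents of PIPELINE_CONFIG_YAML["first_pass"] -- placeholder values only.
first_pass_config_from_yaml = {
    "timesteps": [1000, 909, 725, 421, 0],  # explicit denoising schedule
    "guidance_scale": 1.0,                  # overridden by the UI slider at call time
    "stg_scale": 1.0,
    "rescaling_scale": 0.7,
    "skip_block_list": [42],
}
single_pass_call_kwargs = {k: first_pass_config_from_yaml.get(k)
                           for k in ("timesteps", "stg_scale", "rescaling_scale", "skip_block_list")}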
@@ -335,7 +346,7 @@ with gr.Blocks(css=css) as demo:
                 video_n_hidden = gr.Textbox(label="video_n", visible=False, value=None)
                 t2v_prompt = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
                 t2v_button = gr.Button("Generate Text-to-Video", variant="primary")
-            with gr.Tab("video-to-video") as video_tab:
+            with gr.Tab("video-to-video", visible=False) as video_tab:
                 image_v_hidden = gr.Textbox(label="image_v", visible=False, value=None)
                 video_v2v = gr.Video(label="Input Video", sources=["upload", "webcam"])  # type defaults to filepath
                 frames_to_use = gr.Slider(label="Frames to use from input video", minimum=9, maximum=MAX_NUM_FRAMES, value=9, step=8, info="Number of initial frames to use for conditioning/transformation. Must be N*8+1.")
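Hiding the tab with visible=False (rather than deleting it) keeps video_v2v, frames_to_use, and the other child components defined, so the v2v input list and click wiring further down still resolve. A standalone sketch of the pattern, independent of this app:

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("text-to-video"):
        gr.Textbox(label="Prompt")
    # Hidden tab: its components still exist in the layout, but the tab is not shown to users.
    with gr.Tab("video-to-video", visible=False):
        gr.Video(label="Input Video")

if __name__ == "__main__":
    demo.launch()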
@@ -363,8 +374,9 @@ with gr.Blocks(css=css) as demo:
             randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=False)
             with gr.Row():
                 guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
-                default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7))
-                steps_input = gr.Slider(label="Inference Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=default_steps, step=1, info="Number of denoising steps. More steps can improve quality but increase time. If YAML defines 'timesteps' for a pass, this UI value is ignored for that pass.")
+                # Removed steps_input slider
+                # default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7))
+                # steps_input = gr.Slider(label="Inference Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=default_steps, step=1, info="Number of denoising steps. More steps can improve quality but increase time. If YAML defines 'timesteps' for a pass, this UI value is ignored for that pass.")
             with gr.Row():
                 height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
                 width_input = gr.Slider(label="Width", value=704, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
@@ -436,17 +448,17 @@ with gr.Blocks(css=css) as demo:
     # --- INPUT LISTS (remain the same structurally) ---
     t2v_inputs = [t2v_prompt, negative_prompt_input, image_n_hidden, video_n_hidden,
                   height_input, width_input, gr.State("text-to-video"),
-                  steps_input, duration_input, gr.State(0),
+                  duration_input, gr.State(0),  # Removed steps_input
                   seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
     i2v_inputs = [i2v_prompt, negative_prompt_input, image_i2v, video_i_hidden,
                   height_input, width_input, gr.State("image-to-video"),
-                  steps_input, duration_input, gr.State(0),
+                  duration_input, gr.State(0),  # Removed steps_input
                   seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
     v2v_inputs = [v2v_prompt, negative_prompt_input, image_v_hidden, video_v2v,
                   height_input, width_input, gr.State("video-to-video"),
-                  steps_input, duration_input, frames_to_use,
+                  duration_input, frames_to_use,  # Removed steps_input
                   seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
 
     t2v_button.click(fn=generate, inputs=t2v_inputs, outputs=[output_video], api_name="text_to_video")
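Because ui_steps disappears from the middle of the positional argument lists, anything that calls generate positionally (the button wiring above, or an external caller using the named API endpoints) has to drop that value as well. A runnable stub showing only the new argument order, with placeholder values standing in for the Gradio defaults:

# Stub with the post-change signature from the diff; the real generate() lives in app.py.
def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
             height_ui, width_ui, mode,
             duration_ui,              # ui_steps used to sit immediately before this argument
             ui_frames_to_use,
             seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
             progress=None):
    return "output.mp4"  # placeholder result

generate("A majestic dragon flying over a medieval castle", "", None, None,
         512, 704, "text-to-video",
         2,           # duration_ui (placeholder)
         0,           # ui_frames_to_use (unused for text-to-video)
         42, True,    # seed, randomize seed
         1.0,         # guidance scale
         True)        # improve_texture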
 