This Pull Request adds support for an end frame: an optional final image that the generated video is steered toward, used by the image-to-video, start-and-end-frame, and video-extension modes.
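In the new worker paths, the end frame is VAE-encoded like the start frame, its CLIP-vision embedding is averaged with the start frame's, and its latent is repeated for `end_stillness` extra latent frames so the clip holds on the end image. The snippet below is only an illustrative sketch of that conditioning step, not code from this PR: the helper name `blend_frame_conditioning` and the dummy tensors are placeholders for what `vae_encode` and `hf_clip_vision_encode` return in `app.py`.

```python
import torch
from typing import Optional, Tuple


def blend_frame_conditioning(
    start_embedding: torch.Tensor,
    end_embedding: Optional[torch.Tensor],
    start_latent: torch.Tensor,
    end_latent: Optional[torch.Tensor],
    end_stillness: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Combine start/end-frame conditioning roughly the way worker_start_end does.

    start_embedding / end_embedding: CLIP-vision hidden states, shape [1, tokens, dim].
    start_latent / end_latent: VAE latents, shape [1, channels, 1, height // 8, width // 8].
    """
    if end_embedding is not None:
        # Average the two CLIP-vision embeddings instead of conditioning on the start frame only.
        image_embedding = (start_embedding + end_embedding) / 2
    else:
        image_embedding = start_embedding

    if end_latent is not None:
        # Repeat the end latent 1 + end_stillness times so the video lingers on the end image.
        clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
    else:
        # Without an end frame, fall back to a zero "post" context, as the original worker effectively does.
        clean_latents_post = torch.zeros_like(start_latent)

    clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
    return image_embedding, clean_latents


# Dummy tensors, shapes only; real values come from hf_clip_vision_encode / vae_encode.
emb = torch.randn(1, 257, 1152)
lat = torch.randn(1, 16, 1, 60, 80)
image_embedding, clean_latents = blend_frame_conditioning(emb, emb.clone(), lat, lat.clone(), end_stillness=1)
print(image_embedding.shape, clean_latents.shape)  # [1, 257, 1152] and [1, 16, 3, 60, 80]
```

The averaged embedding keeps a single image-conditioning tensor for `sample_hunyuan`, while the repeated end latent is what anchors the final frames (see `worker_start_end` and the updated `compute_latent` in the diff below).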

#2
.gitattributes CHANGED
@@ -41,3 +41,5 @@ img_examples/Example1.png filter=lfs diff=lfs merge=lfs -text
41
  img_examples/Example2.webp filter=lfs diff=lfs merge=lfs -text
42
  img_examples/Example3.jpg filter=lfs diff=lfs merge=lfs -text
43
  img_examples/Example4.webp filter=lfs diff=lfs merge=lfs -text
44
+ img_examples/Example5.png filter=lfs diff=lfs merge=lfs -text
45
+ img_examples/Example6.png filter=lfs diff=lfs merge=lfs -text
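The two new images are tracked through Git LFS like the existing samples and presumably back an example row that exercises the new end-frame inputs. Below is a purely hypothetical sketch of how such a start/end pair could be wired into a `gr.Examples` block; the component names, prompt text, and file roles are assumptions, not the actual rows added in `app.py`.

```python
import gradio as gr

with gr.Blocks() as demo:
    input_image = gr.Image(type="numpy", label="Start frame")
    end_image = gr.Image(type="numpy", label="End frame")
    prompt = gr.Textbox(label="Prompt")

    # Hypothetical example row: paths assume the LFS-tracked files exist in img_examples/.
    gr.Examples(
        label="Start & end frame example",
        examples=[[
            "./img_examples/Example5.png",
            "./img_examples/Example6.png",
            "A smooth, continuous camera move from the first image to the second",
        ]],
        inputs=[input_image, end_image, prompt],
        cache_examples=False,
    )
```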
app.py CHANGED
@@ -7,7 +7,12 @@ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.di
7
  try:
8
  import spaces
9
  except:
10
- print("Not on HuggingFace")
11
  import gradio as gr
12
  import torch
13
  import traceback
@@ -17,6 +22,7 @@ import numpy as np
17
  import random
18
  import time
19
  import math
 
20
  # 20250506 pftq: Added for video input loading
21
  import decord
22
  # 20250506 pftq: Added for progress bars in video_encode
@@ -198,9 +204,6 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
198
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
199
  #print(f"Tensor shape: {frames_pt.shape}")
200
 
201
- # 20250507 pftq: Save pixel frames for use in worker
202
- input_video_pixels = frames_pt.cpu()
203
-
204
  # 20250506 pftq: Move to device
205
  #print(f"Moving tensor to device: {device}")
206
  frames_pt = frames_pt.to(device)
@@ -252,7 +255,7 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
252
  torch.cuda.empty_cache()
253
  #print("VAE moved back to CPU, CUDA cache cleared")
254
 
255
- return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
256
 
257
  except Exception as e:
258
  print(f"Error in video_encode: {str(e)}")
@@ -305,8 +308,67 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
305
  print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
306
  return False
307
308
  @torch.no_grad()
309
- def worker(input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
310
  def encode_prompt(prompt, n_prompt):
311
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
312
 
@@ -401,6 +463,8 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
401
  return [start_latent, image_encoder_last_hidden_state]
402
 
403
  [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
404
 
405
  # Dtype
406
 
@@ -412,7 +476,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
412
 
413
  rnd = torch.Generator("cpu").manual_seed(seed)
414
 
415
- history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
416
  start_latent = start_latent.to(history_latents)
417
  history_pixels = None
418
 
@@ -496,7 +560,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
496
  [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
497
 
498
  if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
499
- prompt_parameters[prompt_index] = None
500
 
501
  if not high_vram:
502
  unload_complete_models()
@@ -544,6 +608,13 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
544
  clean_latent_4x_indices=clean_latent_4x_indices,
545
  callback=callback,
546
  )
547
 
548
  [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
549
 
@@ -557,7 +628,8 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
557
  real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
558
  zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
559
  history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
560
- real_history_latents = zero_latents = None
 
561
 
562
  forward = True
563
  section_index = first_section_index
@@ -575,9 +647,293 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
575
  stream.output_queue.push(('end', None))
576
  return
577
578
  # 20250506 pftq: Modified worker to accept video input and clean frame count
579
  @torch.no_grad()
580
- def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
581
  def encode_prompt(prompt, n_prompt):
582
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
583
 
@@ -602,8 +958,9 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
602
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
603
 
604
  # 20250506 pftq: Encode video
605
- start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)[:6]
606
- start_latent = start_latent.to(dtype=torch.float32).cpu()
 
607
  video_latents = video_latents.cpu()
608
 
609
  total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
@@ -640,12 +997,29 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
640
  load_model_as_complete(image_encoder, target_device=gpu)
641
 
642
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
643
 
644
  # Clean GPU
645
  if not high_vram:
646
- unload_complete_models(image_encoder)
647
 
648
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
 
649
 
650
  # Dtype
651
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
@@ -672,7 +1046,13 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
672
  def callback(d):
673
  return
674
 
675
- def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
676
  # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
677
  available_frames = history_latents.shape[2] # Number of latent frames
678
  max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
@@ -686,11 +1066,11 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
686
  total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
687
  total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
688
 
689
- indices = torch.arange(0, 1 + num_4x_frames + num_2x_frames + effective_clean_frames + adjusted_latent_frames).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
690
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
691
- [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
692
  )
693
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
694
 
695
  # 20250506 pftq: Split history_latents dynamically based on available frames
696
  fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
@@ -723,7 +1103,10 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
723
  if effective_clean_frames > 0 and split_idx < len(splits):
724
  clean_latents_1x = splits[split_idx]
725
 
726
- clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
727
 
728
  # 20250507 pftq: Fix for <=1 sec videos.
729
  max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
@@ -745,10 +1128,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
745
  history_latents = video_latents
746
  total_generated_latent_frames = history_latents.shape[2]
747
  # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
748
- history_pixels = None
749
- previous_video = None
750
 
751
- for section_index in range(total_latent_sections):
752
  if stream.input_queue.top() == 'end':
753
  stream.output_queue.push(('end', None))
754
  return
@@ -767,7 +1158,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
767
  else:
768
  transformer.initialize_teacache(enable_teacache=False)
769
 
770
- [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
771
 
772
  generated_latents = sample_hunyuan(
773
  transformer=transformer,
@@ -798,6 +1189,13 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
798
  clean_latent_4x_indices=clean_latent_4x_indices,
799
  callback=callback,
800
  )
801
 
802
  total_generated_latent_frames += int(generated_latents.shape[2])
803
  history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
@@ -855,18 +1253,17 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
855
  stream.output_queue.push(('end', None))
856
  return
857
 
858
- def get_duration(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
859
  return allocation_time
860
 
861
- # Remove this decorator if you run on local
862
  @spaces.GPU(duration=get_duration)
863
- def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
864
  ):
865
  start = time.time()
866
  global stream
867
  stream = AsyncStream()
868
 
869
- async_run(worker, input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
870
 
871
  output_filename = None
872
 
@@ -892,11 +1289,13 @@ def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prom
892
  ((str(hours) + " h, ") if hours != 0 else "") + \
893
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
894
  str(secondes) + " sec. " + \
895
- "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
896
  break
897
 
898
  def process(input_image,
 
899
  image_position=0,
 
900
  prompt="",
901
  generation_mode="image",
902
  n_prompt="",
@@ -907,18 +1306,18 @@ def process(input_image,
907
  resolution=640,
908
  total_second_length=5,
909
  latent_window_size=9,
910
- steps=25,
911
  cfg=1.0,
912
  gs=10.0,
913
  rs=0.0,
914
  gpu_memory_preservation=6,
915
- enable_preview=True,
916
  use_teacache=False,
917
  mp4_crf=16,
918
  fps_number=30
919
  ):
920
  if auto_allocation:
921
- allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
922
 
923
  if torch.cuda.device_count() == 0:
924
  gr.Warning('Set this space to GPU config to make it work.')
@@ -930,16 +1329,20 @@ def process(input_image,
930
 
931
  prompts = prompt.split(";")
932
 
933
- # assert input_image is not None, 'No input image!'
934
  if generation_mode == "text":
935
- default_height, default_width = 640, 640
936
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
937
  print("No input image provided. Using a blank white image.")
938
 
939
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
940
 
 
941
  yield from process_on_gpu(input_image,
 
942
  image_position,
 
943
  prompts,
944
  generation_mode,
945
  n_prompt,
@@ -959,18 +1362,17 @@ def process(input_image,
959
  fps_number
960
  )
961
 
962
- def get_duration_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
963
  return allocation_time
964
 
965
- # Remove this decorator if you run on local
966
  @spaces.GPU(duration=get_duration_video)
967
- def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
968
  start = time.time()
969
  global stream
970
  stream = AsyncStream()
971
 
972
  # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
973
- async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
974
 
975
  output_filename = None
976
 
@@ -997,13 +1399,13 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
997
  ((str(hours) + " h, ") if hours != 0 else "") + \
998
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
999
  str(secondes) + " sec. " + \
1000
- " You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", '', gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1001
  break
1002
 
1003
- def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1004
  global high_vram
1005
  if auto_allocation:
1006
- allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25)), 600)
1007
 
1008
  if torch.cuda.device_count() == 0:
1009
  gr.Warning('Set this space to GPU config to make it work.')
@@ -1033,7 +1435,8 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allo
1033
  if cfg > 1:
1034
  gs = 1
1035
 
1036
- yield from process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
 
1037
 
1038
  def end_process():
1039
  stream.input_queue.push('end')
@@ -1103,11 +1506,12 @@ with block:
1103
  local_storage = gr.BrowserState(default_local_storage)
1104
  with gr.Row():
1105
  with gr.Column():
1106
- generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1107
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1108
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1109
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1110
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
 
1111
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1112
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
1113
 
@@ -1131,9 +1535,10 @@ with block:
1131
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1132
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1133
 
1134
- n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1135
 
1136
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
 
1137
 
1138
  latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1139
  steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=30, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very few motion, you may have brutal brightness change; this can be fixed increasing the steps.')
@@ -1186,19 +1591,20 @@ with block:
1186
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1187
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1188
 
1189
- # 20250506 pftq: Updated inputs to include num_clean_frames
1190
- ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1191
- ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1192
 
1193
  gr.Examples(
1194
  label = "✍️ Examples from text",
1195
  examples = [
1196
  [
1197
  None, # input_image
 
1198
  0, # image_position
 
1199
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1200
  "text", # generation_mode
1201
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1202
  True, # randomize_seed
1203
  42, # seed
1204
  True, # auto_allocation
@@ -1229,10 +1635,12 @@ with block:
1229
  examples = [
1230
  [
1231
  "./img_examples/Example1.png", # input_image
 
1232
  0, # image_position
 
1233
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1234
  "image", # generation_mode
1235
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1236
  True, # randomize_seed
1237
  42, # seed
1238
  True, # auto_allocation
@@ -1246,16 +1654,18 @@ with block:
1246
  0.0, # rs
1247
  6, # gpu_memory_preservation
1248
  False, # enable_preview
1249
- True, # use_teacache
1250
  16, # mp4_crf
1251
  30 # fps_number
1252
  ],
1253
  [
1254
  "./img_examples/Example2.webp", # input_image
 
1255
  0, # image_position
 
1256
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1257
  "image", # generation_mode
1258
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1259
  True, # randomize_seed
1260
  42, # seed
1261
  True, # auto_allocation
@@ -1269,16 +1679,18 @@ with block:
1269
  0.0, # rs
1270
  6, # gpu_memory_preservation
1271
  False, # enable_preview
1272
- True, # use_teacache
1273
  16, # mp4_crf
1274
  30 # fps_number
1275
  ],
1276
  [
1277
  "./img_examples/Example2.webp", # input_image
 
1278
  0, # image_position
 
1279
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1280
  "image", # generation_mode
1281
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1282
  True, # randomize_seed
1283
  42, # seed
1284
  True, # auto_allocation
@@ -1292,16 +1704,18 @@ with block:
1292
  0.0, # rs
1293
  6, # gpu_memory_preservation
1294
  False, # enable_preview
1295
- True, # use_teacache
1296
  16, # mp4_crf
1297
  30 # fps_number
1298
  ],
1299
  [
1300
  "./img_examples/Example3.jpg", # input_image
 
1301
  0, # image_position
1302
- "A boy is walking to the right, full view, full-length view, cartoon",
 
1303
  "image", # generation_mode
1304
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1305
  True, # randomize_seed
1306
  42, # seed
1307
  True, # auto_allocation
@@ -1315,16 +1729,18 @@ with block:
1315
  0.0, # rs
1316
  6, # gpu_memory_preservation
1317
  False, # enable_preview
1318
- True, # use_teacache
1319
  16, # mp4_crf
1320
  30 # fps_number
1321
  ],
1322
  [
1323
  "./img_examples/Example4.webp", # input_image
 
1324
  100, # image_position
 
1325
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1326
  "image", # generation_mode
1327
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1328
  True, # randomize_seed
1329
  42, # seed
1330
  True, # auto_allocation
@@ -1350,13 +1766,51 @@ with block:
1350
  cache_examples = False,
1351
  )
1352
1353
  gr.Examples(
1354
  label = "🎥 Examples from video",
1355
  examples = [
1356
  [
1357
  "./img_examples/Example1.mp4", # input_video
1358
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1359
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1360
  True, # randomize_seed
1361
  42, # seed
1362
  True, # auto_allocation
@@ -1371,7 +1825,33 @@ with block:
1371
  0.0, # rs
1372
  6, # gpu_memory_preservation
1373
  False, # enable_preview
1374
- True, # use_teacache
1375
  False, # no_resize
1376
  16, # mp4_crf
1377
  5, # num_clean_frames
@@ -1401,17 +1881,81 @@ with block:
1401
  def check_parameters(generation_mode, input_image, input_video):
1402
  if generation_mode == "image" and input_image is None:
1403
  raise gr.Error("Please provide an image to extend.")
1404
  if generation_mode == "video" and input_video is None:
1405
  raise gr.Error("Please provide a video to extend.")
1406
  return [gr.update(interactive=True), gr.update(visible = True)]
1407
 
1408
  def handle_generation_mode_change(generation_mode_data):
1409
  if generation_mode_data == "text":
1410
- return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
1411
  elif generation_mode_data == "image":
1412
- return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
1413
  elif generation_mode_data == "video":
1414
- return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)]
1415
 
1416
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1417
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
@@ -1433,7 +1977,7 @@ with block:
1433
  generation_mode.change(
1434
  fn=handle_generation_mode_change,
1435
  inputs=[generation_mode],
1436
- outputs=[text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1437
  )
1438
 
1439
  # Update display when the page loads
@@ -1441,7 +1985,7 @@ with block:
1441
  fn=handle_generation_mode_change, inputs = [
1442
  generation_mode
1443
  ], outputs = [
1444
- text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1445
  ]
1446
  )
1447
 
 
7
  try:
8
  import spaces
9
  except:
10
+ class spaces():
11
+ def GPU(*args, **kwargs):
12
+ def decorator(function):
13
+ return lambda *dummy_args, **dummy_kwargs: function(*dummy_args, **dummy_kwargs)
14
+ return decorator
15
+
16
  import gradio as gr
17
  import torch
18
  import traceback
 
22
  import random
23
  import time
24
  import math
25
+ import gc
26
  # 20250506 pftq: Added for video input loading
27
  import decord
28
  # 20250506 pftq: Added for progress bars in video_encode
 
204
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
205
  #print(f"Tensor shape: {frames_pt.shape}")
206
 
 
 
 
207
  # 20250506 pftq: Move to device
208
  #print(f"Moving tensor to device: {device}")
209
  frames_pt = frames_pt.to(device)
 
255
  torch.cuda.empty_cache()
256
  #print("VAE moved back to CPU, CUDA cache cleared")
257
 
258
+ return start_latent, input_image_np, history_latents, fps, target_height, target_width
259
 
260
  except Exception as e:
261
  print(f"Error in video_encode: {str(e)}")
 
308
  print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
309
  return False
310
 
311
+ # 20250507 pftq: New function to encode a single image (end frame)
312
+ @torch.no_grad()
313
+ def image_encode(image_np, target_width, target_height, vae, image_encoder, feature_extractor, device="cuda"):
314
+ """
315
+ Encode a single image into a latent and compute its CLIP vision embedding.
316
+
317
+ Args:
318
+ image_np: Input image as numpy array.
319
+ target_width, target_height: Exact resolution to resize the image to (matches start frame).
320
+ vae: AutoencoderKLHunyuanVideo model.
321
+ image_encoder: SiglipVisionModel for CLIP vision encoding.
322
+ feature_extractor: SiglipImageProcessor for preprocessing.
323
+ device: Device for computation (e.g., "cuda").
324
+
325
+ Returns:
326
+ latent: Latent representation of the image (shape: [1, channels, 1, height//8, width//8]).
327
+ clip_embedding: CLIP vision embedding of the image.
328
+ processed_image_np: Processed image as numpy array (after resizing).
329
+ """
330
+ # 20250507 pftq: Process end frame with exact start frame dimensions
331
+ print("Processing end frame...")
332
+ try:
333
+ print(f"Using exact start frame resolution for end frame: {target_width}x{target_height}")
334
+
335
+ # Resize and preprocess image to match start frame
336
+ processed_image_np = resize_and_center_crop(image_np, target_width=target_width, target_height=target_height)
337
+
338
+ # Convert to tensor and normalize
339
+ image_pt = torch.from_numpy(processed_image_np).float() / 127.5 - 1
340
+ image_pt = image_pt.permute(2, 0, 1).unsqueeze(0).unsqueeze(2) # Shape: [1, channels, 1, height, width]
341
+ image_pt = image_pt.to(device)
342
+
343
+ # Move VAE to device
344
+ vae.to(device)
345
+
346
+ # Encode to latent
347
+ latent = vae_encode(image_pt, vae)
348
+ print(f"image_encode vae output shape: {latent.shape}")
349
+
350
+ # Move image encoder to device
351
+ image_encoder.to(device)
352
+
353
+ # Compute CLIP vision embedding
354
+ clip_embedding = hf_clip_vision_encode(processed_image_np, feature_extractor, image_encoder).last_hidden_state
355
+
356
+ # Move models back to CPU and clear cache
357
+ if device == "cuda":
358
+ vae.to(cpu)
359
+ image_encoder.to(cpu)
360
+ torch.cuda.empty_cache()
361
+ print("VAE and image encoder moved back to CPU, CUDA cache cleared")
362
+
363
+ print(f"End latent shape: {latent.shape}")
364
+ return latent, clip_embedding, processed_image_np
365
+
366
+ except Exception as e:
367
+ print(f"Error in image_encode: {str(e)}")
368
+ raise
369
+
370
  @torch.no_grad()
371
+ def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
372
  def encode_prompt(prompt, n_prompt):
373
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
374
 
 
463
  return [start_latent, image_encoder_last_hidden_state]
464
 
465
  [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
466
+ del input_image
467
+ del end_image
468
 
469
  # Dtype
470
 
 
476
 
477
  rnd = torch.Generator("cpu").manual_seed(seed)
478
 
479
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
480
  start_latent = start_latent.to(history_latents)
481
  history_pixels = None
482
 
 
560
  [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
561
 
562
  if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
563
+ del prompt_parameters[prompt_index]
564
 
565
  if not high_vram:
566
  unload_complete_models()
 
608
  clean_latent_4x_indices=clean_latent_4x_indices,
609
  callback=callback,
610
  )
611
+ del clean_latents
612
+ del clean_latents_2x
613
+ del clean_latents_4x
614
+ del latent_indices
615
+ del clean_latent_indices
616
+ del clean_latent_2x_indices
617
+ del clean_latent_4x_indices
618
 
619
  [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
620
 
 
628
  real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
629
  zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
630
  history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
631
+ del real_history_latents
632
+ del zero_latents
633
 
634
  forward = True
635
  section_index = first_section_index
 
647
  stream.output_queue.push(('end', None))
648
  return
649
 
650
+ @torch.no_grad()
651
+ def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
652
+ def encode_prompt(prompt, n_prompt):
653
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
654
+
655
+ if cfg == 1:
656
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
657
+ else:
658
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
659
+
660
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
661
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
662
+
663
+ llama_vec = llama_vec.to(transformer.dtype)
664
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
665
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
666
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
667
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
668
+
669
+ total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
670
+ total_latent_sections = int(max(round(total_latent_sections), 1))
671
+
672
+ job_id = generate_timestamp()
673
+
674
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
675
+
676
+ try:
677
+ # Clean GPU
678
+ if not high_vram:
679
+ unload_complete_models(
680
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
681
+ )
682
+
683
+ # Text encoding
684
+
685
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
686
+
687
+ if not high_vram:
688
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
689
+ load_model_as_complete(text_encoder_2, target_device=gpu)
690
+
691
+
692
+ prompt_parameters = []
693
+
694
+ for prompt_part in prompts[:total_latent_sections]:
695
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
696
+
697
+ # Clean GPU
698
+ if not high_vram:
699
+ unload_complete_models(
700
+ text_encoder, text_encoder_2
701
+ )
702
+
703
+ # Processing input image (start frame)
704
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing start frame ...'))))
705
+
706
+ H, W, C = input_image.shape
707
+ height, width = find_nearest_bucket(H, W, resolution=resolution)
708
+ has_end_image = end_image is not None
709
+
710
+ def get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram):
711
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
712
+
713
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
714
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
715
+
716
+ # Processing end image (if provided)
717
+ if has_end_image:
718
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing end frame ...'))))
719
+
720
+ end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height)
721
+
722
+ end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1
723
+ end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None]
724
+
725
+ # VAE encoding
726
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
727
+
728
+ if not high_vram:
729
+ load_model_as_complete(vae, target_device=gpu)
730
+
731
+ start_latent = vae_encode(input_image_pt, vae)
732
+
733
+ if has_end_image:
734
+ end_latent = vae_encode(end_image_pt, vae)
735
+
736
+ # CLIP Vision
737
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
738
+
739
+ if not high_vram:
740
+ load_model_as_complete(image_encoder, target_device=gpu)
741
+
742
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
743
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
744
+
745
+ if has_end_image:
746
+ end_image_encoder_output = hf_clip_vision_encode(end_image_np, feature_extractor, image_encoder)
747
+ end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state
748
+ # Combine both image embeddings or use a weighted approach
749
+ image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2
750
+
751
+ # Clean GPU
752
+ if not high_vram:
753
+ unload_complete_models(
754
+ image_encoder
755
+ )
756
+
757
+ return [start_latent, end_latent, image_encoder_last_hidden_state]
758
+
759
+ [start_latent, end_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram)
760
+ del input_image
761
+ del end_image
762
+
763
+ # Dtype
764
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
765
+
766
+ # Sampling
767
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
768
+
769
+ rnd = torch.Generator("cpu").manual_seed(seed)
770
+ num_frames = latent_window_size * 4 - 3
771
+
772
+ history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
773
+ start_latent = start_latent.to(history_latents)
774
+ if has_end_image:
775
+ end_latent = end_latent.to(history_latents)
776
+
777
+ history_pixels = None
778
+ total_generated_latent_frames = 0
779
+
780
+ if total_latent_sections > 4:
781
+ # In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
782
+ # items looks better than expanding it when total_latent_sections > 4
783
+ # One can try to remove below trick and just
784
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
785
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
786
+ else:
787
+ # Convert an iterator to a list
788
+ latent_paddings = list(range(total_latent_sections - 1, -1, -1))
789
+
790
+ if enable_preview:
791
+ def callback(d):
792
+ preview = d['denoised']
793
+ preview = vae_decode_fake(preview)
794
+
795
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
796
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
797
+
798
+ if stream.input_queue.top() == 'end':
799
+ stream.output_queue.push(('end', None))
800
+ raise KeyboardInterrupt('User ends the task.')
801
+
802
+ current_step = d['i'] + 1
803
+ percentage = int(100.0 * current_step / steps)
804
+ hint = f'Sampling {current_step}/{steps}'
805
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
806
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
807
+ return
808
+ else:
809
+ def callback(d):
810
+ return
811
+
812
+ def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
813
+ if is_last_section:
814
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
815
+
816
+ total_generated_latent_frames += int(generated_latents.shape[2])
817
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
818
+
819
+ if not high_vram:
820
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
821
+ load_model_as_complete(vae, target_device=gpu)
822
+
823
+ if history_pixels is None:
824
+ history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
825
+ else:
826
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
827
+ overlapped_frames = latent_window_size * 4 - 3
828
+
829
+ current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()
830
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
831
+
832
+ if not high_vram:
833
+ unload_complete_models(vae)
834
+
835
+ if enable_preview or is_last_section:
836
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
837
+
838
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
839
+
840
+ print(f'Decoded. Pixel shape {history_pixels.shape}')
841
+
842
+ stream.output_queue.push(('file', output_filename))
843
+
844
+ return [total_generated_latent_frames, history_latents, history_pixels]
845
+
846
+ for latent_padding in latent_paddings:
847
+ is_last_section = latent_padding == 0
848
+ is_first_section = latent_padding == latent_paddings[0]
849
+ latent_padding_size = latent_padding * latent_window_size
850
+
851
+ if stream.input_queue.top() == 'end':
852
+ stream.output_queue.push(('end', None))
853
+ return
854
+
855
+ print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
856
+
857
+ if len(prompt_parameters) > 0:
858
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
859
+
860
+ indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
861
+ clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
862
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
863
+
864
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
865
+
866
+ # Use end image latent for the first section if provided
867
+ if has_end_image and is_first_section:
868
+ clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
869
+
870
+ clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
871
+
872
+ if not high_vram:
873
+ unload_complete_models()
874
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
875
+
876
+ if use_teacache:
877
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
878
+ else:
879
+ transformer.initialize_teacache(enable_teacache=False)
880
+
881
+ generated_latents = sample_hunyuan(
882
+ transformer=transformer,
883
+ sampler='unipc',
884
+ width=width,
885
+ height=height,
886
+ frames=num_frames,
887
+ real_guidance_scale=cfg,
888
+ distilled_guidance_scale=gs,
889
+ guidance_rescale=rs,
890
+ # shift=3.0,
891
+ num_inference_steps=steps,
892
+ generator=rnd,
893
+ prompt_embeds=llama_vec,
894
+ prompt_embeds_mask=llama_attention_mask,
895
+ prompt_poolers=clip_l_pooler,
896
+ negative_prompt_embeds=llama_vec_n,
897
+ negative_prompt_embeds_mask=llama_attention_mask_n,
898
+ negative_prompt_poolers=clip_l_pooler_n,
899
+ device=gpu,
900
+ dtype=torch.bfloat16,
901
+ image_embeddings=image_encoder_last_hidden_state,
902
+ latent_indices=latent_indices,
903
+ clean_latents=clean_latents,
904
+ clean_latent_indices=clean_latent_indices,
905
+ clean_latents_2x=clean_latents_2x,
906
+ clean_latent_2x_indices=clean_latent_2x_indices,
907
+ clean_latents_4x=clean_latents_4x,
908
+ clean_latent_4x_indices=clean_latent_4x_indices,
909
+ callback=callback,
910
+ )
911
+ del clean_latents
912
+ del clean_latents_2x
913
+ del clean_latents_4x
914
+ del latent_indices
915
+ del clean_latent_indices
916
+ del clean_latent_2x_indices
917
+ del clean_latent_4x_indices
918
+
919
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
920
+
921
+ if is_last_section:
922
+ break
923
+ except:
924
+ traceback.print_exc()
925
+
926
+ if not high_vram:
927
+ unload_complete_models(
928
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
929
+ )
930
+
931
+ stream.output_queue.push(('end', None))
932
+ return
933
+
934
  # 20250506 pftq: Modified worker to accept video input and clean frame count
935
  @torch.no_grad()
936
+ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
937
  def encode_prompt(prompt, n_prompt):
938
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
939
 
 
958
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
959
 
960
  # 20250506 pftq: Encode video
961
+ start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
962
+ del input_video
963
+ start_latent = start_latent.to(dtype=torch.float32, device=cpu)
964
  video_latents = video_latents.cpu()
965
 
966
  total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
 
997
  load_model_as_complete(image_encoder, target_device=gpu)
998
 
999
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
1000
+ del input_image_np
1001
+
1002
+ # 20250507 pftq: Process end frame if provided
1003
+ if end_frame is not None:
1004
+ if not high_vram:
1005
+ load_model_as_complete(vae, target_device=gpu)
1006
+
1007
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'End frame encoding ...'))))
1008
+ end_latent = image_encode(
1009
+ end_frame, target_width=width, target_height=height, vae=vae,
1010
+ image_encoder=image_encoder, feature_extractor=feature_extractor, device=gpu
1011
+ )[0]
1012
+ del end_frame
1013
+ end_latent = end_latent.to(dtype=torch.float32, device=cpu)
1014
+ else:
1015
+ end_latent = None
1016
 
1017
  # Clean GPU
1018
  if not high_vram:
1019
+ unload_complete_models(image_encoder, vae)
1020
 
1021
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
1022
+ del image_encoder_output
1023
 
1024
  # Dtype
1025
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
 
1046
  def callback(d):
1047
  return
1048
 
1049
+ def compute_latent(history_latents, latent_window_size, latent_padding_size, num_clean_frames, start_latent, end_latent, end_stillness, is_end_of_video):
1050
+ if end_latent is not None and is_end_of_video:
1051
+ local_end_stillness = end_stillness
1052
+ local_end_latent = end_latent.expand(-1, -1, 1 + local_end_stillness, -1, -1)
1053
+ else:
1054
+ local_end_stillness = 0
1055
+ local_end_latent = end_latent
1056
  # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
1057
  available_frames = history_latents.shape[2] # Number of latent frames
1058
  max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
 
1066
  total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
1067
  total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
1068
 
1069
+ indices = torch.arange(0, 1 + num_4x_frames + num_2x_frames + effective_clean_frames + adjusted_latent_frames + ((latent_padding_size + 1 + local_end_stillness) if end_latent is not None else 0)).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
1070
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices, blank_indices, clean_latent_indices_post = indices.split(
1071
+ [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames, latent_padding_size if end_latent is not None else 0, (1 + local_end_stillness) if end_latent is not None else 0], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
1072
  )
1073
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices, clean_latent_indices_post], dim=1)
1074
 
1075
  # 20250506 pftq: Split history_latents dynamically based on available frames
1076
  fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
 
1103
  if effective_clean_frames > 0 and split_idx < len(splits):
1104
  clean_latents_1x = splits[split_idx]
1105
 
1106
+ if end_latent is not None:
1107
+ clean_latents = torch.cat([start_latent, clean_latents_1x, local_end_latent], dim=2)
1108
+ else:
1109
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
1110
 
1111
  # 20250507 pftq: Fix for <=1 sec videos.
1112
  max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
 
1128
  history_latents = video_latents
1129
  total_generated_latent_frames = history_latents.shape[2]
1130
  # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
1131
+ history_pixels = previous_video = None
 
1132
 
1133
+ # 20250509 Generate backwards with end frame for better end frame anchoring
1134
+ if total_latent_sections > 4:
1135
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
1136
+ else:
1137
+ latent_paddings = list(reversed(range(total_latent_sections)))
1138
+
1139
+ for section_index, latent_padding in enumerate(latent_paddings):
1140
+ is_start_of_video = latent_padding == 0
1141
+ is_end_of_video = latent_padding == latent_paddings[0]
1142
+ latent_padding_size = latent_padding * latent_window_size
1143
  if stream.input_queue.top() == 'end':
1144
  stream.output_queue.push(('end', None))
1145
  return
 
1158
  else:
1159
  transformer.initialize_teacache(enable_teacache=False)
1160
 
1161
+ [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, latent_padding_size, num_clean_frames, start_latent, end_latent, end_stillness, is_end_of_video)
1162
 
1163
  generated_latents = sample_hunyuan(
1164
  transformer=transformer,
 
1189
  clean_latent_4x_indices=clean_latent_4x_indices,
1190
  callback=callback,
1191
  )
1192
+ del clean_latents
1193
+ del clean_latents_2x
1194
+ del clean_latents_4x
1195
+ del latent_indices
1196
+ del clean_latent_indices
1197
+ del clean_latent_2x_indices
1198
+ del clean_latent_4x_indices
1199
 
1200
  total_generated_latent_frames += int(generated_latents.shape[2])
1201
  history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
 
1253
  stream.output_queue.push(('end', None))
1254
  return
1255
 
1256
+ def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1257
  return allocation_time
1258
 
 
1259
  @spaces.GPU(duration=get_duration)
1260
+ def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1261
  ):
1262
  start = time.time()
1263
  global stream
1264
  stream = AsyncStream()
1265
 
1266
+ async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
1267
 
1268
  output_filename = None
1269
 
 
1289
  ((str(hours) + " h, ") if hours != 0 else "") + \
1290
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1291
  str(secondes) + " sec. " + \
1292
+ "You can upscale the result with https://huggingface.co/spaces/Nick088/Real-ESRGAN_Pytorch. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1293
  break
1294
 
1295
  def process(input_image,
1296
+ end_image,
1297
  image_position=0,
1298
+ end_stillness=1,
1299
  prompt="",
1300
  generation_mode="image",
1301
  n_prompt="",
 
1306
  resolution=640,
1307
  total_second_length=5,
1308
  latent_window_size=9,
1309
+ steps=30,
1310
  cfg=1.0,
1311
  gs=10.0,
1312
  rs=0.0,
1313
  gpu_memory_preservation=6,
1314
+ enable_preview=False,
1315
  use_teacache=False,
1316
  mp4_crf=16,
1317
  fps_number=30
1318
  ):
1319
  if auto_allocation:
1320
+ allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25))**2, 600)
1321
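A worked evaluation of the auto-allocation formula above, with my own sample values (not code from the PR):

```python
# total_second_length=5, steps=30, use_teacache=False:
#   5 * 60 * 3.0 * (1 + (30 - 25) / 25) ** 2 = 900 * 1.44 = 1296 -> capped at 600 s
# total_second_length=1, steps=30, use_teacache=True:
#   1 * 60 * 1.5 * 1.44 = 129.6 s
print(min(5 * 60 * 3.0 * (1 + (30 - 25) / 25) ** 2, 600))  # 600
print(min(1 * 60 * 1.5 * (1 + (30 - 25) / 25) ** 2, 600))  # ~129.6
```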
 
1322
  if torch.cuda.device_count() == 0:
1323
  gr.Warning('Set this space to GPU config to make it work.')
 
1329
 
1330
  prompts = prompt.split(";")
1331
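Small illustration of the timed-prompt convention used by the examples below (my own sample string): prompts separated by ";" are split here and applied successively over the course of the generation.

```python
# Hypothetical timed prompt; ";" separates the successive prompts.
prompt = "The man talks and the woman listens; The woman talks and the man listens"
prompts = prompt.split(";")
print(prompts)
# ['The man talks and the woman listens', ' The woman talks and the man listens']
```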
 
 
1332
  if generation_mode == "text":
1333
+ default_height, default_width = resolution, resolution
1334
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
1335
  print("No input image provided. Using a blank white image.")
1336
+ assert input_image is not None, 'No input image!'
1337
+ assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
1338
 
1339
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1340
 
1341
+ gc.collect()
1342
  yield from process_on_gpu(input_image,
1343
+ end_image,
1344
  image_position,
1345
+ end_stillness,
1346
  prompts,
1347
  generation_mode,
1348
  n_prompt,
 
1362
  fps_number
1363
  )
1364
 
1365
+ def get_duration_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1366
  return allocation_time
1367
 
 
1368
  @spaces.GPU(duration=get_duration_video)
1369
+ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1370
  start = time.time()
1371
  global stream
1372
  stream = AsyncStream()
1373
 
1374
  # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
1375
+ async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1376
 
1377
  output_filename = None
1378
 
 
1399
  ((str(hours) + " h, ") if hours != 0 else "") + \
1400
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1401
  str(secondes) + " sec. " + \
1402
+ " You can upscale the result with https://huggingface.co/spaces/Nick088/Real-ESRGAN_Pytorch. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", '', gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1403
  break
1404
 
1405
+ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1406
  global high_vram
1407
  if auto_allocation:
1408
+ allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25))**2, 600)
1409
 
1410
  if torch.cuda.device_count() == 0:
1411
  gr.Warning('Set this space to GPU config to make it work.')
 
1435
  if cfg > 1:
1436
  gs = 1
1437
 
1438
+ gc.collect()
1439
+ yield from process_video_on_gpu(input_video, end_frame, end_stillness, prompt, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1440
 
1441
  def end_process():
1442
  stream.input_queue.push('end')
 
1506
  local_storage = gr.BrowserState(default_local_storage)
1507
  with gr.Row():
1508
  with gr.Column():
1509
+ generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start & end frames", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Input mode", value = "image")
1510
  text_to_video_hint = gr.HTML("Text-to-Video works poorly, with a flash effect at the start. I discourage using the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video; you will save time.")
1511
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1512
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1513
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1514
+ end_image = gr.Image(sources='upload', type="numpy", label="End Frame (optional)", height=320)
1515
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1516
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
1517
 
 
1535
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview for each generated second, at the cost of about 2 extra seconds per generated second.')
1536
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1537
 
1538
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1539
 
1540
  fps_number = gr.Slider(label="Frames per second", info="The model is trained for 30 fps, so other values may generate weird results", minimum=10, maximum=60, value=30, step=1)
1541
+ end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic ending; >0=Matches the end image exactly (but time appears to freeze)')
1542
 
1543
  latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1544
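For reference, the latent window size maps to pixel frames as `latent_window_size * 4 - 3` (the same formula used for `max_frames` and the reported frame counts in the workers), so the default of 9 gives roughly one second of video per section at 30 fps. A quick check (my own arithmetic):

```python
latent_window_size = 9                            # default from the slider above
frames_per_section = latent_window_size * 4 - 3   # same formula as max_frames in the workers
print(frames_per_section)                         # 33
print(frames_per_section / 30)                    # 1.1 seconds per section at 30 fps
```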
  steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=30, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very little motion, you may get an abrupt brightness change; this can be fixed by increasing the steps.')
 
1591
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1592
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1593
 
1594
+ ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1595
+ ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
 
1596
 
1597
  gr.Examples(
1598
  label = "✍️ Examples from text",
1599
  examples = [
1600
  [
1601
  None, # input_image
1602
+ None, # end_image
1603
  0, # image_position
1604
+ 1, # end_stillness
1605
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1606
  "text", # generation_mode
1607
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1608
  True, # randomize_seed
1609
  42, # seed
1610
  True, # auto_allocation
 
1635
  examples = [
1636
  [
1637
  "./img_examples/Example1.png", # input_image
1638
+ None, # end_image
1639
  0, # image_position
1640
+ 1, # end_stillness
1641
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1642
  "image", # generation_mode
1643
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1644
  True, # randomize_seed
1645
  42, # seed
1646
  True, # auto_allocation
 
1654
  0.0, # rs
1655
  6, # gpu_memory_preservation
1656
  False, # enable_preview
1657
+ False, # use_teacache
1658
  16, # mp4_crf
1659
  30 # fps_number
1660
  ],
1661
  [
1662
  "./img_examples/Example2.webp", # input_image
1663
+ None, # end_image
1664
  0, # image_position
1665
+ 1, # end_stillness
1666
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1667
  "image", # generation_mode
1668
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1669
  True, # randomize_seed
1670
  42, # seed
1671
  True, # auto_allocation
 
1679
  0.0, # rs
1680
  6, # gpu_memory_preservation
1681
  False, # enable_preview
1682
+ False, # use_teacache
1683
  16, # mp4_crf
1684
  30 # fps_number
1685
  ],
1686
  [
1687
  "./img_examples/Example2.webp", # input_image
1688
+ None, # end_image
1689
  0, # image_position
1690
+ 1, # end_stillness
1691
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1692
  "image", # generation_mode
1693
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1694
  True, # randomize_seed
1695
  42, # seed
1696
  True, # auto_allocation
 
1704
  0.0, # rs
1705
  6, # gpu_memory_preservation
1706
  False, # enable_preview
1707
+ False, # use_teacache
1708
  16, # mp4_crf
1709
  30 # fps_number
1710
  ],
1711
  [
1712
  "./img_examples/Example3.jpg", # input_image
1713
+ None, # end_image
1714
  0, # image_position
1715
+ 1, # end_stillness
1716
+ "एउटा केटा दायाँतिर हिँडिरहेको छ, पूर्ण दृश्य, पूर्ण-लम्बाइको दृश्य, कार्टुन",
1717
  "image", # generation_mode
1718
+ "हात छुटेको, लामो हात, अवास्तविक स्थिति, असम्भव विकृति, देखिने हड्डी, मांसपेशी संकुचन, कमजोर फ्रेम, धमिलो, धमिलो, अत्यधिक चिल्लो", # n_prompt
1719
  True, # randomize_seed
1720
  42, # seed
1721
  True, # auto_allocation
 
1729
  0.0, # rs
1730
  6, # gpu_memory_preservation
1731
  False, # enable_preview
1732
+ False, # use_teacache
1733
  16, # mp4_crf
1734
  30 # fps_number
1735
  ],
1736
  [
1737
  "./img_examples/Example4.webp", # input_image
1738
+ None, # end_image
1739
  100, # image_position
1740
+ 1, # end_stillness
1741
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1742
  "image", # generation_mode
1743
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1744
  True, # randomize_seed
1745
  42, # seed
1746
  True, # auto_allocation
 
1766
  cache_examples = False,
1767
  )
1768
 
1769
+ gr.Examples(
1770
+ label = "🖼️ Examples from start and end frames",
1771
+ examples = [
1772
+ [
1773
+ "./img_examples/Example5.png", # input_image
1774
+ "./img_examples/Example6.png", # end_image
1775
+ 0, # image_position
1776
+ 0, # end_stillness
1777
+ "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
1778
+ "start_end", # generation_mode
1779
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1780
+ True, # randomize_seed
1781
+ 42, # seed
1782
+ True, # auto_allocation
1783
+ 180, # allocation_time
1784
+ 672, # resolution
1785
+ 1, # total_second_length
1786
+ 9, # latent_window_size
1787
+ 30, # steps
1788
+ 1.0, # cfg
1789
+ 10.0, # gs
1790
+ 0.0, # rs
1791
+ 6, # gpu_memory_preservation
1792
+ False, # enable_preview
1793
+ False, # use_teacache
1794
+ 16, # mp4_crf
1795
+ 30 # fps_number
1796
+ ],
1797
+ ],
1798
+ run_on_click = True,
1799
+ fn = process,
1800
+ inputs = ips,
1801
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1802
+ cache_examples = False,
1803
+ )
1804
+
1805
  gr.Examples(
1806
  label = "🎥 Examples from video",
1807
  examples = [
1808
  [
1809
  "./img_examples/Example1.mp4", # input_video
1810
+ None, # end_image
1811
+ 1, # end_stillness
1812
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1813
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1814
  True, # randomize_seed
1815
  42, # seed
1816
  True, # auto_allocation
 
1825
  0.0, # rs
1826
  6, # gpu_memory_preservation
1827
  False, # enable_preview
1828
+ False, # use_teacache
1829
+ False, # no_resize
1830
+ 16, # mp4_crf
1831
+ 5, # num_clean_frames
1832
+ default_vae
1833
+ ],
1834
+ [
1835
+ "./img_examples/Example1.mp4", # input_video
1836
+ "./img_examples/Example1.png", # end_image
1837
+ 1, # end_stillness
1838
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1839
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1840
+ True, # randomize_seed
1841
+ 42, # seed
1842
+ True, # auto_allocation
1843
+ 180, # allocation_time
1844
+ 1, # batch
1845
+ 672, # resolution
1846
+ 1, # total_second_length
1847
+ 9, # latent_window_size
1848
+ 30, # steps
1849
+ 1.0, # cfg
1850
+ 10.0, # gs
1851
+ 0.0, # rs
1852
+ 6, # gpu_memory_preservation
1853
+ False, # enable_preview
1854
+ False, # use_teacache
1855
  False, # no_resize
1856
  16, # mp4_crf
1857
  5, # num_clean_frames
 
1881
  def check_parameters(generation_mode, input_image, input_video):
1882
  if generation_mode == "image" and input_image is None:
1883
  raise gr.Error("Please provide an image to extend.")
1884
+ if generation_mode == "start_end" and input_image is None:
1885
+ raise gr.Error("Please provide a start image.")
1886
  if generation_mode == "video" and input_video is None:
1887
  raise gr.Error("Please provide a video to extend.")
1888
  return [gr.update(interactive=True), gr.update(visible = True)]
1889
 
1890
  def handle_generation_mode_change(generation_mode_data):
1891
  if generation_mode_data == "text":
1892
+ return [
1893
+ gr.update(visible = True), # text_to_video_hint
1894
+ gr.update(visible = False), # image_position
1895
+ gr.update(visible = False), # input_image
1896
+ gr.update(visible = False), # end_image
1897
+ gr.update(visible = False), # end_stillness
1898
+ gr.update(visible = False), # input_video
1899
+ gr.update(visible = True), # start_button
1900
+ gr.update(visible = False), # start_button_video
1901
+ gr.update(visible = False), # no_resize
1902
+ gr.update(visible = False), # batch
1903
+ gr.update(visible = False), # num_clean_frames
1904
+ gr.update(visible = False), # vae_batch
1905
+ gr.update(visible = False), # prompt_hint
1906
+ gr.update(visible = True) # fps_number
1907
+ ]
1908
  elif generation_mode_data == "image":
1909
+ return [
1910
+ gr.update(visible = False), # text_to_video_hint
1911
+ gr.update(visible = True), # image_position
1912
+ gr.update(visible = True), # input_image
1913
+ gr.update(visible = False), # end_image
1914
+ gr.update(visible = False), # end_stillness
1915
+ gr.update(visible = False), # input_video
1916
+ gr.update(visible = True), # start_button
1917
+ gr.update(visible = False), # start_button_video
1918
+ gr.update(visible = False), # no_resize
1919
+ gr.update(visible = False), # batch
1920
+ gr.update(visible = False), # num_clean_frames
1921
+ gr.update(visible = False), # vae_batch
1922
+ gr.update(visible = False), # prompt_hint
1923
+ gr.update(visible = True) # fps_number
1924
+ ]
1925
+ elif generation_mode_data == "start_end":
1926
+ return [
1927
+ gr.update(visible = False), # text_to_video_hint
1928
+ gr.update(visible = False), # image_position
1929
+ gr.update(visible = True), # input_image
1930
+ gr.update(visible = True), # end_image
1931
+ gr.update(visible = True), # end_stillness
1932
+ gr.update(visible = False), # input_video
1933
+ gr.update(visible = True), # start_button
1934
+ gr.update(visible = False), # start_button_video
1935
+ gr.update(visible = False), # no_resize
1936
+ gr.update(visible = False), # batch
1937
+ gr.update(visible = False), # num_clean_frames
1938
+ gr.update(visible = False), # vae_batch
1939
+ gr.update(visible = False), # prompt_hint
1940
+ gr.update(visible = True) # fps_number
1941
+ ]
1942
  elif generation_mode_data == "video":
1943
+ return [
1944
+ gr.update(visible = False), # text_to_video_hint
1945
+ gr.update(visible = False), # image_position
1946
+ gr.update(visible = False), # input_image
1947
+ gr.update(visible = True), # end_image
1948
+ gr.update(visible = True), # end_stillness
1949
+ gr.update(visible = True), # input_video
1950
+ gr.update(visible = False), # start_button
1951
+ gr.update(visible = True), # start_button_video
1952
+ gr.update(visible = True), # no_resize
1953
+ gr.update(visible = True), # batch
1954
+ gr.update(visible = True), # num_clean_frames
1955
+ gr.update(visible = True), # vae_batch
1956
+ gr.update(visible = True), # prompt_hint
1957
+ gr.update(visible = False) # fps_number
1958
+ ]
1959
 
1960
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1961
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
 
1977
  generation_mode.change(
1978
  fn=handle_generation_mode_change,
1979
  inputs=[generation_mode],
1980
+ outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1981
  )
1982
 
1983
  # Update display when the page loads
 
1985
  fn=handle_generation_mode_change, inputs = [
1986
  generation_mode
1987
  ], outputs = [
1988
+ text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1989
  ]
1990
  )
1991
 
app_endframe.py DELETED
@@ -1,822 +0,0 @@
1
- from diffusers_helper.hf_login import login
2
-
3
- import os
4
-
5
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
-
7
- import gradio as gr
8
- import torch
9
- import traceback
10
- import einops
11
- import safetensors.torch as sf
12
- import numpy as np
13
- import argparse
14
- import random
15
- import math
16
- # 20250506 pftq: Added for video input loading
17
- import decord
18
- # 20250506 pftq: Added for progress bars in video_encode
19
- from tqdm import tqdm
20
- # 20250506 pftq: Normalize file paths for Windows compatibility
21
- import pathlib
22
- # 20250506 pftq: for easier to read timestamp
23
- from datetime import datetime
24
- # 20250508 pftq: for saving prompt to mp4 comments metadata
25
- import imageio_ffmpeg
26
- import tempfile
27
- import shutil
28
- import subprocess
29
- import spaces
30
- from PIL import Image
31
- from diffusers import AutoencoderKLHunyuanVideo
32
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
33
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
34
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
35
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
36
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
37
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
38
- from diffusers_helper.thread_utils import AsyncStream, async_run
39
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
40
- from transformers import SiglipImageProcessor, SiglipVisionModel
41
- from diffusers_helper.clip_vision import hf_clip_vision_encode
42
- from diffusers_helper.bucket_tools import find_nearest_bucket
43
-
44
- parser = argparse.ArgumentParser()
45
- parser.add_argument('--share', action='store_true')
46
- parser.add_argument("--server", type=str, default='0.0.0.0')
47
- parser.add_argument("--port", type=int, required=False)
48
- parser.add_argument("--inbrowser", action='store_true')
49
- args = parser.parse_args()
50
-
51
- print(args)
52
-
53
- free_mem_gb = get_cuda_free_memory_gb(gpu)
54
- high_vram = free_mem_gb > 60
55
-
56
- print(f'Free VRAM {free_mem_gb} GB')
57
- print(f'High-VRAM Mode: {high_vram}')
58
-
59
- text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
60
- text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
61
- tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
62
- tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
63
- vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
64
-
65
- feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
66
- image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
67
-
68
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()
69
-
70
- vae.eval()
71
- text_encoder.eval()
72
- text_encoder_2.eval()
73
- image_encoder.eval()
74
- transformer.eval()
75
-
76
- if not high_vram:
77
- vae.enable_slicing()
78
- vae.enable_tiling()
79
-
80
- transformer.high_quality_fp32_output_for_inference = True
81
- print('transformer.high_quality_fp32_output_for_inference = True')
82
-
83
- transformer.to(dtype=torch.bfloat16)
84
- vae.to(dtype=torch.float16)
85
- image_encoder.to(dtype=torch.float16)
86
- text_encoder.to(dtype=torch.float16)
87
- text_encoder_2.to(dtype=torch.float16)
88
-
89
- vae.requires_grad_(False)
90
- text_encoder.requires_grad_(False)
91
- text_encoder_2.requires_grad_(False)
92
- image_encoder.requires_grad_(False)
93
- transformer.requires_grad_(False)
94
-
95
- if not high_vram:
96
- # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
97
- DynamicSwapInstaller.install_model(transformer, device=gpu)
98
- DynamicSwapInstaller.install_model(text_encoder, device=gpu)
99
- else:
100
- text_encoder.to(gpu)
101
- text_encoder_2.to(gpu)
102
- image_encoder.to(gpu)
103
- vae.to(gpu)
104
- transformer.to(gpu)
105
-
106
- stream = AsyncStream()
107
-
108
- outputs_folder = './outputs/'
109
- os.makedirs(outputs_folder, exist_ok=True)
110
-
111
- # 20250506 pftq: Added function to encode input video frames into latents
112
- @torch.no_grad()
113
- def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
114
- """
115
- Encode a video into latent representations using the VAE.
116
-
117
- Args:
118
- video_path: Path to the input video file.
119
- vae: AutoencoderKLHunyuanVideo model.
120
- height, width: Target resolution for resizing frames.
121
- vae_batch_size: Number of frames to process per batch.
122
- device: Device for computation (e.g., "cuda").
123
-
124
- Returns:
125
- start_latent: Latent of the first frame (for compatibility with original code).
126
- input_image_np: First frame as numpy array (for CLIP vision encoding).
127
- history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
128
- fps: Frames per second of the input video.
129
- """
130
- # 20250506 pftq: Normalize video path for Windows compatibility
131
- video_path = str(pathlib.Path(video_path).resolve())
132
- print(f"Processing video: {video_path}")
133
-
134
- # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
135
- if device == "cuda" and not torch.cuda.is_available():
136
- print("CUDA is not available, falling back to CPU")
137
- device = "cpu"
138
-
139
- try:
140
- # 20250506 pftq: Load video and get FPS
141
- print("Initializing VideoReader...")
142
- vr = decord.VideoReader(video_path)
143
- fps = vr.get_avg_fps() # Get input video FPS
144
- num_real_frames = len(vr)
145
- print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")
146
-
147
- # Truncate to nearest latent size (multiple of 4)
148
- latent_size_factor = 4
149
- num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
150
- if num_frames != num_real_frames:
151
- print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
152
- num_real_frames = num_frames
153
-
154
- # 20250506 pftq: Read frames
155
- print("Reading video frames...")
156
- frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
157
- print(f"Frames read: {frames.shape}")
158
-
159
- # 20250506 pftq: Get native video resolution
160
- native_height, native_width = frames.shape[1], frames.shape[2]
161
- print(f"Native video resolution: {native_width}x{native_height}")
162
-
163
- # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
164
- target_height = native_height if height is None else height
165
- target_width = native_width if width is None else width
166
-
167
- # 20250506 pftq: Adjust to nearest bucket for model compatibility
168
- if not no_resize:
169
- target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
170
- print(f"Adjusted resolution: {target_width}x{target_height}")
171
- else:
172
- print(f"Using native resolution without resizing: {target_width}x{target_height}")
173
-
174
- # 20250506 pftq: Preprocess frames to match original image processing
175
- processed_frames = []
176
- for i, frame in enumerate(frames):
177
- #print(f"Preprocessing frame {i+1}/{num_frames}")
178
- frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
179
- processed_frames.append(frame_np)
180
- processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
181
- print(f"Frames preprocessed: {processed_frames.shape}")
182
-
183
- # 20250506 pftq: Save first frame for CLIP vision encoding
184
- input_image_np = processed_frames[0]
185
- end_of_input_video_image_np = processed_frames[-1]
186
-
187
- # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
188
- print("Converting frames to tensor...")
189
- frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
190
- frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
191
- frames_pt = frames_pt.unsqueeze(0) # Shape: (1, num_real_frames, channels, height, width)
192
- frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
193
- print(f"Tensor shape: {frames_pt.shape}")
194
-
195
- # 20250507 pftq: Save pixel frames for use in worker
196
- input_video_pixels = frames_pt.cpu()
197
-
198
- # 20250506 pftq: Move to device
199
- print(f"Moving tensor to device: {device}")
200
- frames_pt = frames_pt.to(device)
201
- print("Tensor moved to device")
202
-
203
- # 20250506 pftq: Move VAE to device
204
- print(f"Moving VAE to device: {device}")
205
- vae.to(device)
206
- print("VAE moved to device")
207
-
208
- # 20250506 pftq: Encode frames in batches
209
- print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
210
- latents = []
211
- vae.eval()
212
- with torch.no_grad():
213
- for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
214
- #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
215
- batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
216
- try:
217
- # 20250506 pftq: Log GPU memory before encoding
218
- if device == "cuda":
219
- free_mem = torch.cuda.memory_allocated() / 1024**3
220
- #print(f"GPU memory before encoding: {free_mem:.2f} GB")
221
- batch_latent = vae_encode(batch, vae)
222
- # 20250506 pftq: Synchronize CUDA to catch issues
223
- if device == "cuda":
224
- torch.cuda.synchronize()
225
- #print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
226
- latents.append(batch_latent)
227
- #print(f"Batch encoded, latent shape: {batch_latent.shape}")
228
- except RuntimeError as e:
229
- print(f"Error during VAE encoding: {str(e)}")
230
- if device == "cuda" and "out of memory" in str(e).lower():
231
- print("CUDA out of memory, try reducing vae_batch_size or using CPU")
232
- raise
233
-
234
- # 20250506 pftq: Concatenate latents
235
- print("Concatenating latents...")
236
- history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
237
- print(f"History latents shape: {history_latents.shape}")
238
-
239
- # 20250506 pftq: Get first frame's latent
240
- start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
241
- end_of_input_video_latent = history_latents[:, :, -1:] # Shape: (1, channels, 1, height//8, width//8)
242
- print(f"Start latent shape: {start_latent.shape}")
243
-
244
- # 20250506 pftq: Move VAE back to CPU to free GPU memory
245
- if device == "cuda":
246
- vae.to(cpu)
247
- torch.cuda.empty_cache()
248
- print("VAE moved back to CPU, CUDA cache cleared")
249
-
250
- return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels, end_of_input_video_latent, end_of_input_video_image_np
251
-
252
- except Exception as e:
253
- print(f"Error in video_encode: {str(e)}")
254
- raise
255
-
256
-
257
- # 20250507 pftq: New function to encode a single image (end frame)
258
- @torch.no_grad()
259
- def image_encode(image_np, target_width, target_height, vae, image_encoder, feature_extractor, device="cuda"):
260
- """
261
- Encode a single image into a latent and compute its CLIP vision embedding.
262
-
263
- Args:
264
- image_np: Input image as numpy array.
265
- target_width, target_height: Exact resolution to resize the image to (matches start frame).
266
- vae: AutoencoderKLHunyuanVideo model.
267
- image_encoder: SiglipVisionModel for CLIP vision encoding.
268
- feature_extractor: SiglipImageProcessor for preprocessing.
269
- device: Device for computation (e.g., "cuda").
270
-
271
- Returns:
272
- latent: Latent representation of the image (shape: [1, channels, 1, height//8, width//8]).
273
- clip_embedding: CLIP vision embedding of the image.
274
- processed_image_np: Processed image as numpy array (after resizing).
275
- """
276
- # 20250507 pftq: Process end frame with exact start frame dimensions
277
- print("Processing end frame...")
278
- try:
279
- print(f"Using exact start frame resolution for end frame: {target_width}x{target_height}")
280
-
281
- # Resize and preprocess image to match start frame
282
- processed_image_np = resize_and_center_crop(image_np, target_width=target_width, target_height=target_height)
283
-
284
- # Convert to tensor and normalize
285
- image_pt = torch.from_numpy(processed_image_np).float() / 127.5 - 1
286
- image_pt = image_pt.permute(2, 0, 1).unsqueeze(0).unsqueeze(2) # Shape: [1, channels, 1, height, width]
287
- image_pt = image_pt.to(device)
288
-
289
- # Move VAE to device
290
- vae.to(device)
291
-
292
- # Encode to latent
293
- latent = vae_encode(image_pt, vae)
294
- print(f"image_encode vae output shape: {latent.shape}")
295
-
296
- # Move image encoder to device
297
- image_encoder.to(device)
298
-
299
- # Compute CLIP vision embedding
300
- clip_embedding = hf_clip_vision_encode(processed_image_np, feature_extractor, image_encoder).last_hidden_state
301
-
302
- # Move models back to CPU and clear cache
303
- if device == "cuda":
304
- vae.to(cpu)
305
- image_encoder.to(cpu)
306
- torch.cuda.empty_cache()
307
- print("VAE and image encoder moved back to CPU, CUDA cache cleared")
308
-
309
- print(f"End latent shape: {latent.shape}")
310
- return latent, clip_embedding, processed_image_np
311
-
312
- except Exception as e:
313
- print(f"Error in image_encode: {str(e)}")
314
- raise
315
-
316
- # 20250508 pftq: for saving prompt to mp4 metadata comments
317
- def set_mp4_comments_imageio_ffmpeg(input_file, comments):
318
- try:
319
- # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
320
- ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
321
-
322
- # Check if input file exists
323
- if not os.path.exists(input_file):
324
- print(f"Error: Input file {input_file} does not exist")
325
- return False
326
-
327
- # Create a temporary file path
328
- temp_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False).name
329
-
330
- # FFmpeg command using the bundled binary
331
- command = [
332
- ffmpeg_path, # Use imageio-ffmpeg's FFmpeg
333
- '-i', input_file, # input file
334
- '-metadata', f'comment={comments}', # set comment metadata
335
- '-c:v', 'copy', # copy video stream without re-encoding
336
- '-c:a', 'copy', # copy audio stream without re-encoding
337
- '-y', # overwrite output file if it exists
338
- temp_file # temporary output file
339
- ]
340
-
341
- # Run the FFmpeg command
342
- result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
343
-
344
- if result.returncode == 0:
345
- # Replace the original file with the modified one
346
- shutil.move(temp_file, input_file)
347
- print(f"Successfully added comments to {input_file}")
348
- return True
349
- else:
350
- # Clean up temp file if FFmpeg fails
351
- if os.path.exists(temp_file):
352
- os.remove(temp_file)
353
- print(f"Error: FFmpeg failed with message:\n{result.stderr}")
354
- return False
355
-
356
- except Exception as e:
357
- # Clean up temp file in case of other errors
358
- if 'temp_file' in locals() and os.path.exists(temp_file):
359
- os.remove(temp_file)
360
- print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
361
- return False
362
-
363
- # 20250506 pftq: Modified worker to accept video input, and clean frame count
364
- @torch.no_grad()
365
- def worker(input_video, end_frame, end_frame_weight, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
366
-
367
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
368
-
369
- try:
370
- # Clean GPU
371
- if not high_vram:
372
- unload_complete_models(
373
- text_encoder, text_encoder_2, image_encoder, vae, transformer
374
- )
375
-
376
- # Text encoding
377
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
378
-
379
- if not high_vram:
380
- fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
381
- load_model_as_complete(text_encoder_2, target_device=gpu)
382
-
383
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
384
-
385
- if cfg == 1:
386
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
387
- else:
388
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
389
-
390
- llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
391
- llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
392
-
393
- # 20250506 pftq: Processing input video instead of image
394
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
395
-
396
- # 20250506 pftq: Encode video
397
- start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels, end_of_input_video_latent, end_of_input_video_image_np = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
398
-
399
- #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
400
-
401
- # CLIP Vision
402
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
403
-
404
- if not high_vram:
405
- load_model_as_complete(image_encoder, target_device=gpu)
406
-
407
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
408
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
409
- start_embedding = image_encoder_last_hidden_state
410
-
411
- end_of_input_video_output = hf_clip_vision_encode(end_of_input_video_image_np, feature_extractor, image_encoder)
412
- end_of_input_video_last_hidden_state = end_of_input_video_output.last_hidden_state
413
- end_of_input_video_embedding = end_of_input_video_last_hidden_state
414
-
415
- # 20250507 pftq: Process end frame if provided
416
- end_latent = None
417
- end_clip_embedding = None
418
- if end_frame is not None:
419
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'End frame encoding ...'))))
420
- end_latent, end_clip_embedding, _ = image_encode(
421
- end_frame, target_width=width, target_height=height, vae=vae,
422
- image_encoder=image_encoder, feature_extractor=feature_extractor, device=gpu
423
- )
424
-
425
- # Dtype
426
- llama_vec = llama_vec.to(transformer.dtype)
427
- llama_vec_n = llama_vec_n.to(transformer.dtype)
428
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
429
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
430
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
431
- end_of_input_video_embedding = end_of_input_video_embedding.to(transformer.dtype)
432
-
433
- # 20250509 pftq: Restored original placement of total_latent_sections after video_encode
434
- total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
435
- total_latent_sections = int(max(round(total_latent_sections), 1))
436
-
437
- for idx in range(batch):
438
- if batch > 1:
439
- print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
440
-
441
- job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepack-videoinput-endframe_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
442
-
443
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
444
-
445
- rnd = torch.Generator("cpu").manual_seed(seed)
446
-
447
- history_latents = video_latents.cpu()
448
- history_pixels = None
449
- total_generated_latent_frames = 0
450
- previous_video = None
451
-
452
-
453
- # 20250509 Generate backwards with end frame for better end frame anchoring
454
- if total_latent_sections > 4:
455
- latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
456
- else:
457
- latent_paddings = list(reversed(range(total_latent_sections)))
458
-
459
- for section_index, latent_padding in enumerate(latent_paddings):
460
- is_start_of_video = latent_padding == 0
461
- is_end_of_video = latent_padding == latent_paddings[0]
462
- latent_padding_size = latent_padding * latent_window_size
463
-
464
- if stream.input_queue.top() == 'end':
465
- stream.output_queue.push(('end', None))
466
- return
467
-
468
- if not high_vram:
469
- unload_complete_models()
470
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
471
-
472
- if use_teacache:
473
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
474
- else:
475
- transformer.initialize_teacache(enable_teacache=False)
476
-
477
- def callback(d):
478
- try:
479
- preview = d['denoised']
480
- preview = vae_decode_fake(preview)
481
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
482
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
483
- if stream.input_queue.top() == 'end':
484
- stream.output_queue.push(('end', None))
485
- raise KeyboardInterrupt('User ends the task.')
486
- current_step = d['i'] + 1
487
- percentage = int(100.0 * current_step / steps)
488
- hint = f'Sampling {current_step}/{steps}'
489
- desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. Generating part {total_latent_sections - section_index} of {total_latent_sections} backward...'
490
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
491
- except ConnectionResetError as e:
492
- print(f"Suppressed ConnectionResetError in callback: {e}")
493
- return
494
-
495
- # 20250509 pftq: Dynamic frame allocation like original num_clean_frames, fix split error
496
- available_frames = video_latents.shape[2] if is_start_of_video else history_latents.shape[2]
497
- if is_start_of_video:
498
- effective_clean_frames = 1 # avoid jumpcuts from input video
499
- else:
500
- effective_clean_frames = max(0, num_clean_frames - 1) if num_clean_frames > 1 else 1
501
- clean_latent_pre_frames = effective_clean_frames
502
- num_2x_frames = min(2, max(1, available_frames - clean_latent_pre_frames - 1)) if available_frames > clean_latent_pre_frames + 1 else 1
503
- num_4x_frames = min(16, max(1, available_frames - clean_latent_pre_frames - num_2x_frames)) if available_frames > clean_latent_pre_frames + num_2x_frames else 1
504
- total_context_frames = num_2x_frames + num_4x_frames
505
- total_context_frames = min(total_context_frames, available_frames - clean_latent_pre_frames)
506
-
507
- # 20250511 pftq: Dynamically adjust post_frames based on clean_latents_post
508
- post_frames = 1 if is_end_of_video and end_latent is not None else effective_clean_frames # 20250511 pftq: Single frame for end_latent, otherwise padding causes still image
509
- indices = torch.arange(0, clean_latent_pre_frames + latent_padding_size + latent_window_size + post_frames + num_2x_frames + num_4x_frames).unsqueeze(0)
510
- clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split(
511
- [clean_latent_pre_frames, latent_padding_size, latent_window_size, post_frames, num_2x_frames, num_4x_frames], dim=1
512
- )
513
- clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
514
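Toy illustration of the index bookkeeping in the removed worker above, with small hypothetical sizes (2 clean pre-frames, padding 3, window 4, 1 post-frame, 2 frames of 2x context, 3 frames of 4x context); it only shows how the single `torch.arange` is partitioned, not the real frame counts:

```python
import torch

indices = torch.arange(0, 2 + 3 + 4 + 1 + 2 + 3).unsqueeze(0)
pre, blank, latent, post, two_x, four_x = indices.split([2, 3, 4, 1, 2, 3], dim=1)
print(latent)                          # tensor([[5, 6, 7, 8]])  -> slots being denoised
print(torch.cat([pre, post], dim=1))   # tensor([[0, 1, 9]])     -> clean_latent_indices
```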
-
515
- # 20250509 pftq: Split context frames dynamically for 2x and 4x only
516
- context_frames = history_latents[:, :, -(total_context_frames + clean_latent_pre_frames):-clean_latent_pre_frames, :, :] if total_context_frames > 0 else history_latents[:, :, :1, :, :]
517
- split_sizes = [num_4x_frames, num_2x_frames]
518
- split_sizes = [s for s in split_sizes if s > 0]
519
- if split_sizes and context_frames.shape[2] >= sum(split_sizes):
520
- splits = context_frames.split(split_sizes, dim=2)
521
- split_idx = 0
522
- clean_latents_4x = splits[split_idx] if num_4x_frames > 0 else history_latents[:, :, :1, :, :]
523
- split_idx += 1 if num_4x_frames > 0 else 0
524
- clean_latents_2x = splits[split_idx] if num_2x_frames > 0 and split_idx < len(splits) else history_latents[:, :, :1, :, :]
525
- else:
526
- clean_latents_4x = clean_latents_2x = history_latents[:, :, :1, :, :]
527
-
528
- clean_latents_pre = video_latents[:, :, -min(effective_clean_frames, video_latents.shape[2]):].to(history_latents) # smoother motion but jumpcuts if end frame is too different, must change clean_latent_pre_frames to effective_clean_frames also
529
- clean_latents_post = history_latents[:, :, :min(effective_clean_frames, history_latents.shape[2]), :, :] # smoother motion, must change post_frames to effective_clean_frames also
530
-
531
- if is_end_of_video:
532
- clean_latents_post = torch.zeros_like(end_of_input_video_latent).to(history_latents)
533
-
534
- # 20250509 pftq: handle end frame if available
535
- if end_latent is not None:
536
- #current_end_frame_weight = end_frame_weight * (latent_padding / latent_paddings[0])
537
- #current_end_frame_weight = current_end_frame_weight * 0.5 + 0.5
538
- current_end_frame_weight = end_frame_weight # changing this over time introduces discontinuity
539
- # 20250511 pftq: Removed end frame weight adjustment as it has no effect
540
- image_encoder_last_hidden_state = (1 - current_end_frame_weight) * end_of_input_video_embedding + end_clip_embedding * current_end_frame_weight
541
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
542
-
543
- # 20250511 pftq: Use end_latent only
544
- if is_end_of_video:
545
- clean_latents_post = end_latent.to(history_latents)[:, :, :1, :, :] # Ensure single frame
546
-
547
- # 20250511 pftq: Pad clean_latents_pre to match clean_latent_pre_frames if needed
548
- if clean_latents_pre.shape[2] < clean_latent_pre_frames:
549
- clean_latents_pre = clean_latents_pre.repeat(1, 1, clean_latent_pre_frames // clean_latents_pre.shape[2], 1, 1)
550
- # 20250511 pftq: Pad clean_latents_post to match post_frames if needed
551
- if clean_latents_post.shape[2] < post_frames:
552
- clean_latents_post = clean_latents_post.repeat(1, 1, post_frames // clean_latents_post.shape[2], 1, 1)
553
-
554
- clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
555
-
556
- max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
557
- print(f"Generating video {idx+1} of {batch} with seed {seed}, part {total_latent_sections - section_index} of {total_latent_sections} backward")
558
- generated_latents = sample_hunyuan(
559
- transformer=transformer,
560
- sampler='unipc',
561
- width=width,
562
- height=height,
563
- frames=max_frames,
564
- real_guidance_scale=cfg,
565
- distilled_guidance_scale=gs,
566
- guidance_rescale=rs,
567
- num_inference_steps=steps,
568
- generator=rnd,
569
- prompt_embeds=llama_vec,
570
- prompt_embeds_mask=llama_attention_mask,
571
- prompt_poolers=clip_l_pooler,
572
- negative_prompt_embeds=llama_vec_n,
573
- negative_prompt_embeds_mask=llama_attention_mask_n,
574
- negative_prompt_poolers=clip_l_pooler_n,
575
- device=gpu,
576
- dtype=torch.bfloat16,
577
- image_embeddings=image_encoder_last_hidden_state,
578
- latent_indices=latent_indices,
579
- clean_latents=clean_latents,
580
- clean_latent_indices=clean_latent_indices,
581
- clean_latents_2x=clean_latents_2x,
582
- clean_latent_2x_indices=clean_latent_2x_indices,
583
- clean_latents_4x=clean_latents_4x,
584
- clean_latent_4x_indices=clean_latent_4x_indices,
585
- callback=callback,
586
- )
587
-
588
- if is_start_of_video:
589
- generated_latents = torch.cat([video_latents[:, :, -1:].to(generated_latents), generated_latents], dim=2)
590
-
591
- total_generated_latent_frames += int(generated_latents.shape[2])
592
- history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
593
-
594
- if not high_vram:
595
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
596
- load_model_as_complete(vae, target_device=gpu)
597
-
598
- real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
599
- if history_pixels is None:
600
- history_pixels = vae_decode(real_history_latents, vae).cpu()
601
- else:
602
- section_latent_frames = (latent_window_size * 2 + 1) if is_start_of_video else (latent_window_size * 2)
603
- overlapped_frames = latent_window_size * 4 - 3
604
- current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
605
- history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
606
-
607
- if not high_vram:
608
- unload_complete_models()
609
-
610
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
611
- save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
612
- print(f"Latest video saved: {output_filename}")
613
- set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}")
614
- print(f"Prompt saved to mp4 metadata comments: {output_filename}")
615
-
616
- if previous_video is not None and os.path.exists(previous_video):
617
- try:
618
- os.remove(previous_video)
619
- print(f"Previous partial video deleted: {previous_video}")
620
- except Exception as e:
621
- print(f"Error deleting previous partial video {previous_video}: {e}")
622
- previous_video = output_filename
623
-
624
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
625
- stream.output_queue.push(('file', output_filename))
626
-
627
- if is_start_of_video:
628
- break
629
-
630
- history_pixels = torch.cat([input_video_pixels, history_pixels], dim=2)
631
- #overlapped_frames = latent_window_size * 4 - 3
632
- #history_pixels = soft_append_bcthw(input_video_pixels, history_pixels, overlapped_frames)
633
-
634
- output_filename = os.path.join(outputs_folder, f'{job_id}_final.mp4')
635
- save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
636
- print(f"Final video with input blend saved: {output_filename}")
637
- set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}")
638
- print(f"Prompt saved to mp4 metadata comments: {output_filename}")
639
- stream.output_queue.push(('file', output_filename))
640
-
641
- if previous_video is not None and os.path.exists(previous_video):
642
- try:
643
- os.remove(previous_video)
644
- print(f"Previous partial video deleted: {previous_video}")
645
- except Exception as e:
646
- print(f"Error deleting previous partial video {previous_video}: {e}")
647
- previous_video = output_filename
648
-
649
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
650
-
651
- stream.output_queue.push(('file', output_filename))
652
-
653
- seed = (seed + 1) % np.iinfo(np.int32).max
654
-
655
- except:
656
- traceback.print_exc()
657
-
658
- if not high_vram:
659
- unload_complete_models(
660
- text_encoder, text_encoder_2, image_encoder, vae, transformer
661
- )
662
-
663
- stream.output_queue.push(('end', None))
664
- return
665
-
666
- # 20250506 pftq: Modified process to pass clean frame count, etc
667
- def get_duration(
668
- input_video, end_frame, end_frame_weight, prompt, n_prompt,
669
- randomize_seed,
670
- seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache,
671
- no_resize, mp4_crf, num_clean_frames, vae_batch):
672
- return total_second_length * 60 * 2
673
-
674
- @spaces.GPU(duration=get_duration)
675
- def process(
676
- input_video, end_frame, end_frame_weight, prompt, n_prompt,
677
- randomize_seed,
678
- seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache,
679
- no_resize, mp4_crf, num_clean_frames, vae_batch):
680
- global stream, high_vram
681
-
682
- if torch.cuda.device_count() == 0:
683
- gr.Warning('Set this space to GPU config to make it work.')
684
- return None, None, None, None, None, None
685
-
686
- if randomize_seed:
687
- seed = random.randint(0, np.iinfo(np.int32).max)
688
-
689
- # 20250506 pftq: Updated assertion for video input
690
- assert input_video is not None, 'No input video!'
691
-
692
- yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
693
-
694
- # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
695
- if high_vram and (no_resize or resolution>640):
696
- print("Disabling high vram mode due to no resize and/or potentially higher resolution...")
697
- high_vram = False
698
- vae.enable_slicing()
699
- vae.enable_tiling()
700
- DynamicSwapInstaller.install_model(transformer, device=gpu)
701
- DynamicSwapInstaller.install_model(text_encoder, device=gpu)
702
-
703
- # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
704
- if cfg > 1:
705
- gs = 1
706
-
707
- stream = AsyncStream()
708
-
709
- # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
710
- async_run(worker, input_video, end_frame, end_frame_weight, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
711
-
712
- output_filename = None
713
-
714
- while True:
715
- flag, data = stream.output_queue.next()
716
-
717
- if flag == 'file':
718
- output_filename = data
719
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
720
-
721
- if flag == 'progress':
722
- preview, desc, html = data
723
- #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
724
- yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
725
-
726
- if flag == 'end':
727
- yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
728
- break
729
-
730
- def end_process():
731
- stream.input_queue.push('end')
732
-
733
- quick_prompts = [
734
- 'The girl dances gracefully, with clear movements, full of charm.',
735
- 'A character doing some simple body movements.',
736
- ]
737
- quick_prompts = [[x] for x in quick_prompts]
738
-
739
- css = make_progress_bar_css()
740
- block = gr.Blocks(css=css).queue(
741
- max_size=10 # 20250507 pftq: Limit queue size
742
- )
743
- with block:
744
- if torch.cuda.device_count() == 0:
745
- with gr.Row():
746
- gr.HTML("""
747
- <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
748
-
749
- You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
750
- </big></big></big></p>
751
- """)
752
- # 20250506 pftq: Updated title to reflect video input functionality
753
- gr.Markdown('# Framepack with Video Input (Video Extension) + End Frame')
754
- with gr.Row():
755
- with gr.Column():
756
-
757
- # 20250506 pftq: Changed to Video input from Image
758
- with gr.Row():
759
- input_video = gr.Video(sources='upload', label="Input Video", height=320)
760
- with gr.Column():
761
- # 20250507 pftq: Added end_frame + weight
762
- end_frame = gr.Image(sources='upload', type="numpy", label="End Frame (Optional) - Reduce context frames if very different from input video or if it is jumpcutting/slowing to still image.", height=320)
763
- end_frame_weight = gr.Slider(label="End Frame Weight", minimum=0.0, maximum=1.0, value=1.0, step=0.01, info='Reduce to treat more as a reference image; no effect')
764
-
765
- prompt = gr.Textbox(label="Prompt", value='')
766
-
767
- with gr.Row():
768
- start_button = gr.Button(value="Start Generation", variant="primary")
769
- end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
770
-
771
- with gr.Accordion("Advanced settings", open=False):
772
- with gr.Row():
773
- use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
774
- no_resize = gr.Checkbox(label='Force Original Video Resolution (No Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
775
-
776
- randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
777
- seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
778
-
779
- batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')
780
-
781
- resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0)
782
-
783
- total_second_length = gr.Slider(label="Additional Video Length to Generate (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
784
-
785
- # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
786
- gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames.')
787
- cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use instead of Distilled for more detail/control + Negative Prompt (make sure Distilled=1). Doubles render time.') # Should not change
788
- rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01) # Should not change
789
-
790
- n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
791
-
792
- steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Expensive. Increase for more quality, especially if using high non-distilled CFG.')
793
-
794
- # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
795
- num_clean_frames = gr.Slider(label="Number of Context Frames (Adherence to Video)", minimum=2, maximum=10, value=5, step=1, info="Expensive. Retain more video details. Reduce if memory issues or motion too restricted (jumpcut, ignoring prompt, still).")
796
-
797
- default_vae = 32
798
- if high_vram:
799
- default_vae = 128
800
- elif free_mem_gb>=20:
801
- default_vae = 64
802
-
803
- vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Expensive. Increase for better quality frames during fast motion. Reduce if running out of memory")
804
-
805
- latent_window_size = gr.Slider(label="Latent Window Size", minimum=9, maximum=49, value=9, step=1, info='Expensive. Generate more frames at a time (larger chunks). Less degradation but higher VRAM cost.')
806
-
807
- gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
808
-
809
- mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
810
-
811
- with gr.Column():
812
- preview_image = gr.Image(label="Next Latents", height=200, visible=False)
813
- result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
814
- progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
815
- progress_bar = gr.HTML('', elem_classes='no-generating-animation')
816
-
817
- # 20250506 pftq: Updated inputs to include num_clean_frames
818
- ips = [input_video, end_frame, end_frame_weight, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
819
- start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
820
- end_button.click(fn=end_process)
821
-
822
- block.launch(share=True)
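The old `app.py` shown above reserves ZeroGPU time dynamically: `@spaces.GPU(duration=get_duration)` passes a callable that receives the same arguments as `process` and returns the number of seconds of GPU time to request. Below is a minimal, self-contained sketch of that pattern; everything except `spaces.GPU(duration=...)` and the two-minutes-per-second-of-video heuristic taken from the code above is an illustrative placeholder, not code from this PR.

```python
# Sketch of the ZeroGPU dynamic-duration pattern used in the diff above.
# `run_inference` and its arguments are hypothetical placeholders.
try:
    import spaces
except ImportError:
    # Outside Hugging Face Spaces there is no `spaces` package; fall back to a
    # no-op decorator so the same code still runs locally.
    class spaces:  # type: ignore
        @staticmethod
        def GPU(*_args, **_kwargs):
            def deco(fn):
                return fn
            return deco

def get_duration(total_second_length, *args, **kwargs):
    # Same heuristic as in the code above: roughly two minutes of GPU time
    # per requested second of generated video.
    return total_second_length * 60 * 2

@spaces.GPU(duration=get_duration)
def run_inference(total_second_length, *args, **kwargs):
    # The duration callable receives the same arguments as the decorated
    # function, so the GPU reservation scales with the requested video length.
    ...
```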
img_examples/Example5.png ADDED

Git LFS Details

  • SHA256: b6a7b7521a2ffe77f60a78bb52013c1ef73bfcefbd809f45cfdeef804aee8906
  • Pointer size: 131 Bytes
  • Size of remote file: 431 kB
img_examples/Example6.png ADDED

Git LFS Details

  • SHA256: 59e76d165d9bece1775302a7e4032f31b28545937726d42f41b0c67aae9d4143
  • Pointer size: 131 Bytes
  • Size of remote file: 721 kB
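Both new example images are stored through Git LFS, so a plain clone initially contains only ~131-byte pointer files; the real images (~431 kB and ~721 kB per the details above) arrive with `git lfs pull`. A small local sanity check, sketched here and not part of the PR:

```python
# Sketch: verify the LFS-tracked example images were materialized locally
# rather than left as ~131-byte pointer stubs after cloning without `git lfs pull`.
import os

EXPECTED_MIN_BYTES = 10_000  # the real files are ~431 kB and ~721 kB

for path in ("img_examples/Example5.png", "img_examples/Example6.png"):
    size = os.path.getsize(path)
    status = "OK" if size > EXPECTED_MIN_BYTES else "still an LFS pointer stub?"
    print(f"{path}: {size} bytes - {status}")
```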
requirements.txt CHANGED
@@ -1,24 +1,42 @@
1
- accelerate==1.7.0
2
- diffusers==0.33.1
3
- transformers==4.52.4
4
  sentencepiece==0.2.0
5
  pillow==11.2.1
6
- av==12.1.0
7
- numpy==1.26.2
8
- scipy==1.12.0
9
- requests==2.32.4
10
- torchsde==0.2.6
11
- torch>=2.0.0
12
- torchvision
13
- torchaudio
14
- einops
15
- opencv-contrib-python
16
- safetensors
17
- huggingface_hub
18
- decord
19
- imageio_ffmpeg
20
- sageattention==1.0.6
21
- xformers==0.0.29.post3
22
- bitsandbytes==0.46.0
23
  pillow-heif==0.22.0
24
- spaces[security]
 
 
1
+ pydantic==2.10.6 # To avoid the message "No API found" or "Internal server error"
2
+
3
+ fastapi==0.115.13
4
+ gradio_imageslider==0.0.20
5
+ gradio_client==1.10.3
6
+ numpy==1.26.4
7
+ requests==2.32.4
8
  sentencepiece==0.2.0
9
+ tokenizers==0.19.1
10
+ torchvision==0.22.0
11
+ uvicorn==0.34.3
12
+ wandb==0.20.1
13
+ httpx==0.28.1
14
+ transformers==4.43.0
15
+ accelerate==1.8.0
16
+ scikit-learn==1.7.0
17
+ einops==0.8.1
18
+ einops-exts==0.0.4
19
+ timm==1.0.15
20
+ openai-clip==1.0.1
21
+ fsspec==2025.5.1
22
+ kornia==0.8.1
23
+ matplotlib==3.10.3
24
+ ninja==1.11.1.4
25
+ omegaconf==2.3.0
26
+ opencv-python==4.11.0.86
27
+ pandas==2.3.0
28
  pillow==11.2.1
29
+ pytorch-lightning==2.5.1.post0
30
+ PyYAML==6.0.2
31
+ scipy==1.15.3
32
+ tqdm==4.67.1
33
+ triton==3.3.0
34
+ urllib3==2.4.0
35
+ webdataset==0.2.111
36
+ xformers==0.0.30
37
+ facexlib==0.3.0
38
+ k-diffusion==0.1.1.post1
39
+ diffusers==0.33.1
40
  pillow-heif==0.22.0
41
+
42
+ open-clip-torch==2.24.0
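
The rewritten requirements.txt pins exact versions, including the `pydantic` pin annotated as a workaround for the "No API found" / "Internal server error" message. An optional way to confirm that a running environment matches a few of those pins is sketched below; the package subset is arbitrary and the check is not part of the PR.

```python
# Sketch: compare a few of the pins from requirements.txt against what is installed.
from importlib.metadata import version, PackageNotFoundError

PINS = {
    "pydantic": "2.10.6",
    "transformers": "4.43.0",
    "diffusers": "0.33.1",
    "numpy": "1.26.4",
}

for pkg, pinned in PINS.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = "not installed"
    flag = "OK" if installed == pinned else "MISMATCH"
    print(f"{pkg}: pinned {pinned}, installed {installed} -> {flag}")
```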