Optimize memory
app.py CHANGED
@@ -468,6 +468,8 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
         return [start_latent, image_encoder_last_hidden_state]

     [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
+    input_image = None
+    end_image = None

     # Dtype

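The added input_image = None and end_image = None lines (and the matching input_video = None / end_frame = None further down) all apply the same optimization: once an input has been encoded into latents, dropping the last reference to the original image lets Python free it instead of keeping both the image and its latent alive for the rest of the worker. A minimal sketch of the pattern follows, with illustrative names and sizes and an explicit gc.collect() that is not part of this diff:

import gc
import numpy as np

def encode(frame: np.ndarray) -> np.ndarray:
    # Stand-in for the real VAE/CLIP encoding step in app.py.
    return frame.mean(axis=(0, 1), keepdims=True)

def run_once() -> np.ndarray:
    frame = np.zeros((1080, 1920, 3), dtype=np.float32)  # ~24 MB input image
    latent = encode(frame)
    frame = None   # drop the only remaining reference, as the diff does
    gc.collect()   # optional: reclaim the buffer immediately rather than waiting
    return latent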
@@ -752,6 +754,8 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
         return [start_latent, end_latent, image_encoder_last_hidden_state]

     [start_latent, end_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram)
+    input_image = None
+    end_image = None

     # Dtype
     image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
@@ -766,7 +770,6 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
     start_latent = start_latent.to(history_latents)
     if has_end_image:
         end_latent = end_latent.to(history_latents)
-        end_latent = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)

     history_pixels = None
     total_generated_latent_frames = 0
@@ -859,7 +862,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom

     # Use end image latent for the first section if provided
     if has_end_image and is_first_section:
-        clean_latents_post = end_latent
+        clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)

     clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)

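The previous two hunks move the expand() call from the setup block (removed at old line 769) to the one place where clean_latents_post is built. The sketch below, with illustrative tensor shapes rather than the real ones, shows why this costs nothing extra: torch.Tensor.expand returns a view over the same storage, so the stretched end latent only turns into real memory at the torch.cat that consumes it.

import torch

end_latent = torch.randn(1, 16, 1, 64, 64)    # illustrative latent shape, not app.py's
start_latent = torch.randn(1, 16, 1, 64, 64)
end_stillness = 3

# expand() creates a view, not a copy: the "stretched" latent shares storage.
clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
assert clean_latents_post.shape[2] == 1 + end_stillness
assert clean_latents_post.data_ptr() == end_latent.data_ptr()  # same memory

# Only this concatenation allocates new memory, and only at the point of use.
clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)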
@@ -946,6 +949,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,

     # 20250506 pftq: Encode video
     start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
+    input_video = None
     start_latent = start_latent.to(dtype=torch.float32, device=cpu)
     video_latents = video_latents.cpu()

@@ -994,6 +998,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
             end_frame, target_width=width, target_height=height, vae=vae,
             image_encoder=image_encoder, feature_extractor=feature_extractor, device=gpu
         )[:2]
+        end_frame = None
         end_latent = end_latent.to(dtype=torch.float32, device=cpu)
     else:
         end_latent = end_clip_embedding = None
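Next to the added end_frame = None, the surrounding context keeps the freshly encoded latent in float32 on the CPU and only moves it back to the GPU when it is needed. A minimal sketch of that offload pattern, assuming a CUDA device and illustrative shapes:

import torch

cpu = torch.device("cpu")
gpu = torch.device("cuda") if torch.cuda.is_available() else cpu

end_latent = torch.randn(1, 16, 1, 64, 64, device=gpu)       # produced on the GPU
end_latent = end_latent.to(dtype=torch.float32, device=cpu)   # park it in system RAM
# ... later, just before it is actually used in sampling:
end_latent_gpu = end_latent.to(gpu, non_blocking=True)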
@@ -1775,7 +1780,7 @@ with block:
         "./img_examples/Example5.png", # input_image
         "./img_examples/Example6.png", # end_image
         0, # image_position
-
+        1, # end_stillness
         "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k",
         "start_end", # generation_mode
         "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
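The last hunk adds a 1 for end_stillness to one of the Gradio example rows. Each row passed to gr.Examples has to supply one value per component in its inputs list, in the same order, so introducing an end_stillness control means every example row needs a matching entry. A hedged sketch with made-up components, not the actual ones in app.py:

import gradio as gr

with gr.Blocks() as demo:
    # Illustrative inputs only; app.py's real component list is much longer.
    image_position = gr.Slider(0, 100, value=0, label="image_position")
    end_stillness = gr.Slider(0, 10, value=1, label="end_stillness")
    prompt = gr.Textbox(label="prompt")
    gr.Examples(
        examples=[[0, 1, "A woman jumps out of the train"]],
        inputs=[image_position, end_stillness, prompt],
    )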