Update app.py
app.py CHANGED
@@ -1,5 +1,5 @@
  1   import torch
  2 - from diffusers import AutoencoderKLWan, WanVACEPipeline UniPCMultistepScheduler
  3   from diffusers.utils import export_to_video
  4   from transformers import CLIPVisionModel
  5   import gradio as gr

@@ -41,15 +41,15 @@ SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
 41   SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
 42   MAX_SEED = np.iinfo(np.int32).max
 43   
 44 - FIXED_FPS =
 45   MIN_FRAMES_MODEL = 8
 46   MAX_FRAMES_MODEL = 81
 47   
 48   # Default prompts for different modes
 49   MODE_PROMPTS = {
 50 -     "Ref2V": "",
 51 -     "FLF2V": "",
 52 -     "Random2V": ""
 53   }
 54   
 55   default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"

@@ -96,27 +96,49 @@ def update_prompt_from_mode(mode):
 96       """Update the prompt based on the selected mode"""
 97       return MODE_PROMPTS.get(mode, "")
 98   
 99 - 
100 - 
101 - 
102 - 
103 - 
104 - 
105 - 
106 - 
107   
108 - 
109 - 
110 - 
111 -         return None
112 -     else:
113 -         return images[0]
114   
115 - 
116 - 
117 - 
118   
119 -     return
120   
121   def get_duration(gallery_images, mode, prompt, height, width,
122                    negative_prompt, duration_seconds,

@@ -159,11 +181,10 @@ def generate_video(gallery_images, mode, prompt, height, width,
159       if gallery_images is None or len(gallery_images) == 0:
160           raise gr.Error("Please upload at least one image to the gallery.")
161   
162 - 
163 - 
164 - 
165 - 
166 -         raise gr.Error("Failed to process images for the selected mode.")
167   
168       target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
169       target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

@@ -172,19 +193,33 @@ def generate_video(gallery_images, mode, prompt, height, width,
172   
173       current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
174   
175 -     resized_image = input_image.resize((target_w, target_h))
176   
177 -     #
178 -     if mode == "FLF2V"
179 - 
180 - 
181 - 
182   
183       with torch.inference_mode():
184           output_frames_list = pipe(
185 - 
186 - 
187 - 
188               generator=torch.Generator(device="cuda").manual_seed(current_seed)
189           ).frames[0]
190   

  1   import torch
  2 + from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
  3   from diffusers.utils import export_to_video
  4   from transformers import CLIPVisionModel
  5   import gradio as gr

 41   SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
 42   MAX_SEED = np.iinfo(np.int32).max
 43   
 44 + FIXED_FPS = 16
 45   MIN_FRAMES_MODEL = 8
 46   MAX_FRAMES_MODEL = 81
 47   
 48   # Default prompts for different modes
 49   MODE_PROMPTS = {
 50 +     "Ref2V": "the playful penguin picks up the green cat eye sunglasses and puts them on",
 51 +     "FLF2V": "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective.",
 52 +     "Random2V": "Various different characters appear and disappear in a fast transition video showcasting their unique features and personalities. The video is about showcasing different dance styles, with each character performing a distinct dance move. The background is a vibrant, colorful stage with dynamic lighting that changes with each dance style. The camera captures close-ups of the dancers' expressions and movements. Highly dynamic, fast-paced music video, with quick cuts and transitions between characters, cinematic, vibrant colors"
 53   }
 54   
 55   default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"

 96       """Update the prompt based on the selected mode"""
 97       return MODE_PROMPTS.get(mode, "")
 98   
 99 + 
100 + def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int, img: PIL.Image.Image = None):
101 +     frames = []
102 +     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
103 +     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
104 +     # match the original code.
105 +     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
106 +     mask_white = PIL.Image.new("L", (width, height), 255)
107 +     mask = [mask_white] * (num_frames)
108 +     return frames, mask
109 + 
110 + def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
111 +     first_img = first_img.resize((width, height))
112 +     last_img = last_img.resize((width, height))
113 +     frames = []
114 +     frames.append(first_img)
115 +     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
116 +     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
117 +     # match the original code.
118 +     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
119 +     frames.append(last_img)
120 +     mask_black = PIL.Image.new("L", (width, height), 0)
121 +     mask_white = PIL.Image.new("L", (width, height), 255)
122 +     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
123 +     return frames, mask
124 + 
125 + def prepare_video_and_mask_Random2V(images: List[PIL.Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
126 +     images = [img.resize((width, height)) for img in images]
127 +     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
128 +     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
129 +     # match the original code.
130 +     frames = [PIL.Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
131   
132 +     mask_black = PIL.Image.new("L", (width, height), 0)
133 +     mask_white = PIL.Image.new("L", (width, height), 255)
134 +     mask = [mask_white] * num_frames
135   
136 +     for img, idx in zip(images, frame_indices):
137 +         assert idx < num_frames
138 +         frames[idx] = img
139 +         mask[idx] = mask_black
140   
141 +     return frames, mask
142   
143   def get_duration(gallery_images, mode, prompt, height, width,
144                    negative_prompt, duration_seconds,

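The three helpers above share one convention: frames and mask are equal-length lists of PIL images, with a black mask on every frame that was supplied directly and a white mask on the grey placeholder frames the model must fill in. A minimal sketch of that behaviour (not part of this commit; it assumes app.py is importable and Pillow is installed, and the 832x480 size and assertions are illustrative only):

import PIL.Image
from app import prepare_video_and_mask_FLF2V

first = PIL.Image.new("RGB", (832, 480), (255, 0, 0))
last = PIL.Image.new("RGB", (832, 480), (0, 0, 255))
frames, mask = prepare_video_and_mask_FLF2V(first_img=first, last_img=last, height=480, width=832, num_frames=81)

assert len(frames) == 81 and len(mask) == 81
assert mask[0].getpixel((0, 0)) == 0      # black mask: the supplied first frame is kept
assert mask[40].getpixel((0, 0)) == 255   # white mask: the middle frames are generated
assert mask[80].getpixel((0, 0)) == 0     # black mask: the supplied last frame is kept
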
181       if gallery_images is None or len(gallery_images) == 0:
182           raise gr.Error("Please upload at least one image to the gallery.")
183   
184 +     if mode == "FLF2V" and len(gallery_images) >= 2:
185 +         gallery_images = gallery_images[:2]
186 +     elif mode == "FLF2V" and len(gallery_images) < 2:
187 +         raise gr.Error("only one image was supplied, but 2 are needed for FLF2V")
188   
189       target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
190       target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

193   
194       current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
195   
196   
197 +     # Process images based on the selected mode
198 +     if mode == "FLF2V":
199 +         frames, mask = prepare_video_and_mask_FLF2V(first_img=gallery_images[0], last_img=gallery_images[1], height=target_h, width=target_w, num_frames=num_frames)
200 +         reference_images=None
201 +     elif mode == "Ref2V":
202 +         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
203 +         reference_images = gallery_images
204 +     else: # mode == "Random2V"
205 +         frames, mask = prepare_video_and_mask_Random2V(images=gallery_images, frame_indices=[0,15,40], height=target_h, width=target_w, num_frames=num_frames)
206 +         reference_images=None
207 + 
208 +     # resized_image = input_image.resize((target_w, target_h))
209 + 
210   
211       with torch.inference_mode():
212           output_frames_list = pipe(
213 +             video=frames,
214 +             mask=mask,
215 +             reference_images=reference_images,
216 +             prompt=prompt,
217 +             negative_prompt=negative_prompt,
218 +             height=target_h,
219 +             width=target_w,
220 +             num_frames=num_frames,
221 +             guidance_scale=float(guidance_scale),
222 +             num_inference_steps=int(steps),
223               generator=torch.Generator(device="cuda").manual_seed(current_seed)
224           ).frames[0]
225   
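In the Random2V branch, frame_indices=[0,15,40] pins the uploaded gallery images at those positions in the generated clip while every other frame stays a grey placeholder with a white (generate) mask. A small sketch of that behaviour (again not part of this commit; it assumes app.py is importable and Pillow is installed, with solid-colour stand-ins for the gallery images):

import PIL.Image
from app import prepare_video_and_mask_Random2V

# Three solid-colour stand-ins for the uploaded gallery images.
imgs = [PIL.Image.new("RGB", (832, 480), c) for c in [(255, 0, 0), (0, 255, 0), (0, 0, 255)]]
frames, mask = prepare_video_and_mask_Random2V(images=imgs, frame_indices=[0, 15, 40], height=480, width=832, num_frames=81)

# Only the pinned positions carry a black (keep) mask; every other frame is generated.
assert [i for i, m in enumerate(mask) if m.getpixel((0, 0)) == 0] == [0, 15, 40]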