linoyts (HF Staff) committed
Commit 9526215 · verified · 1 Parent(s): b268619

Update app.py

Files changed (1):
  1. app.py +72 -37
app.py CHANGED
@@ -1,5 +1,5 @@
  import torch
- from diffusers import AutoencoderKLWan, WanVACEPipeline UniPCMultistepScheduler
+ from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
  from diffusers.utils import export_to_video
  from transformers import CLIPVisionModel
  import gradio as gr
@@ -41,15 +41,15 @@ SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
  SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
  MAX_SEED = np.iinfo(np.int32).max

- FIXED_FPS = 24
+ FIXED_FPS = 16
  MIN_FRAMES_MODEL = 8
  MAX_FRAMES_MODEL = 81

  # Default prompts for different modes
  MODE_PROMPTS = {
-     "Ref2V": "",
-     "FLF2V": "",
-     "Random2V": ""
+     "Ref2V": "the playful penguin picks up the green cat eye sunglasses and puts them on",
+     "FLF2V": "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective.",
+     "Random2V": "Various different characters appear and disappear in a fast transition video showcasting their unique features and personalities. The video is about showcasing different dance styles, with each character performing a distinct dance move. The background is a vibrant, colorful stage with dynamic lighting that changes with each dance style. The camera captures close-ups of the dancers' expressions and movements. Highly dynamic, fast-paced music video, with quick cuts and transitions between characters, cinematic, vibrant colors"
  }

  default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
@@ -96,27 +96,49 @@ def update_prompt_from_mode(mode):
      """Update the prompt based on the selected mode"""
      return MODE_PROMPTS.get(mode, "")

- def process_images_for_mode(images, mode):
-     """Process images based on the selected mode"""
-     if not images or len(images) == 0:
-         return None
- 
-     if mode == "Ref2V":
-         # Use the first image as reference
-         return images[0]
+ 
+ def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int, img: PIL.Image.Image = None):
+     frames = []
+     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
+     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
+     # match the original code.
+     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
+     mask_white = PIL.Image.new("L", (width, height), 255)
+     mask = [mask_white] * (num_frames)
+     return frames, mask
+ 
+ def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
+     first_img = first_img.resize((width, height))
+     last_img = last_img.resize((width, height))
+     frames = []
+     frames.append(first_img)
+     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
+     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
+     # match the original code.
+     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
+     frames.append(last_img)
+     mask_black = PIL.Image.new("L", (width, height), 0)
+     mask_white = PIL.Image.new("L", (width, height), 255)
+     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
+     return frames, mask
+ 
+ def prepare_video_and_mask_Random2V(images: List[PIL.Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
+     images = [img.resize((width, height)) for img in images]
+     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
+     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
+     # match the original code.
+     frames = [PIL.Image.new("RGB", (width, height), (128, 128, 128))] * num_frames

-     elif mode == "FLF2V":
-         # First and Last Frame: blend or interpolate between first and last image
-         if len(images) >= 2:
-             return None
-         else:
-             return images[0]
+     mask_black = PIL.Image.new("L", (width, height), 0)
+     mask_white = PIL.Image.new("L", (width, height), 255)
+     mask = [mask_white] * num_frames

-     elif mode == "Random2V":
-         # Randomly select one image from the gallery
-         return images[0]
+     for img, idx in zip(images, frame_indices):
+         assert idx < num_frames
+         frames[idx] = img
+         mask[idx] = mask_black

-     return images[0]
+     return frames, mask

  def get_duration(gallery_images, mode, prompt, height, width,
                   negative_prompt, duration_seconds,
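
The three prepare_video_and_mask_* helpers added above share one conditioning convention: frames the model should generate are gray (128, 128, 128) placeholders paired with an all-white (255) mask, while frames supplied by the user keep their pixels and get an all-black (0) mask. FLF2V pins the first and last frames, Random2V pins the gallery images at the hard-coded frame_indices, and Ref2V builds placeholders only, handing the gallery to the pipeline as reference_images instead. A minimal standalone sketch of that convention for the Random2V case, using solid-color stand-in images rather than anything from this commit:

import PIL.Image

height, width, num_frames = 480, 832, 81   # example values within the app's slider ranges
frame_indices = [0, 15, 40]                # the indices hard-coded for Random2V in this commit

# Solid-color stand-ins in place of the user's gallery uploads.
images = [PIL.Image.new("RGB", (width, height), c)
          for c in [(255, 0, 0), (0, 255, 0), (0, 0, 255)]]

gray = PIL.Image.new("RGB", (width, height), (128, 128, 128))
black = PIL.Image.new("L", (width, height), 0)
white = PIL.Image.new("L", (width, height), 255)

# Start from "generate everything": gray placeholders with white masks...
frames = [gray] * num_frames
mask = [white] * num_frames

# ...then pin each supplied image at its target index with a black ("keep me") mask.
for img, idx in zip(images, frame_indices):
    frames[idx] = img
    mask[idx] = black

assert sum(m is black for m in mask) == len(frame_indices)
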
@@ -159,11 +181,10 @@ def generate_video(gallery_images, mode, prompt, height, width,
      if gallery_images is None or len(gallery_images) == 0:
          raise gr.Error("Please upload at least one image to the gallery.")

-     # Process images based on the selected mode
-     input_image = process_images_for_mode(gallery_images, mode)
- 
-     if input_image is None:
-         raise gr.Error("Failed to process images for the selected mode.")
+     if mode == "FLF2V" and len(gallery_images) >= 2:
+         gallery_images = gallery_images[:2]
+     elif mode == "FLF2V" and len(gallery_images) < 2:
+         raise gr.Error("only one image was supplied, but 2 are needed for FLF2V")

      target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
      target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
@@ -172,19 +193,33 @@ def generate_video(gallery_images, mode, prompt, height, width,

      current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

-     resized_image = input_image.resize((target_w, target_h))

-     # Mode-specific processing can be added here if needed
-     if mode == "FLF2V" and len(gallery_images) >= 2:
-         # You can add special handling for FLF2V mode here
-         # For example, use both first and last frames in some way
-         pass
+     # Process images based on the selected mode
+     if mode == "FLF2V":
+         frames, mask = prepare_video_and_mask_FLF2V(first_img=gallery_images[0], last_img=gallery_images[1], height=target_h, width=target_w, num_frames=num_frames)
+         reference_images = None
+     elif mode == "Ref2V":
+         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
+         reference_images = gallery_images
+     else:  # mode == "Random2V"
+         frames, mask = prepare_video_and_mask_Random2V(images=gallery_images, frame_indices=[0,15,40], height=target_h, width=target_w, num_frames=num_frames)
+         reference_images = None
+ 
+     # resized_image = input_image.resize((target_w, target_h))
+ 

      with torch.inference_mode():
          output_frames_list = pipe(
-             image=resized_image, prompt=prompt, negative_prompt=negative_prompt,
-             height=target_h, width=target_w, num_frames=num_frames,
-             guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
+             video=frames,
+             mask=mask,
+             reference_images=reference_images,
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             height=target_h,
+             width=target_w,
+             num_frames=num_frames,
+             guidance_scale=float(guidance_scale),
+             num_inference_steps=int(steps),
              generator=torch.Generator(device="cuda").manual_seed(current_seed)
          ).frames[0]

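
The rewritten pipe(...) call above passes video, mask, and reference_images, matching diffusers' WanVACEPipeline rather than the single image argument used before this commit. A rough standalone sketch of the same flow outside Gradio follows; the checkpoint id, 480x832 resolution, flow_shift, guidance scale, and step count are illustrative assumptions, not values taken from this diff.

import torch
import PIL.Image
from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video

# Assumed checkpoint; the model actually loaded by app.py is not shown in this diff.
model_id = "Wan-AI/Wan2.1-VACE-1.3B-diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=3.0)  # assumed shift
pipe.to("cuda")

height, width, num_frames = 480, 832, 81

# FLF2V-style conditioning, mirroring prepare_video_and_mask_FLF2V: real first/last
# frames with black masks, gray placeholders with white masks in between.
first = PIL.Image.open("first.png").resize((width, height))   # placeholder paths
last = PIL.Image.open("last.png").resize((width, height))
gray = PIL.Image.new("RGB", (width, height), (128, 128, 128))
frames = [first, *[gray] * (num_frames - 2), last]
mask = [PIL.Image.new("L", (width, height), 0),
        *[PIL.Image.new("L", (width, height), 255)] * (num_frames - 2),
        PIL.Image.new("L", (width, height), 0)]

with torch.inference_mode():
    video = pipe(
        video=frames,
        mask=mask,
        reference_images=None,
        prompt="CG animation style, a small blue bird takes off from the ground, flapping its wings.",
        negative_prompt="worst quality, low quality, watermark, text",
        height=height,
        width=width,
        num_frames=num_frames,
        guidance_scale=5.0,       # the app exposes this as a slider; 5.0 is an assumed value
        num_inference_steps=30,   # assumed
        generator=torch.Generator(device="cuda").manual_seed(42),
    ).frames[0]

export_to_video(video, "flf2v_output.mp4", fps=16)  # FIXED_FPS after this commit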