Optimization
app.py
CHANGED
@@ -357,31 +357,36 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length

     H, W, C = input_image.shape
     height, width = find_nearest_bucket(H, W, resolution=resolution)
-    [22 removed lines whose content was not captured in this view]
+
+    def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+
+        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
+
+        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
+        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
+
+        # VAE encoding
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(vae, target_device=gpu)
+
+        start_latent = vae_encode(input_image_pt, vae)
+
+        # CLIP Vision
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(image_encoder, target_device=gpu)
+
+        return start_latent
+
+    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)

-    [1 removed line whose content was not captured in this view]
-    image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
+    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state

     # Dtype

@@ -573,31 +578,36 @@ def worker_last_frame(input_image, prompts, n_prompt, seed, resolution, total_se

     H, W, C = input_image.shape
     height, width = find_nearest_bucket(H, W, resolution=resolution)
-    [22 removed lines whose content was not captured in this view]
+
+    def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+
+        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
+
+        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
+        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
+
+        # VAE encoding
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(vae, target_device=gpu)
+
+        start_latent = vae_encode(input_image_pt, vae)
+
+        # CLIP Vision
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(image_encoder, target_device=gpu)
+
+        return start_latent
+
+    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)

-    [1 removed line whose content was not captured in this view]
-    image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
+    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state

     # Dtype
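Both hunks make the same change: the preprocessing and VAE-encoding steps that previously ran inline in worker and worker_last_frame are wrapped in a nested get_start_latent helper, which is then called once per invocation. When reading the new code, note that input_image_np is assigned inside the helper, while the hf_clip_vision_encode(...) call added after it still references that name. The snippet below is a minimal, self-contained sketch, not the project's code, of how the same pattern can keep that value available to the caller by returning it alongside the latent; resize_and_center_crop and vae_encode here are stand-in stubs, and the extra return value is an assumption rather than what the diff above does.

import numpy as np
import torch

# Stand-in stubs for the project's helpers; simplified assumptions, not the
# real FramePack implementations.
def resize_and_center_crop(image, target_width, target_height):
    # Naive crop to the requested size (the real helper resizes, then center-crops).
    return image[:target_height, :target_width]

def vae_encode(image_pt, vae):
    # The real helper runs the VAE encoder; here `vae` is any callable.
    return vae(image_pt)

def get_start_latent(input_image, height, width, vae):
    # Same preprocessing as in the diff: resize/crop, scale pixels to [-1, 1],
    # then add batch and time dimensions -> shape (1, C, 1, H, W).
    input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
    input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
    input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
    start_latent = vae_encode(input_image_pt, vae)
    # Returning input_image_np as well keeps the preprocessed frame in scope for
    # the caller's CLIP Vision step (an assumption; the diff returns only start_latent).
    return start_latent, input_image_np

if __name__ == '__main__':
    dummy_vae = lambda x: x.mean(dim=(3, 4))  # placeholder "encoder" for the sketch
    frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
    start_latent, frame_np = get_start_latent(frame, height=416, width=608, vae=dummy_vae)
    print(start_latent.shape, frame_np.shape)  # torch.Size([1, 3, 1]) (416, 608, 3)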