Fabrice-TIERCELIN committed on
Commit 4e6603f · verified · 1 Parent(s): 7154dc2

Optimization

Files changed (1)
  1. app.py +58 -48
app.py CHANGED
@@ -357,31 +357,36 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length

     H, W, C = input_image.shape
     height, width = find_nearest_bucket(H, W, resolution=resolution)
-    input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
-
-    Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
-
-    input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
-    input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
-
-    # VAE encoding
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(vae, target_device=gpu)
-
-    start_latent = vae_encode(input_image_pt, vae)
-
-    # CLIP Vision
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(image_encoder, target_device=gpu)
+
+    def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+
+        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
+
+        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
+        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
+
+        # VAE encoding
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(vae, target_device=gpu)
+
+        start_latent = vae_encode(input_image_pt, vae)
+
+        # CLIP Vision
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(image_encoder, target_device=gpu)
+
+        return start_latent
+
+    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)

-    image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-    image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
+    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state

     # Dtype

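The change is the same in both hunks: the start-latent preprocessing is wrapped into a local get_start_latent helper instead of running inline (worker here, worker_last_frame below). For reference, a minimal standalone sketch of the tensor layout this preprocessing hands to vae_encode, assuming a plain center crop; preprocess_for_vae is a hypothetical stand-in, and the project's resize_and_center_crop, find_nearest_bucket and vae_encode helpers are deliberately not reproduced:

import numpy as np
import torch

def preprocess_for_vae(image: np.ndarray, height: int, width: int) -> torch.Tensor:
    # Center-crop an (H, W, 3) uint8 frame to (height, width), scale it to [-1, 1],
    # and add batch and time axes, giving the (1, C, 1, H, W) layout used in app.py.
    h, w, _ = image.shape
    top = max((h - height) // 2, 0)
    left = max((w - width) // 2, 0)
    cropped = image[top:top + height, left:left + width]

    tensor = torch.from_numpy(cropped).float() / 127.5 - 1   # uint8 [0, 255] -> [-1, 1]
    return tensor.permute(2, 0, 1)[None, :, None]             # (H, W, C) -> (1, C, 1, H, W)

frame = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
print(preprocess_for_vae(frame, 416, 608).shape)              # torch.Size([1, 3, 1, 416, 608])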
 
 
@@ -573,31 +578,36 @@ def worker_last_frame(input_image, prompts, n_prompt, seed, resolution, total_se

     H, W, C = input_image.shape
     height, width = find_nearest_bucket(H, W, resolution=resolution)
-    input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
-
-    Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
-
-    input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
-    input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
-
-    # VAE encoding
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(vae, target_device=gpu)
-
-    start_latent = vae_encode(input_image_pt, vae)
-
-    # CLIP Vision
-
-    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
-
-    if not high_vram:
-        load_model_as_complete(image_encoder, target_device=gpu)
+
+    def get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram):
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+
+        #Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
+
+        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
+        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
+
+        # VAE encoding
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(vae, target_device=gpu)
+
+        start_latent = vae_encode(input_image_pt, vae)
+
+        # CLIP Vision
+
+        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
+
+        if not high_vram:
+            load_model_as_complete(image_encoder, target_device=gpu)
+
+        return start_latent
+
+    start_latent = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)

-    image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-    image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
+    image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state

     # Dtype

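The if not high_vram: load_model_as_complete(...) calls load each encoder onto the GPU right before its own step when VRAM is limited. A rough sketch of that idea in plain torch; on_gpu is a hypothetical helper, and the project's load_model_as_complete performs its own memory management that this sketch does not reproduce:

from contextlib import contextmanager
import torch

@contextmanager
def on_gpu(module: torch.nn.Module, high_vram: bool = False):
    # Keep the module on the GPU only while the block runs, then move it back
    # to the CPU and release cached VRAM so the next stage has room.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if not high_vram:
        module.to(device)
    try:
        yield module
    finally:
        if not high_vram and device.type == 'cuda':
            module.to('cpu')
            torch.cuda.empty_cache()

# Usage sketch, mirroring the VAE step in the diff:
# with on_gpu(vae, high_vram=high_vram):
#     start_latent = vae_encode(input_image_pt, vae)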