Spaces: Running on Zero
Update tqdm descriptions
Files changed:
- addit_flux_pipeline.py +159 -164
- addit_methods.py +8 -2
- app.py +7 -7
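
The change threads a new tqdm_desc keyword argument through the pipeline's call paths so each stage of the Add-it flow labels its own progress bar. A minimal usage sketch of the new argument; the checkpoint name, device, and prompts below are illustrative assumptions, not part of this commit:

import torch
from addit_flux_pipeline import AdditFluxPipeline

# Hypothetical setup: any FLUX checkpoint the pipeline supports would work here.
pipe = AdditFluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# tqdm_desc (new in this commit) becomes the label shown on the denoising progress bar.
out = pipe(
    prompt=["A photo of a cat sitting on the couch",
            "A photo of a cat wearing a blue hat sitting on the couch"],
    num_inference_steps=30,
    tqdm_desc="Running Addit: Generating Edited Image",
)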
addit_flux_pipeline.py
CHANGED
@@ -17,6 +17,7 @@
 # This work is licensed under the LICENSE file
 # located at the root directory.

+from tqdm import tqdm
 from typing import Any, Callable, Dict, List, Optional, Union
 import torch
 import numpy as np
@@ -175,6 +176,9 @@ class AdditFluxPipeline(FluxPipeline):
         is_img_src: bool = False,
         use_offset: bool = False,
         img_src_latents: Optional[List[torch.FloatTensor]] = None,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -401,51 +405,25 @@ class AdditFluxPipeline(FluxPipeline):
             img_src_latents.append((1.0 - sigma) * source_latents[0] + sigma * rand_noise)

         # 6. Denoising loop
-                    continue
-
-                    # For denoising from source image
-                    if is_img_src:
-                        latents[0] = img_src_latents[i]
-
-                    # For Structure Transfer
-                    if (source_latents is not None) and i == structure_transfer_step:
-                        sigma = self.scheduler.sigmas[i]
-                        latents[1] = (1.0 - sigma) * source_latents[0] + sigma * noise[1]
-
-                    if is_auto_extend_scale and i == auto_extended_step:
-                        def f(gamma):
-                            self.attention_store.attention_ratios[i] = {}
-                            noise_pred = self.transformer(
-                                hidden_states=latents,
-                                timestep=timestep / 1000,
-                                guidance=guidance,
-                                pooled_projections=pooled_prompt_embeds,
-                                encoder_hidden_states=prompt_embeds,
-                                txt_ids=text_ids,
-                                img_ids=latent_image_ids,
-                                joint_attention_kwargs=self.joint_attention_kwargs,
-                                return_dict=False,
-                                proccesor_kwargs={"step_index": i, "extended_scale": gamma},
-                            )[0]
-
-                            scores_per_layer = self.attention_store.get_attention_ratios(step_indices=[i], display_imgs=False)
-                            source_sum, text_sum, target_sum = scores_per_layer['transformer_blocks']
-
-                            # We want to find the gamma that makes the ratio equal to K
-                            ratio = (target_sum / source_sum)
-                            return (ratio - target_auto_ratio)
-
-                        gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+            # For denoising from source image
+            if is_img_src:
+                latents[0] = img_src_latents[i]
+
+            # For Structure Transfer
+            if (source_latents is not None) and i == structure_transfer_step:
+                sigma = self.scheduler.sigmas[i]
+                latents[1] = (1.0 - sigma) * source_latents[0] + sigma * noise[1]
+
+            if is_auto_extend_scale and i == auto_extended_step:
+                def f(gamma):
+                    self.attention_store.attention_ratios[i] = {}
                     noise_pred = self.transformer(
                         hidden_states=latents,
                         timestep=timestep / 1000,
@@ -456,47 +434,68 @@ class AdditFluxPipeline(FluxPipeline):
                         img_ids=latent_image_ids,
                         joint_attention_kwargs=self.joint_attention_kwargs,
                         return_dict=False,
-                        proccesor_kwargs={"step_index": i, "extended_scale": extended_scale},
-                    )[0]
-
-                    latents, x0 = self.scheduler.step(noise_pred, t, latents, return_dict=False, step_index=i)
-
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        proccesor_kwargs={"step_index": i, "extended_scale": gamma},
+                    )[0]
+
+                    scores_per_layer = self.attention_store.get_attention_ratios(step_indices=[i], display_imgs=False)
+                    source_sum, text_sum, target_sum = scores_per_layer['transformer_blocks']
+
+                    # We want to find the gamma that makes the ratio equal to K
+                    ratio = (target_sum / source_sum)
+                    return (ratio - target_auto_ratio)
+
+                gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)
+
+                print('Chosen gamma:', gamma_sol)
+                extended_scale = gamma_sol
+            else:
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                    proccesor_kwargs={"step_index": i, "extended_scale": extended_scale},
+                )[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents, x0 = self.scheduler.step(noise_pred, t, latents, return_dict=False, step_index=i)
+
+            if use_offset and is_img_src and (i+1 < len(img_src_latents)):
+                next_latent = img_src_latents[i+1]
+                offset = (next_latent - latents[0])
+                latents[1] = latents[1] + offset
+
+            # blend latents
+            if i in blend_steps and (subject_token is not None) and (localization_model is not None):
+                x0 = self._unpack_latents(x0, height, width, self.vae_scale_factor)
+                x0 = (x0 / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+                images = self.vae.decode(x0, return_dict=False)[0]
+                images = self.image_processor.postprocess(images, output_type="pil")
+
+                self.do_step_blend(images, latents, subject_token, localization_model, show_attention, i, blend_models)
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+            # if XLA_AVAILABLE:
+            #     xm.mark_step()

         if output_type == "latent":
             image = latents
@@ -793,6 +792,9 @@ class AdditFluxPipeline(FluxPipeline):
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -987,49 +989,44 @@ class AdditFluxPipeline(FluxPipeline):
         latent_image_ids = latent_image_ids.expand(latents.shape[0], -1, -1)

         # 6. Denoising loop
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
-
-                    # if XLA_AVAILABLE:
-                    #     xm.mark_step()
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+            noise_pred = self.transformer(
+                hidden_states=latents,
+                timestep=timestep / 1000,
+                guidance=guidance,
+                pooled_projections=pooled_prompt_embeds,
+                encoder_hidden_states=prompt_embeds,
+                txt_ids=text_ids,
+                img_ids=latent_image_ids,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+            # if XLA_AVAILABLE:
+            #     xm.mark_step()

         if output_type == "latent":
             image = latents
@@ -1126,6 +1123,9 @@ class AdditFluxPipeline(FluxPipeline):
         max_sequence_length: int = 512,

         fixed_point_iterations: int = 1,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -1328,60 +1328,55 @@ class AdditFluxPipeline(FluxPipeline):
         latents_list.append(latents)

         # 6. Denoising loop
-                        continue
-
-                    if j == 0:
-                        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                        timestep = timesteps[i].expand(latents.shape[0]).to(latents.dtype)
-                    else:
-                        timestep = timesteps_one_start[i].expand(latents.shape[0]).to(latents.dtype)
-
-                        pooled_projections=pooled_prompt_embeds,
-                        encoder_hidden_states=prompt_embeds,
-                        txt_ids=text_ids,
-                        img_ids=latent_image_ids,
-                        joint_attention_kwargs=self.joint_attention_kwargs,
-                        return_dict=False,
-                    )[0]
-
-                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
-                        latents = latents.to(latents_dtype)
-
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                progress_bar.update()
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            original_latents = latents.clone()
+            for j in range(fixed_point_iterations):
+                if self.interrupt:
+                    continue
+
+                if j == 0:
+                    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                    timestep = timesteps[i].expand(latents.shape[0]).to(latents.dtype)
+                else:
+                    timestep = timesteps_one_start[i].expand(latents.shape[0]).to(latents.dtype)
+
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+
+                # noise_pred = -noise_pred
+                latents = self.scheduler.step(noise_pred, t, original_latents, return_dict=False, step_index=i)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # if XLA_AVAILABLE:
+                #     xm.mark_step()
+
+            latents_list.append(latents)

         # Offload all models
         self.maybe_free_model_hooks()
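
Aside on the auto extended-scale branch above: scipy.optimize.brentq searches the bracket [1.0, 1.2] for the gamma whose attention ratio matches target_auto_ratio. A toy sketch of the same root-finding pattern; the objective below is a made-up monotonic stand-in, not the pipeline's attention-ratio function:

from scipy.optimize import brentq

target_auto_ratio = 1.05  # illustrative target, not a value from the repo

def f(gamma):
    # Stand-in for the real objective, which runs the transformer and reads
    # source/target attention sums from the attention store.
    ratio = 0.9 + 1.5 * (gamma - 1.0)
    return ratio - target_auto_ratio

# brentq needs f(1.0) and f(1.2) to have opposite signs; it returns the root
# to within xtol, matching how the pipeline code calls it.
gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)
print("Chosen gamma:", gamma_sol)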
addit_methods.py
CHANGED
@@ -55,6 +55,9 @@ def _add_object(
         is_img_src=is_img_src,
         img_src_latents=img_src_latents,
         use_offset=use_offset,
+
+        # TQDM
+        tqdm_desc="Running Addit: Generating Edited Image",
     )

     if display_output:
@@ -90,6 +93,7 @@ def add_object_generated(
         num_inference_steps=30,
         seed=[seed_src],
         output_type="both",
+        tqdm_desc="Generating Source Image",
     )
     source_image = source_image[0]

@@ -141,7 +145,8 @@ def add_object_real(
         strength=0.1,
         guidance_scale=3.5,
         output_type="latent",
-        generator=torch.Generator(device=pipe.device).manual_seed(0)
+        generator=torch.Generator(device=pipe.device).manual_seed(0),
+        tqdm_desc="Encoding Source Image",
     ).images

     # Optional inversion step
@@ -157,7 +162,8 @@ def add_object_real(
         num_inference_steps=30,
         guidance_scale=1,
         fixed_point_iterations=2,
-        generator=torch.Generator(device=pipe.device).manual_seed(0)
+        generator=torch.Generator(device=pipe.device).manual_seed(0),
+        tqdm_desc="Inverting Source Image",
     )
     img_src_latents = [x[0] for x in latents_list][::-1]

app.py
CHANGED
@@ -216,8 +216,8 @@ def create_interface():
                 )
                 gen_prompt_target = gr.Textbox(
                     label="Target Prompt",
-                    placeholder="A photo of a cat wearing a
-                    value="A photo of a cat wearing a
+                    placeholder="A photo of a cat wearing a blue hat sitting on the couch",
+                    value="A photo of a cat wearing a blue hat sitting on the couch"
                 )
                 gen_subject_token = gr.Textbox(
                     label="Subject Token",
@@ -227,8 +227,8 @@ def create_interface():
                 )

                 with gr.Accordion("Advanced Settings", open=False):
-                    gen_seed_src = gr.Number(label="Source Seed", value=
-                    gen_seed_obj = gr.Number(label="Object Seed", value=
+                    gen_seed_src = gr.Number(label="Source Seed", value=1, precision=0)
+                    gen_seed_obj = gr.Number(label="Object Seed", value=42, precision=0)
                     gen_extended_scale = gr.Slider(
                         label="Extended Scale",
                         minimum=1.0,
@@ -283,7 +283,7 @@ def create_interface():
             gr.Examples(
                 examples=[
                     ["A photo of a man sitting on a bench", "A photo of a man sitting on a bench with a dog", "dog"],
-                    ["A photo of a cat sitting on the couch", "A photo of a cat wearing a
+                    ["A photo of a cat sitting on the couch", "A photo of a cat wearing a blue hat sitting on the couch", "hat"],
                     ["A car driving through an empty street", "A pink car driving through an empty street", "car"]
                 ],
                 inputs=[
@@ -317,8 +317,8 @@ def create_interface():
                 )

                 with gr.Accordion("Advanced Settings", open=False):
-                    real_seed_src = gr.Number(label="Source Seed", value=
-                    real_seed_obj = gr.Number(label="Object Seed", value=
+                    real_seed_src = gr.Number(label="Source Seed", value=1, precision=0)
+                    real_seed_obj = gr.Number(label="Object Seed", value=0, precision=0)
                     real_extended_scale = gr.Slider(
                         label="Extended Scale",
                         minimum=1.0,
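
Note on the seed inputs: gr.Number(..., precision=0) rounds the component's value to an integer, which keeps the seeds usable directly as generator seeds. A minimal standalone sketch; the component and function names are illustrative and not taken from app.py:

import gradio as gr

def echo_seed(seed):
    # With precision=0 the component already delivers an integer-valued number.
    return f"Seed in use: {int(seed)}"

with gr.Blocks() as demo:
    seed_box = gr.Number(label="Source Seed", value=1, precision=0)
    out_box = gr.Textbox(label="Status")
    seed_box.change(echo_seed, inputs=seed_box, outputs=out_box)

# demo.launch()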