Update tqdm descriptions
- addit_flux_pipeline.py +159 -164
- addit_methods.py +8 -2
- app.py +7 -7
addit_flux_pipeline.py
CHANGED
@@ -17,6 +17,7 @@
 # This work is licensed under the LICENSE file
 # located at the root directory.

+from tqdm import tqdm
 from typing import Any, Callable, Dict, List, Optional, Union
 import torch
 import numpy as np

@@ -175,6 +176,9 @@ class AdditFluxPipeline(FluxPipeline):
         is_img_src: bool = False,
         use_offset: bool = False,
         img_src_latents: Optional[List[torch.FloatTensor]] = None,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.

@@ -401,51 +405,25 @@ class AdditFluxPipeline(FluxPipeline):
             img_src_latents.append((1.0 - sigma) * source_latents[0] + sigma * rand_noise)

         # 6. Denoising loop
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+            # For denoising from source image
+            if is_img_src:
+                latents[0] = img_src_latents[i]
+
+            # For Structure Transfer
+            if (source_latents is not None) and i == structure_transfer_step:
+                sigma = self.scheduler.sigmas[i]
+                latents[1] = (1.0 - sigma) * source_latents[0] + sigma * noise[1]
+
+            if is_auto_extend_scale and i == auto_extended_step:
+                def f(gamma):
+                    self.attention_store.attention_ratios[i] = {}
                     noise_pred = self.transformer(
                         hidden_states=latents,
                         timestep=timestep / 1000,

@@ -456,47 +434,68 @@ class AdditFluxPipeline(FluxPipeline):
                         img_ids=latent_image_ids,
                         joint_attention_kwargs=self.joint_attention_kwargs,
                         return_dict=False,
+                        proccesor_kwargs={"step_index": i, "extended_scale": gamma},
+                    )[0]

+                    scores_per_layer = self.attention_store.get_attention_ratios(step_indices=[i], display_imgs=False)
+                    source_sum, text_sum, target_sum = scores_per_layer['transformer_blocks']

+                    # We want to find the gamma that makes the ratio equal to K
+                    ratio = (target_sum / source_sum)
+                    return (ratio - target_auto_ratio)
+
+                gamma_sol = brentq(f, 1.0, 1.2, xtol=0.01)

+                print('Chosen gamma:', gamma_sol)
+                extended_scale = gamma_sol
+            else:
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                    proccesor_kwargs={"step_index": i, "extended_scale": extended_scale},
+                )[0]

+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents, x0 = self.scheduler.step(noise_pred, t, latents, return_dict=False, step_index=i)

+            if use_offset and is_img_src and (i+1 < len(img_src_latents)):
+                next_latent = img_src_latents[i+1]
+                offset = (next_latent - latents[0])
+                latents[1] = latents[1] + offset

+            # blend latents
+            if i in blend_steps and (subject_token is not None) and (localization_model is not None):
+                x0 = self._unpack_latents(x0, height, width, self.vae_scale_factor)
+                x0 = (x0 / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+                images = self.vae.decode(x0, return_dict=False)[0]
+                images = self.image_processor.postprocess(images, output_type="pil")

+                self.do_step_blend(images, latents, subject_token, localization_model, show_attention, i, blend_models)

+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)

+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+            # if XLA_AVAILABLE:
+            #     xm.mark_step()

         if output_type == "latent":
             image = latents

@@ -793,6 +792,9 @@ class AdditFluxPipeline(FluxPipeline):
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.

@@ -987,49 +989,44 @@ class AdditFluxPipeline(FluxPipeline):
         latent_image_ids = latent_image_ids.expand(latents.shape[0], -1, -1)

         # 6. Denoising loop
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            if self.interrupt:
+                continue
+
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+            noise_pred = self.transformer(
+                hidden_states=latents,
+                timestep=timestep / 1000,
+                guidance=guidance,
+                pooled_projections=pooled_prompt_embeds,
+                encoder_hidden_states=prompt_embeds,
+                txt_ids=text_ids,
+                img_ids=latent_image_ids,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+            # if XLA_AVAILABLE:
+            #     xm.mark_step()

         if output_type == "latent":
             image = latents

@@ -1126,6 +1123,9 @@ class AdditFluxPipeline(FluxPipeline):
         max_sequence_length: int = 512,

         fixed_point_iterations: int = 1,
+
+        # TQDM
+        tqdm_desc: str = "Denoising",
     ):
         r"""
         Function invoked when calling the pipeline for generation.

@@ -1328,60 +1328,55 @@ class AdditFluxPipeline(FluxPipeline):
             latents_list.append(latents)

         # 6. Denoising loop
-                    progress_bar.update()
+        for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
+            original_latents = latents.clone()
+            for j in range(fixed_point_iterations):
+                if self.interrupt:
+                    continue

+                if j == 0:
+                    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                    timestep = timesteps[i].expand(latents.shape[0]).to(latents.dtype)
+                else:
+                    timestep = timesteps_one_start[i].expand(latents.shape[0]).to(latents.dtype)

+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]

+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype

+                # noise_pred = -noise_pred
+                latents = self.scheduler.step(noise_pred, t, original_latents, return_dict=False, step_index=i)[0]

+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)

+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

+                # if XLA_AVAILABLE:
+                #     xm.mark_step()
+
+            latents_list.append(latents)

         # Offload all models
         self.maybe_free_model_hooks()
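The hunks above replace the pipeline's internal progress bar with a tqdm iterator whose label comes from the new `tqdm_desc` argument. The sketch below shows that pattern in isolation; the function name `run_denoising_loop` and the dummy timestep tensor are illustrative, not part of the repository.

```python
import torch
from tqdm import tqdm

def run_denoising_loop(timesteps: torch.Tensor, tqdm_desc: str = "Denoising") -> None:
    """Illustrative stand-in for a denoising loop whose progress label is caller-supplied."""
    for i, t in enumerate(tqdm(timesteps, desc=tqdm_desc)):
        # per-step work (noise prediction, scheduler step) would happen here
        pass

# Each call site can now label its own stage:
run_denoising_loop(torch.linspace(1.0, 0.0, 30), tqdm_desc="Inverting Source Image")
```

Separately from the progress bars, the auto-extend-scale branch visible in the same hunk picks `gamma` with a Brent root search over the attention ratio. A toy version of that search, with a made-up monotone ratio function standing in for `attention_store.get_attention_ratios(...)` and an illustrative target value, looks like this:

```python
from scipy.optimize import brentq

target_auto_ratio = 1.05  # illustrative target, not the repository's default

def toy_ratio(gamma: float) -> float:
    # stand-in for the measured target/source attention ratio; monotone in gamma
    return 0.9 + 0.8 * (gamma - 1.0)

# Find gamma in [1.0, 1.2] whose ratio matches the target, as the pipeline does.
gamma_sol = brentq(lambda g: toy_ratio(g) - target_auto_ratio, 1.0, 1.2, xtol=0.01)
print("Chosen gamma:", gamma_sol)
```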
addit_methods.py
CHANGED
@@ -55,6 +55,9 @@ def _add_object(
         is_img_src=is_img_src,
         img_src_latents=img_src_latents,
         use_offset=use_offset,
+
+        # TQDM
+        tqdm_desc="Running Addit: Generating Edited Image",
     )

     if display_output:

@@ -90,6 +93,7 @@ def add_object_generated(
         num_inference_steps=30,
         seed=[seed_src],
         output_type="both",
+        tqdm_desc="Generating Source Image",
     )
     source_image = source_image[0]

@@ -141,7 +145,8 @@ def add_object_real(
         strength=0.1,
         guidance_scale=3.5,
         output_type="latent",
-        generator=torch.Generator(device=pipe.device).manual_seed(0)
+        generator=torch.Generator(device=pipe.device).manual_seed(0),
+        tqdm_desc="Encoding Source Image",
     ).images

     # Optional inversion step

@@ -157,7 +162,8 @@ def add_object_real(
         num_inference_steps=30,
         guidance_scale=1,
         fixed_point_iterations=2,
-        generator=torch.Generator(device=pipe.device).manual_seed(0)
+        generator=torch.Generator(device=pipe.device).manual_seed(0),
+        tqdm_desc="Inverting Source Image",
     )
     img_src_latents = [x[0] for x in latents_list][::-1]
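The descriptions set above are forwarded into the pipeline's progress bars, so each stage of an Add-it run is labeled while it executes. A hypothetical caller-side sketch of the same idea (argument lists abbreviated; see the functions above for the real call signatures):

```python
def run_stages(pipe, prompt_src, prompt_tgt):
    # Hypothetical wrapper: each pipeline call labels its own progress bar.
    source = pipe(prompt=prompt_src, num_inference_steps=30,
                  tqdm_desc="Generating Source Image")
    edited = pipe(prompt=[prompt_src, prompt_tgt], num_inference_steps=30,
                  tqdm_desc="Running Addit: Generating Edited Image")
    return source, edited
```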
app.py
CHANGED
@@ -216,8 +216,8 @@ def create_interface():
                )
                gen_prompt_target = gr.Textbox(
                    label="Target Prompt",
-                    placeholder="A photo of a cat wearing a
-                    value="A photo of a cat wearing a
+                    placeholder="A photo of a cat wearing a blue hat sitting on the couch",
+                    value="A photo of a cat wearing a blue hat sitting on the couch"
                )
                gen_subject_token = gr.Textbox(
                    label="Subject Token",

@@ -227,8 +227,8 @@ def create_interface():
                )

                with gr.Accordion("Advanced Settings", open=False):
-                    gen_seed_src = gr.Number(label="Source Seed", value=
-                    gen_seed_obj = gr.Number(label="Object Seed", value=
+                    gen_seed_src = gr.Number(label="Source Seed", value=1, precision=0)
+                    gen_seed_obj = gr.Number(label="Object Seed", value=42, precision=0)
                    gen_extended_scale = gr.Slider(
                        label="Extended Scale",
                        minimum=1.0,

@@ -283,7 +283,7 @@ def create_interface():
            gr.Examples(
                examples=[
                    ["A photo of a man sitting on a bench", "A photo of a man sitting on a bench with a dog", "dog"],
-                    ["A photo of a cat sitting on the couch", "A photo of a cat wearing a
+                    ["A photo of a cat sitting on the couch", "A photo of a cat wearing a blue hat sitting on the couch", "hat"],
                    ["A car driving through an empty street", "A pink car driving through an empty street", "car"]
                ],
                inputs=[

@@ -317,8 +317,8 @@ def create_interface():
                )

                with gr.Accordion("Advanced Settings", open=False):
-                    real_seed_src = gr.Number(label="Source Seed", value=
-                    real_seed_obj = gr.Number(label="Object Seed", value=
+                    real_seed_src = gr.Number(label="Source Seed", value=1, precision=0)
+                    real_seed_obj = gr.Number(label="Object Seed", value=0, precision=0)
                    real_extended_scale = gr.Slider(
                        label="Extended Scale",
                        minimum=1.0,
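For reference, `precision=0` on `gr.Number` is what keeps the seed fields integer-valued before they reach the random generators. A minimal standalone demo of that input pattern (the layout and callback are illustrative, not the app's actual wiring):

```python
import gradio as gr

def show_seeds(seed_src, seed_obj):
    # precision=0 already coerces the incoming values to ints
    return f"Source seed: {seed_src}, Object seed: {seed_obj}"

with gr.Blocks() as demo:
    seed_src = gr.Number(label="Source Seed", value=1, precision=0)
    seed_obj = gr.Number(label="Object Seed", value=42, precision=0)
    out = gr.Textbox(label="Seeds")
    gr.Button("Show").click(show_seeds, inputs=[seed_src, seed_obj], outputs=out)

# demo.launch()  # uncomment to run locally
```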