# Copyright (C) 2025 NVIDIA Corporation. All rights reserved.
#
# This work is licensed under the LICENSE file
# located at the root directory.

import gc

import torch

from visualization_utils import show_images


def _add_object(
    pipe,
    prompts,
    seed_src,
    seed_obj,
    extended_scale,
    source_latents,
    structure_transfer_step,
    subject_token,
    blend_steps,
    show_attention=False,
    localization_model="attention_points_sam",
    is_img_src=False,
    img_src_latents=None,
    use_offset=False,
    display_output=False,
):
    """Run the core Addit edit: generate a (source, edited) image pair.

    Thin wrapper around ``pipe(...)`` that frees GPU memory first and
    forwards all Addit-specific controls (extended attention, structure
    transfer, latent blending, real-image source options).

    Args:
        pipe: Addit pipeline callable (project type; presumably a Flux-based
            diffusers pipeline — confirm against caller).
        prompts: Two prompts, ``[source_prompt, object_prompt]``.
        seed_src / seed_obj: Seeds for the source and object generations.
        extended_scale: Extended-attention scale factor.
        source_latents: Latents of the source image used for structure transfer.
        structure_transfer_step: Step index at which structure transfer applies.
        subject_token: Token naming the added subject, used for localization.
        blend_steps: Steps at which latent blending is performed.
        show_attention: If True, the pipeline visualizes attention.
        localization_model: Localization backend name.
        is_img_src: True when the source is a real image (changes guidance to
            per-prompt ``[1, 3.5]``).
        img_src_latents: Optional per-step inverted latents for a real source.
        use_offset: Pipeline offset flag (semantics defined by the pipeline).
        display_output: If True, show the resulting images.

    Returns:
        ``out.images`` from the pipeline — a pair of images
        (source, edited) matching the two prompts.
    """
    # Free cached GPU memory before a large generation run.
    gc.collect()
    torch.cuda.empty_cache()

    out = pipe(
        prompt=prompts,
        # Real-image mode uses per-prompt guidance: 1 for the source, 3.5 for the edit.
        guidance_scale=3.5 if (not is_img_src) else [1, 3.5],
        height=1024,
        width=1024,
        max_sequence_length=512,
        num_inference_steps=30,
        seed=[seed_src, seed_obj],
        # Extended Attention
        extended_scale=extended_scale,
        extended_steps_multi=10,
        extended_steps_single=20,
        # Structure Transfer
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        # Latent Blending
        subject_token=subject_token,
        localization_model=localization_model,
        blend_steps=blend_steps,
        show_attention=show_attention,
        # Real Image Source
        is_img_src=is_img_src,
        img_src_latents=img_src_latents,
        use_offset=use_offset,
        # TQDM
        tqdm_desc="Running Addit: Generating Edited Image",
    )

    if display_output:
        show_images(out.images)

    return out.images


def add_object_generated(
    pipe,
    prompt_source,
    prompt_object,
    subject_token,
    seed_src,
    seed_obj,
    show_attention=False,
    extended_scale=1.05,
    structure_transfer_step=2,
    blend_steps=None,
    localization_model="attention_points_sam",
    display_output=False,
):
    """Add an object to a freshly *generated* source image.

    First generates the source image and its latents from ``prompt_source``,
    then runs the Addit edit with ``prompt_object``.

    Args:
        pipe: Addit pipeline callable (project type).
        prompt_source: Prompt describing the source scene.
        prompt_object: Prompt describing the scene with the object added.
        subject_token: Token naming the added subject.
        seed_src / seed_obj: Seeds for source and object generations.
        show_attention: Forwarded to the pipeline.
        extended_scale: Extended-attention scale (default 1.05).
        structure_transfer_step: Structure-transfer step (default 2).
        blend_steps: Latent-blending steps; defaults to ``[15]``.
        localization_model: Localization backend name.
        display_output: If True, display the edited output.

    Returns:
        Tuple ``(src_image, edited_image)``.
    """
    # Avoid a mutable default argument; [15] is the effective default.
    if blend_steps is None:
        blend_steps = [15]

    gc.collect()
    torch.cuda.empty_cache()

    # Generate source image and latents from the source seed.
    print('Generating source image...')
    source_image, source_latents = pipe(
        prompt=[prompt_source],
        guidance_scale=3.5,
        height=1024,
        width=1024,
        max_sequence_length=512,
        num_inference_steps=30,
        seed=[seed_src],
        output_type="both",
        tqdm_desc="Generating Source Image",
    )
    # NOTE(review): unwrapped image is not used below — _add_object regenerates
    # the source as part of its paired output. Kept for parity with original.
    source_image = source_image[0]

    # Run the core combination logic.
    print('Running Addit...')
    src_image, edited_image = _add_object(
        pipe=pipe,
        prompts=[prompt_source, prompt_object],
        subject_token=subject_token,
        seed_src=seed_src,
        seed_obj=seed_obj,
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        extended_scale=extended_scale,
        blend_steps=blend_steps,
        show_attention=show_attention,
        localization_model=localization_model,
        display_output=display_output,
    )

    return src_image, edited_image


def add_object_real(
    pipe,
    source_image,
    prompt_source,
    prompt_object,
    subject_token,
    seed_src,
    seed_obj,
    localization_model="attention_points_sam",
    extended_scale=1.05,
    structure_transfer_step=4,
    blend_steps=None,
    use_offset=False,
    show_attention=False,
    use_inversion=False,
    display_output=False,
):
    """Add an object to a *real* source image.

    Encodes the real image to latents with a light img2img pass
    (strength 0.1), optionally inverts it for per-step source latents,
    then runs the Addit edit.

    Args:
        pipe: Addit pipeline callable exposing ``call_img2img`` and
            ``call_invert`` (project type).
        source_image: The real input image.
        prompt_source: Prompt describing the source scene.
        prompt_object: Prompt describing the scene with the object added.
        subject_token: Token naming the added subject.
        seed_src / seed_obj: Seeds for source and object generations.
        localization_model: Localization backend name.
        extended_scale: Extended-attention scale (default 1.05).
        structure_transfer_step: Structure-transfer step (default 4).
        blend_steps: Latent-blending steps; defaults to ``[20]``.
        use_offset: Forwarded to the pipeline.
        show_attention: Forwarded to the pipeline.
        use_inversion: If True, also invert the image to obtain per-step
            latents for the real-image-source path.
        display_output: If True, display the edited output.

    Returns:
        Tuple ``(src_image, edited_image)``.
    """
    # Avoid a mutable default argument; [20] is the effective default.
    if blend_steps is None:
        blend_steps = [20]

    print('Noising-Denoising Original Image')
    gc.collect()
    torch.cuda.empty_cache()

    # Light img2img pass (strength 0.1) to obtain latents of the real image.
    source_latents = pipe.call_img2img(
        prompt=prompt_source,
        image=source_image,
        num_inference_steps=30,
        strength=0.1,
        guidance_scale=3.5,
        output_type="latent",
        generator=torch.Generator(device=pipe.device).manual_seed(0),
        tqdm_desc="Encoding Source Image",
    ).images

    # Optional inversion step: recover per-step latents, reversed so they
    # run from the noisiest step to the cleanest.
    img_src_latents = None
    if use_inversion:
        print('Inverting Image')
        gc.collect()
        torch.cuda.empty_cache()

        latents_list = pipe.call_invert(
            prompt=prompt_source,
            image=source_latents,
            num_inference_steps=30,
            guidance_scale=1,
            fixed_point_iterations=2,
            generator=torch.Generator(device=pipe.device).manual_seed(0),
            tqdm_desc="Inverting Source Image",
        )
        img_src_latents = [x[0] for x in latents_list][::-1]

    print('Running Addit')
    gc.collect()
    torch.cuda.empty_cache()

    src_image, edited_image = _add_object(
        pipe,
        prompts=[prompt_source, prompt_object],
        seed_src=seed_src,
        seed_obj=seed_obj,
        extended_scale=extended_scale,
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        subject_token=subject_token,
        blend_steps=blend_steps,
        show_attention=show_attention,
        localization_model=localization_model,
        is_img_src=True,
        img_src_latents=img_src_latents,
        use_offset=use_offset,
        display_output=display_output,
    )

    return src_image, edited_image