Spaces:

nvidia
/

addit

Running on Zero

File size: 5,107 Bytes

# Copyright (C) 2025 NVIDIA Corporation.  All rights reserved.
#
# This work is licensed under the LICENSE file
# located at the root directory.

import gc
import torch
from visualization_utils import show_images

def _add_object(
    pipe,
    prompts,
    seed_src,
    seed_obj,
    extended_scale,
    source_latents,
    structure_transfer_step,
    subject_token,
    blend_steps,
    show_attention=False,
    localization_model="attention_points_sam",
    is_img_src=False,
    img_src_latents=None,
    use_offset=False,
    display_output=False,
):
    gc.collect()
    torch.cuda.empty_cache()

    out = pipe( 
        prompt=prompts,
        guidance_scale=3.5 if (not is_img_src) else [1,3.5],
        height=1024,
        width=1024,
        max_sequence_length=512,
        num_inference_steps=30,
        seed=[seed_src, seed_obj],
        
        # Extended Attention
        extended_scale=extended_scale,
        extended_steps_multi=10,
        extended_steps_single=20,
        
        # Structure Transfer
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        
        # Latent Blending
        subject_token=subject_token,
        localization_model=localization_model,
        blend_steps=blend_steps,
        show_attention=show_attention,
        
        # Real Image Source
        is_img_src=is_img_src,
        img_src_latents=img_src_latents,
        use_offset=use_offset,

        # TQDM
        tqdm_desc="Running Addit: Generating Edited Image",
    )

    if display_output:
        show_images(out.images)

    return out.images

def add_object_generated(
    pipe,
    prompt_source,
    prompt_object,
    subject_token,
    seed_src,
    seed_obj,
    show_attention=False,
    extended_scale=1.05,
    structure_transfer_step=2,
    blend_steps=None,
    localization_model="attention_points_sam",
    display_output=False
):
    """Generate a source image from a prompt, then run Add-it on it.

    The pipeline is called twice: once with ``prompt_source`` alone to obtain
    the source latents (``output_type="both"``), and once through
    ``_add_object`` with both prompts.

    Args:
        pipe: Callable pipeline used for both the source generation and the
            editing pass.
        prompt_source: Prompt for the source image.
        prompt_object: Second prompt given to the editing pass.
        subject_token: Forwarded to ``_add_object`` as ``subject_token``.
        seed_src: Seed for the source generation; also the first seed of the
            editing pass.
        seed_obj: Second seed of the editing pass.
        show_attention: Forwarded to ``_add_object``.
        extended_scale: Forwarded to ``_add_object``.
        structure_transfer_step: Forwarded to ``_add_object``.
        blend_steps: Forwarded to ``_add_object``. Defaults to ``[15]`` when
            omitted or ``None``.
        localization_model: Forwarded to ``_add_object``.
        display_output: Forwarded to ``_add_object``.

    Returns:
        A ``(src_image, edited_image)`` tuple, unpacked from the two images
        returned by ``_add_object``.
    """
    # Build the default per call so no list object is shared between calls.
    if blend_steps is None:
        blend_steps = [15]

    gc.collect()
    torch.cuda.empty_cache()

    # Generate the source image and its latents for the source seed.
    print('Generating source image...')
    # Only the latents are needed here; the source image that is returned to
    # the caller comes from the editing pass below.
    _, source_latents = pipe(
        prompt=[prompt_source],
        guidance_scale=3.5,
        height=1024,
        width=1024,
        max_sequence_length=512,
        num_inference_steps=30,
        seed=[seed_src],
        output_type="both",
        tqdm_desc="Generating Source Image",
    )

    # Run the core combination logic
    print('Running Addit...')
    src_image, edited_image = _add_object(
        pipe=pipe,
        prompts=[prompt_source, prompt_object],
        subject_token=subject_token,
        seed_src=seed_src,
        seed_obj=seed_obj,
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        extended_scale=extended_scale,
        blend_steps=blend_steps,
        show_attention=show_attention,
        localization_model=localization_model,
        display_output=display_output
    )

    return src_image, edited_image

def add_object_real(
    pipe,
    source_image,
    prompt_source,
    prompt_object,
    subject_token,
    seed_src,
    seed_obj,
    localization_model="attention_points_sam",
    extended_scale=1.05,
    structure_transfer_step=4,
    blend_steps=None,
    use_offset=False,
    show_attention=False,
    use_inversion=False,
    display_output=False
):
    """Run Add-it on an existing image instead of a generated one.

    Steps performed:
      1. ``pipe.call_img2img`` is run on ``source_image`` with
         ``strength=0.1`` and ``output_type="latent"``; the ``.images`` of the
         result are used as the source latents.
      2. If ``use_inversion`` is set, ``pipe.call_invert`` is run on those
         latents and its output is passed on as ``img_src_latents``.
      3. ``_add_object`` is called with ``is_img_src=True``.

    Both pipeline calls in steps 1 and 2 use a generator seeded with ``0`` on
    ``pipe.device``, independent of ``seed_src`` / ``seed_obj``.

    Args:
        pipe: Pipeline object exposing ``call_img2img``, ``call_invert`` and
            ``device``, and callable itself (via ``_add_object``).
        source_image: Image passed to ``pipe.call_img2img`` as ``image``.
        prompt_source: Prompt for the source image.
        prompt_object: Second prompt given to the editing pass.
        subject_token: Forwarded to ``_add_object`` as ``subject_token``.
        seed_src: First seed of the editing pass.
        seed_obj: Second seed of the editing pass.
        localization_model: Forwarded to ``_add_object``.
        extended_scale: Forwarded to ``_add_object``.
        structure_transfer_step: Forwarded to ``_add_object``.
        blend_steps: Forwarded to ``_add_object``. Defaults to ``[20]`` when
            omitted or ``None``.
        use_offset: Forwarded to ``_add_object``.
        show_attention: Forwarded to ``_add_object``.
        use_inversion: When true, run the inversion step and supply its
            latents to the editing pass; otherwise ``img_src_latents`` is
            ``None``.
        display_output: Forwarded to ``_add_object``.

    Returns:
        A ``(src_image, edited_image)`` tuple, unpacked from the two images
        returned by ``_add_object``.
    """
    # Build the default per call so no list object is shared between calls.
    if blend_steps is None:
        blend_steps = [20]

    print('Noising-Denoising Original Image')
    gc.collect()
    torch.cuda.empty_cache()

    # Get initial latents
    source_latents = pipe.call_img2img(
        prompt=prompt_source,
        image=source_image,
        num_inference_steps=30,
        strength=0.1,
        guidance_scale=3.5,
        output_type="latent",
        generator=torch.Generator(device=pipe.device).manual_seed(0),
        tqdm_desc="Encoding Source Image",
    ).images

    # Optional inversion step
    img_src_latents = None
    if use_inversion:
        print('Inverting Image')
        gc.collect()
        torch.cuda.empty_cache()

        latents_list = pipe.call_invert(
            prompt=prompt_source,
            image=source_latents,
            num_inference_steps=30,
            guidance_scale=1,
            fixed_point_iterations=2,
            generator=torch.Generator(device=pipe.device).manual_seed(0),
            tqdm_desc="Inverting Source Image",
        )
        # Keep the first element of every returned item and reverse the
        # order -- presumably so the inversion trajectory lines up with the
        # denoising steps; confirm against the pipeline.
        img_src_latents = [x[0] for x in latents_list][::-1]

    print('Running Addit')
    gc.collect()
    torch.cuda.empty_cache()

    src_image, edited_image = _add_object(
        pipe,
        prompts=[prompt_source, prompt_object],
        seed_src=seed_src,
        seed_obj=seed_obj,
        extended_scale=extended_scale,
        source_latents=source_latents,
        structure_transfer_step=structure_transfer_step,
        subject_token=subject_token,
        blend_steps=blend_steps,
        show_attention=show_attention,
        localization_model=localization_model,
        is_img_src=True,
        img_src_latents=img_src_latents,
        use_offset=use_offset,
        display_output=display_output,
    )

    return src_image, edited_image