afeng commited on
Commit
d807efd
·
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ example1_512/
2
+ example1_1024/
3
+ example1_example2_512/
4
+ example1_example2_1024/
5
+ example1/
6
+ old/
7
+
8
+ out_active.png
9
+ out_mask.png
10
+ out_soft.png
11
+
12
+ # Byte-compiled / optimized / DLL files
13
+ __pycache__/
14
+ *.py[cod]
15
+ *$py.class
16
+
17
+ # C extensions
18
+ *.so
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ parts/
31
+ sdist/
32
+ var/
33
+ wheels/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py,cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+ cover/
64
+
65
+ # Translations
66
+ *.mo
67
+ *.pot
68
+
69
+ # Django stuff:
70
+ *.log
71
+ local_settings.py
72
+ db.sqlite3
73
+ db.sqlite3-journal
74
+
75
+ # Flask stuff:
76
+ instance/
77
+ .webassets-cache
78
+
79
+ # Scrapy stuff:
80
+ .scrapy
81
+
82
+ # Sphinx documentation
83
+ docs/_build/
84
+
85
+ # PyBuilder
86
+ .pybuilder/
87
+ target/
88
+
89
+ # Jupyter Notebook
90
+ .ipynb_checkpoints
91
+
92
+ # IPython
93
+ profile_default/
94
+ ipython_config.py
95
+
96
+ # pyenv
97
+ # For a library or package, you might want to ignore these files since the code is
98
+ # intended to run in multiple environments; otherwise, check them in:
99
+ # .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # poetry
109
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113
+ #poetry.lock
114
+
115
+ # pdm
116
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
117
+ #pdm.lock
118
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
119
+ # in version control.
120
+ # https://pdm.fming.dev/#use-with-ide
121
+ .pdm.toml
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
README.md ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1>An item is Worth a Prompt: Versatile Image Editing with Disentangled Control</h1>
3
+
4
+
5
+
6
+ <a href='https://arxiv.org/abs/2403.04880'><img src='https://img.shields.io/badge/Technique-Report-red'></a>
7
+
8
+
9
+ </div>
10
+ D-Edit is a versatile image editing framework based on diffusion models, supporting text, image, mask-based editing.
11
+
12
+ <!-- <img src='assets/applications.png'> -->
13
+ ## Release
14
+ - [2024/03/12] 🔥 Code uploaded.
15
+
16
+
17
+
18
+
19
+ ## 🔥 Examples
20
+
21
+ <p align="center">
22
+ <img alt="text" src="assets/demo1.gif" width="45%">
23
+ &nbsp; &nbsp; &nbsp; &nbsp;
24
+ <img alt="image" src="assets/demo2.gif" width="45%">
25
+ </p>
26
+
27
+ 1. **Text-Guided Editing**:Allows users to select an object within an image and replace or refine it based on a text description.
28
+ - Key features:
29
+ - Generates more realistic details and smoother transitions than alternative methods
30
+ - Focuses edits specifically on the targeted object
31
+ - Preserves unrelated parts of the image
32
+
33
+ 2. **Image-Guided Editing**: Enables users to choose an object from a reference image and transplant it into another image while preserving its identity.
34
+ - Key features:
35
+ - Ensures seamless integration of the object into the new context
36
+ - Adapts the object's appearance to match the target image's style
37
+ - Works effectively even when the object's appearance differs significantly between reference and target images
38
+
39
+
40
+
41
+ <p align="center">
42
+ <img alt="mask" src="assets/demo3.gif" width="45%">
43
+ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
44
+ <img alt="remove" src="assets/demo4.gif" width="45%">
45
+ </p>
46
+
47
+
48
+
49
+ 3. **Mask-Based Editing**: Involves manipulating objects by directly editing their masks.
50
+ - Key features:
51
+ - Allows for operations like moving, reshaping, resizing, and refining objects
52
+ - Fills in new details according to the object's associated prompt
53
+ - Produces natural-looking results that maintain consistency with the overall image
54
+
55
+ 4. **Item Removal**: Enables users to remove objects from images by deleting the mask-object associations.
56
+ - Key features:
57
+ - Intelligently fills in the empty space left by removed objects
58
+ - Ensures a coherent final image
59
+ - Maintains the integrity of the surrounding image elements
60
+
61
+ ## 🔧 Dependencies and Installation
62
+ - Python >= 3.8 (Recommend to use [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
63
+ - [PyTorch >= 2.1.0](https://pytorch.org/)
64
+ ```bash
65
+ conda create --name dedit python=3.10
66
+ conda activate dedit
67
+ pip install -U pip
68
+
69
+ # Install requirements
70
+ pip install -r requirements.txt
71
+ ```
72
+
73
+
74
+ ## 💻 Run
75
+
76
+ ### 1. Segmentation
77
+ Put the image (of any resolution) to be edited into the folder with a specified name, and rename the image as "img.png" or "img.jpg".
78
+ Then run the segmentation model
79
+ ```
80
+ sh ./scripts/run_segment.sh
81
+ ```
82
+ Alternatively, run [GroundedSAM](https://github.com/IDEA-Research/Grounded-Segment-Anything) to detect with text prompt
83
+ ```
84
+ sh ./scripts/run_segmentSAM.sh
85
+ ```
86
+
87
+ Optionally, if segmentation is not good, refine masks with GUI by locally running the mask editing web:
88
+ ```
89
+ python ui_edit_mask.py
90
+ ```
91
+ For image-based editing, repeat this step for both reference and target images.
92
+
93
+ ### 2. Model Finetuning
94
+ Finetune UNet cross-attention layer of diffusion models by running
95
+ ```
96
+ sh ./scripts/sdxl/run_ft_sdxl_1024.sh
97
+ ```
98
+ or finetune full UNet with lora
99
+ ```
100
+ sh ./scripts/sdxl/run_ft_sdxl_1024_fulllora.sh
101
+ ```
102
+ If image-based editing is needed, finetune the model with both reference and target images using
103
+
104
+ ```
105
+ sh ./scripts/sdxl/run_ft_sdxl_1024_fulllora_2imgs.sh
106
+ ```
107
+
108
+ ### 3. Edit \!
109
+ #### 3.1 Reconstruction
110
+ To see if the original image can be constructed
111
+ ```
112
+ sh ./scripts/sdxl/run_recon.sh
113
+ ```
114
+ #### 3.1 Text-based
115
+ Replace the target item (tgt_index) with the item described by the text prompt (tgt_prompt)
116
+ ```
117
+ sh ./scripts/sdxl/run_text.sh
118
+ ```
119
+ #### 3.2 Image-based
120
+ Replace the target item (tgt_index) in the target image (tgt_name) with the item (src_index) in the reference image
121
+ ```
122
+ sh ./scripts/sdxl/run_image.sh
123
+ ```
124
+ #### 3.3 Mask-based
125
+ For target items (tgt_indices_list), resize it (resize_list), move it (delta_x, delta_y) or reshape it by manually editing the mask shape (using UI).
126
+
127
+ The resulting new masks (processed by a simple algorithm) can be visualized in './example1/move_resize/seg_move_resize.png', if it is not reasonable, edit using the UI.
128
+
129
+ ```
130
+ sh ./scripts/sdxl/run_move_resize.sh
131
+ ```
132
+ #### 3.4 Remove
133
+ Remove the target item (tgt_index), the remaining region will be reassigned to the nearby regions with a simple algorithm.
134
+ The resulting new masks (processed by a simple algorithm) can be visualized in './example1/remove/seg_removed.png', if it is not reasonable, edit using the UI.
135
+
136
+ ```
137
+ sh ./scripts/sdxl/run_move_resize.sh
138
+ ```
139
+
140
+ #### 3.4 General editing parameters
141
+ - We partition the image into three regions as shown below. Regions with the hard mask are frozen, regions with the active mask are generated with diffusion model, and regions with soft mask keep the original content in the first "strength*N" sampling steps.
142
+ <p align="center">
143
+ <img src="assets/mask_def.png" height=200>
144
+ </p>
145
+
146
+ - During editing, if you use an edited segmentation that is different from finetuning, add --load_edited_mask; For mask-based and remove, if you edit the masks automatically processed by the algorithm as mentioned, add --load_edited_processed_mask.
147
+
148
+ ### Cite
149
+ If you find D-Edit useful for your research and applications, please cite us using this BibTeX:
150
+
151
+ ```bibtex
152
+ @article{feng2024dedit,
153
+ title={An item is Worth a Prompt: Versatile Image Editing with Disentangled Control},
154
+ author={Aosong Feng, Weikang Qiu, Jinbin Bai, Kaicheng Zhou, Zhen Dong, Xiao Zhang, Rex Ying, and Leandros Tassiulas},
155
+ journal={arXiv preprint arXiv:2403.04880},
156
+ year={2024}
157
+ }
158
+ ```
app.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import copy
4
+ from PIL import Image
5
+ import matplotlib
6
+ import numpy as np
7
+ import gradio as gr
8
+ from utils import load_mask, load_mask_edit
9
+ from utils_mask import process_mask_to_follow_priority, mask_union, visualize_mask_list_clean
10
+ from pathlib import Path
11
+ import subprocess
12
+ from PIL import Image
13
+
14
+ LENGTH=512 #length of the square area displaying/editing images
15
+ TRANSPARENCY = 150 # transparency of the mask in display
16
+
17
+ def add_mask(mask_np_list_updated, mask_label_list):
18
+ mask_new = np.zeros_like(mask_np_list_updated[0])
19
+ mask_np_list_updated.append(mask_new)
20
+ mask_label_list.append("new")
21
+ return mask_np_list_updated, mask_label_list
22
+
23
+ def create_segmentation(mask_np_list):
24
+ viridis = matplotlib.pyplot.get_cmap(name = 'viridis', lut = len(mask_np_list))
25
+ segmentation = 0
26
+ for i, m in enumerate(mask_np_list):
27
+ color = matplotlib.colors.to_rgb(viridis(i))
28
+ color_mat = np.ones_like(m)
29
+ color_mat = np.stack([color_mat*color[0], color_mat*color[1],color_mat*color[2] ], axis = 2)
30
+ color_mat = color_mat * m[:,:,np.newaxis]
31
+ segmentation += color_mat
32
+ segmentation = Image.fromarray(np.uint8(segmentation*255))
33
+ return segmentation
34
+
35
+ def load_mask_ui(input_folder,load_edit = False):
36
+ if not load_edit:
37
+ mask_list, mask_label_list = load_mask(input_folder)
38
+ else:
39
+ mask_list, mask_label_list = load_mask_edit(input_folder)
40
+
41
+ mask_np_list = []
42
+ for m in mask_list:
43
+ mask_np_list. append( m.cpu().numpy())
44
+
45
+ return mask_np_list, mask_label_list
46
+
47
+ def load_image_ui(input_folder, load_edit):
48
+ try:
49
+ for img_path in Path(input_folder).iterdir():
50
+ if img_path.name in ["img.png", "img_1024.png", "img_512.png"]:
51
+ image = Image.open(img_path)
52
+ mask_np_list, mask_label_list = load_mask_ui(input_folder, load_edit = load_edit)
53
+ image = image.convert('RGB')
54
+ segmentation = create_segmentation(mask_np_list)
55
+ return image, segmentation, mask_np_list, mask_label_list, image
56
+ except:
57
+ print("Image folder invalid: The folder should contain image.png")
58
+ return None, None, None, None, None
59
+
60
+ def run_segmentation(input_folder):
61
+ subprocess.run(["python", "segment.py" , "--name={}".format(input_folder)])
62
+ return
63
+
64
+
65
+
66
+ def run_edit_text(
67
+ input_folder,
68
+ num_tokens,
69
+ num_sampling_steps,
70
+ strength,
71
+ edge_thickness,
72
+ tgt_prompt,
73
+ tgt_idx,
74
+ guidance_scale
75
+ ):
76
+ subprocess.run(["python",
77
+ "main.py" ,
78
+ "--text",
79
+ "--name={}".format(input_folder),
80
+ "--dpm={}".format("sd"),
81
+ "--resolution={}".format(512),
82
+ "--load_trained",
83
+ "--num_tokens={}".format(num_tokens),
84
+ "--seed={}".format(2024),
85
+ "--guidance_scale={}".format(guidance_scale),
86
+ "--num_sampling_step={}".format(num_sampling_steps),
87
+ "--strength={}".format(strength),
88
+ "--edge_thickness={}".format(edge_thickness),
89
+ "--num_imgs={}".format(2),
90
+ "--tgt_prompt={}".format(tgt_prompt) ,
91
+ "--tgt_index={}".format(tgt_idx)
92
+ ])
93
+
94
+ return Image.open(os.path.join(input_folder, "text", "out_text_0.png"))
95
+
96
+
97
+ def run_optimization(
98
+ input_folder,
99
+ num_tokens,
100
+ embedding_learning_rate,
101
+ max_emb_train_steps,
102
+ diffusion_model_learning_rate,
103
+ max_diffusion_train_steps,
104
+ train_batch_size,
105
+ gradient_accumulation_steps
106
+ ):
107
+ subprocess.run(["python",
108
+ "main.py" ,
109
+ "--name={}".format(input_folder),
110
+ "--dpm={}".format("sd"),
111
+ "--resolution={}".format(512),
112
+ "--num_tokens={}".format(num_tokens),
113
+ "--embedding_learning_rate={}".format(embedding_learning_rate),
114
+ "--diffusion_model_learning_rate={}".format(diffusion_model_learning_rate),
115
+ "--max_emb_train_steps={}".format(max_emb_train_steps),
116
+ "--max_diffusion_train_steps={}".format(max_diffusion_train_steps),
117
+ "--train_batch_size={}".format(train_batch_size),
118
+ "--gradient_accumulation_steps={}".format(gradient_accumulation_steps)
119
+
120
+ ])
121
+ return
122
+
123
+
124
+ def transparent_paste_with_mask(backimg, foreimg, mask_np,transparency = 128):
125
+ backimg_solid_np = np.array(backimg)
126
+ bimg = backimg.copy()
127
+ fimg = foreimg.copy()
128
+ fimg.putalpha(transparency)
129
+ bimg.paste(fimg, (0,0), fimg)
130
+
131
+ bimg_np = np.array(bimg)
132
+ mask_np = mask_np[:,:,np.newaxis]
133
+ try:
134
+ new_img_np = bimg_np*mask_np + (1-mask_np)* backimg_solid_np
135
+ return Image.fromarray(new_img_np)
136
+ except:
137
+ import pdb; pdb.set_trace()
138
+
139
+ def show_segmentation(image, segmentation, flag):
140
+ if flag is False:
141
+ flag = True
142
+ mask_np = np.ones([image.size[0],image.size[1]]).astype(np.uint8)
143
+ image_edit = transparent_paste_with_mask(image, segmentation, mask_np ,transparency = TRANSPARENCY)
144
+ return image_edit, flag
145
+ else:
146
+ flag = False
147
+ return image,flag
148
+
149
+ def edit_mask_add(canvas, image, idx, mask_np_list):
150
+ mask_sel = mask_np_list[idx]
151
+ mask_new = np.uint8(canvas["mask"][:, :, 0]/ 255.)
152
+ mask_np_list_updated = []
153
+ for midx, m in enumerate(mask_np_list):
154
+ if midx == idx:
155
+ mask_np_list_updated.append(mask_union(mask_sel, mask_new))
156
+ else:
157
+ mask_np_list_updated.append(m)
158
+
159
+ priority_list = [0 for _ in range(len(mask_np_list_updated))]
160
+ priority_list[idx] = 1
161
+ mask_np_list_updated = process_mask_to_follow_priority(mask_np_list_updated, priority_list)
162
+ mask_ones = np.ones([mask_sel.shape[0], mask_sel.shape[1]]).astype(np.uint8)
163
+ segmentation = create_segmentation(mask_np_list_updated)
164
+ image_edit = transparent_paste_with_mask(image, segmentation, mask_ones ,transparency = TRANSPARENCY)
165
+ return mask_np_list_updated, image_edit
166
+
167
+ def slider_release(index, image, mask_np_list_updated, mask_label_list):
168
+ if index > len(mask_np_list_updated):
169
+ return image, "out of range"
170
+ else:
171
+ mask_np = mask_np_list_updated[index]
172
+ mask_label = mask_label_list[index]
173
+ segmentation = create_segmentation(mask_np_list_updated)
174
+ new_image = transparent_paste_with_mask(image, segmentation, mask_np, transparency = TRANSPARENCY)
175
+ return new_image, mask_label
176
+
177
+ def save_as_orig_mask(mask_np_list_updated, mask_label_list, input_folder):
178
+ try:
179
+ assert np.all(sum(mask_np_list_updated)==1)
180
+ except:
181
+ print("please check mask")
182
+ # plt.imsave( "out_mask.png", mask_list_edit[0])
183
+ import pdb; pdb.set_trace()
184
+
185
+ for midx, (mask, mask_label) in enumerate(zip(mask_np_list_updated, mask_label_list)):
186
+ # np.save(os.path.join(input_folder, "maskEDIT{}_{}.npy".format(midx, mask_label)),mask )
187
+ np.save(os.path.join(input_folder, "mask{}_{}.npy".format(midx, mask_label)),mask )
188
+ savepath = os.path.join(input_folder, "seg_current.png")
189
+ visualize_mask_list_clean(mask_np_list_updated, savepath)
190
+
191
+ def save_as_edit_mask(mask_np_list_updated, mask_label_list, input_folder):
192
+ try:
193
+ assert np.all(sum(mask_np_list_updated)==1)
194
+ except:
195
+ print("please check mask")
196
+ # plt.imsave( "out_mask.png", mask_list_edit[0])
197
+ import pdb; pdb.set_trace()
198
+ for midx, (mask, mask_label) in enumerate(zip(mask_np_list_updated, mask_label_list)):
199
+ np.save(os.path.join(input_folder, "maskEdited{}_{}.npy".format(midx, mask_label)), mask)
200
+ savepath = os.path.join(input_folder, "seg_edited.png")
201
+ visualize_mask_list_clean(mask_np_list_updated, savepath)
202
+
203
+ with gr.Blocks() as demo:
204
+ image = gr.State() # store mask
205
+ image_loaded = gr.State()
206
+ segmentation = gr.State()
207
+
208
+ mask_np_list = gr.State([])
209
+ mask_label_list = gr.State([])
210
+ mask_np_list_updated = gr.State([])
211
+ true = gr.State(True)
212
+ false = gr.State(False)
213
+
214
+
215
+ with gr.Row():
216
+ gr.Markdown("""# D-Edit""")
217
+
218
+ with gr.Tab(label="1 Edit mask"):
219
+ with gr.Row():
220
+ with gr.Column():
221
+ canvas = gr.Image(value = None, type="numpy", tool="sketch", label="Draw Mask", show_label=True, height=LENGTH, width=LENGTH, interactive=True)
222
+ input_folder = gr.Textbox(value="example1", label="input folder", interactive= True, )
223
+
224
+ segment_button = gr.Button("1.1 Run segmentation")
225
+ segment_button.click(run_segmentation,
226
+ [input_folder] ,
227
+ [] )
228
+
229
+
230
+ text_button = gr.Button("1.2 Load original masks")
231
+ text_button.click(load_image_ui,
232
+ [input_folder, false] ,
233
+ [image_loaded, segmentation, mask_np_list, mask_label_list, canvas] )
234
+
235
+ load_edit_button = gr.Button("1.2 Load edited masks")
236
+ load_edit_button.click(load_image_ui,
237
+ [input_folder, true] ,
238
+ [image_loaded, segmentation, mask_np_list, mask_label_list, canvas] )
239
+
240
+ show_segment = gr.Checkbox(label = "Show Segmentation")
241
+
242
+ flag = gr.State(False)
243
+ show_segment.select(show_segmentation,
244
+ [image_loaded, segmentation, flag],
245
+ [canvas, flag])
246
+
247
+ mask_np_list_updated = copy.deepcopy(mask_np_list)
248
+
249
+ with gr.Column():
250
+ gr.Markdown("""<p style="text-align: center; font-size: 20px">Draw Mask</p>""")
251
+ slider = gr.Slider(0, 20, step=1, interactive=True)
252
+ label = gr.Textbox()
253
+ slider.release(slider_release,
254
+ inputs = [slider, image_loaded, mask_np_list_updated, mask_label_list],
255
+ outputs= [canvas, label]
256
+ )
257
+ add_button = gr.Button("Add")
258
+ add_button.click( edit_mask_add,
259
+ [canvas, image_loaded, slider, mask_np_list_updated] ,
260
+ [mask_np_list_updated, canvas]
261
+ )
262
+
263
+ save_button2 = gr.Button("Set and Save as edited masks")
264
+ save_button2.click( save_as_edit_mask,
265
+ [mask_np_list_updated, mask_label_list, input_folder] ,
266
+ [] )
267
+
268
+ save_button = gr.Button("Set and Save as original masks")
269
+ save_button.click( save_as_orig_mask,
270
+ [mask_np_list_updated, mask_label_list, input_folder] ,
271
+ [] )
272
+
273
+ back_button = gr.Button("Back to current seg")
274
+ back_button.click( load_mask_ui,
275
+ [input_folder] ,
276
+ [ mask_np_list_updated,mask_label_list] )
277
+
278
+ add_mask_button = gr.Button("Add new empty mask")
279
+ add_mask_button.click(add_mask,
280
+ [mask_np_list_updated, mask_label_list] ,
281
+ [mask_np_list_updated, mask_label_list] )
282
+
283
+ with gr.Tab(label="2 Optimization"):
284
+ with gr.Row():
285
+ with gr.Column():
286
+ canvas_opt = gr.Image(value = canvas.value, type="pil", tool="sketch", label="Loaded Image", show_label=True, height=LENGTH, width=LENGTH, interactive=True)
287
+
288
+ with gr.Column():
289
+ gr.Markdown("""<p style="text-align: center; font-size: 20px">Optimization settings (SD)</p>""")
290
+ num_tokens = gr.Textbox(value="5", label="num tokens to represent each object", interactive= True)
291
+ embedding_learning_rate = gr.Textbox(value="1e-4", label="Embedding optimization: Learning rate", interactive= True )
292
+ max_emb_train_steps = gr.Textbox(value="500", label="embedding optimization: Training steps", interactive= True )
293
+
294
+ diffusion_model_learning_rate = gr.Textbox(value="5e-5", label="UNet Optimization: Learning rate", interactive= True )
295
+ max_diffusion_train_steps = gr.Textbox(value="500", label="UNet Optimization: Learning rate: Training steps", interactive= True )
296
+
297
+ train_batch_size = gr.Textbox(value="5", label="Batch size", interactive= True )
298
+ gradient_accumulation_steps=gr.Textbox(value="5", label="Gradient accumulation", interactive= True )
299
+
300
+ add_button = gr.Button("Run optimization")
301
+ add_button.click(run_optimization,
302
+ inputs = [
303
+ input_folder,
304
+ num_tokens,
305
+ embedding_learning_rate,
306
+ max_emb_train_steps,
307
+ diffusion_model_learning_rate,
308
+ max_diffusion_train_steps,
309
+ train_batch_size,gradient_accumulation_steps
310
+ ],
311
+ outputs = []
312
+ )
313
+
314
+
315
+ with gr.Tab(label="3 Editing"):
316
+ with gr.Tab(label="3.1 Text-based editing"):
317
+ canvas_text_edit = gr.State() # store mask
318
+ with gr.Row():
319
+ with gr.Column():
320
+ canvas_text_edit = gr.Image(value = None, label="Editing results", show_label=True, height=LENGTH, width=LENGTH)
321
+ # canvas_text_edit = gr.Gallery(label = "Edited results")
322
+
323
+ with gr.Column():
324
+ gr.Markdown("""<p style="text-align: center; font-size: 20px">Editing setting (SD)</p>""")
325
+
326
+ tgt_prompt = gr.Textbox(value="Dog", label="Editing: Text prompt", interactive= True )
327
+ tgt_idx = gr.Textbox(value="0", label="Editing: Object index", interactive= True )
328
+ guidance_scale = gr.Textbox(value="6", label="Editing: CFG guidance scale", interactive= True )
329
+ num_sampling_steps = gr.Textbox(value="50", label="Editing: Sampling steps", interactive= True )
330
+ edge_thickness = gr.Textbox(value="10", label="Editing: Edge thickness", interactive= True )
331
+ strength = gr.Textbox(value="0.5", label="Editing: Mask strength", interactive= True )
332
+
333
+ add_button = gr.Button("Run Editing")
334
+ add_button.click(run_edit_text,
335
+ inputs = [
336
+ input_folder,
337
+ num_tokens,
338
+ num_sampling_steps,
339
+ strength,
340
+ edge_thickness,
341
+ tgt_prompt,
342
+ tgt_idx,
343
+ guidance_scale
344
+ ],
345
+ outputs = [canvas_text_edit]
346
+ )
347
+
348
+
349
+ demo.queue().launch(share=True, debug=True)
assets/demo1.gif ADDED
assets/demo2.gif ADDED
assets/demo3.gif ADDED
assets/demo4.gif ADDED
assets/mask_def.png ADDED
controller.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import math
4
+ import xformers
5
+
6
+ class DummyController:
7
+ def __call__(self, *args):
8
+ return args[0]
9
+ def __init__(self):
10
+ self.num_att_layers = 0
11
+
12
+ class GroupedCAController:
13
+ def __init__(self, mask_list = None):
14
+ self.mask_list = mask_list
15
+ if self.mask_list is None:
16
+ self.is_decom = False
17
+ else:
18
+ self.is_decom = True
19
+
20
+ def mask_img_to_mask_vec(self, mask, length):
21
+ mask_vec = torch.nn.functional.interpolate(mask.unsqueeze(0).unsqueeze(0), (length, length)).squeeze()
22
+ mask_vec = mask_vec.flatten()
23
+ return mask_vec
24
+
25
+ def ca_forward_decom(self, q, k_list, v_list, scale, place_in_unet):
26
+ # attn [Bh, N, d ]
27
+ # [8, 4096, 77]
28
+ # q [Bh, N, d] [8, 4096, 40] [8, 1024, 80] [8, 256,160] [8, 64, 160]
29
+ # k [Bh, P, d] [8, 77 , 40] [8, 77, 80] [8, 77, 160] [8, 77, 160]
30
+ # v [Bh, P, d] [8, 77 , 40] [8, 77, 80] [8, 77, 160] [8, 77, 160]
31
+ N = q.shape[1]
32
+ mask_vec_list = []
33
+ for mask in self.mask_list:
34
+ mask_vec = self.mask_img_to_mask_vec(mask, int(math.sqrt(N))) # [1,N,1]
35
+ mask_vec = mask_vec.unsqueeze(0).unsqueeze(-1)
36
+ mask_vec_list.append(mask_vec)
37
+ out = 0
38
+ for mask_vec, k, v in zip(mask_vec_list, k_list, v_list):
39
+ sim = torch.einsum("b i d, b j d -> b i j", q, k) * scale # [8, 4096, 20]
40
+ attn = sim.softmax(dim=-1) # [Bh,N,P] [8,4096,20]
41
+ attn = attn.masked_fill(mask_vec==0, 0)
42
+ masked_out = torch.einsum("b i j, b j d -> b i d", attn, v) # [Bh,N,d] [8,4096,320/h]
43
+ # mask_vec_inf = torch.where(mask_vec>0, 0, torch.finfo(k.dtype).min)
44
+ # masked_out1 = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask_vec_inf, op=None, scale=scale)
45
+ out += masked_out
46
+ return out
47
+
48
+ def reshape_heads_to_batch_dim(self):
49
+ def func(tensor):
50
+ batch_size, seq_len, dim = tensor.shape
51
+ head_size = self.num_heads
52
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
53
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
54
+ return func
55
+
56
+ def reshape_batch_dim_to_heads(self):
57
+ def func(tensor):
58
+ batch_size, seq_len, dim = tensor.shape
59
+ head_size = self.num_heads
60
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
61
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
62
+ return func
63
+
64
+ def register_attention_disentangled_control(unet, controller):
65
+ def ca_forward(self, place_in_unet):
66
+ to_out = self.to_out
67
+ if type(to_out) is torch.nn.modules.container.ModuleList:
68
+ to_out = self.to_out[0]
69
+ else:
70
+ to_out = self.to_out
71
+ def forward(x, encoder_hidden_states =None, attention_mask=None):
72
+ if isinstance(controller, DummyController): # SA CA full
73
+ q = self.to_q(x)
74
+ is_cross = encoder_hidden_states is not None
75
+ encoder_hidden_states = encoder_hidden_states if is_cross else x
76
+ k = self.to_k(encoder_hidden_states)
77
+ v = self.to_v(encoder_hidden_states)
78
+ q = self.head_to_batch_dim(q)
79
+ k = self.head_to_batch_dim(k)
80
+ v = self.head_to_batch_dim(v)
81
+
82
+ # sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
83
+ # attn = sim.softmax(dim=-1)
84
+ # attn = controller(attn, is_cross, place_in_unet)
85
+ # out = torch.einsum("b i j, b j d -> b i d", attn, v)
86
+ out = xformers.ops.memory_efficient_attention(
87
+ q, k, v, attn_bias=None, op=None, scale=self.scale
88
+ )
89
+ out = self.batch_to_head_dim(out)
90
+ else: # decom: CA+SA
91
+ is_cross = encoder_hidden_states is not None
92
+ assert is_cross is not None
93
+ encoder_hidden_states_list = encoder_hidden_states if is_cross else x
94
+ q = self.to_q(x)
95
+ q = self.head_to_batch_dim(q) # [Bh, 4096, 320/h ] h: 8
96
+ if is_cross: #CA
97
+ k_list = []
98
+ v_list = []
99
+ assert type(encoder_hidden_states_list) is list
100
+ for encoder_hidden_states in encoder_hidden_states_list:
101
+ k = self.to_k(encoder_hidden_states)
102
+ k = self.head_to_batch_dim(k) # [Bh, 77, 320/h ]
103
+ k_list.append(k)
104
+ v = self.to_v(encoder_hidden_states)
105
+ v = self.head_to_batch_dim(v) # [Bh, 77, 320/h ]
106
+ v_list.append(v)
107
+ out = controller.ca_forward_decom(q, k_list, v_list, self.scale, place_in_unet) # [Bh,N,d]
108
+ out = self.batch_to_head_dim(out)
109
+ else: # SA
110
+ exit("decomposing SA!")
111
+ k = self.to_k(x)
112
+ v = self.to_v(x)
113
+ k = self.head_to_batch_dim(k) # [Bh, 77, 320/h ]
114
+ v = self.head_to_batch_dim(v) # [Bh, 77, 320/h ]
115
+ import pdb; pdb.set_trace()
116
+ if k.shape[1] <= 1024 ** 2:
117
+ out = controller.sa_forward(q, k, v, self.scale, place_in_unet) # [Bh,N,d]
118
+ else:
119
+ print("warining")
120
+ out = controller.sa_forward_decom(q, k, v, self.scale, place_in_unet) # [Bh,N,d]
121
+ # sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
122
+ # attn = sim.softmax(dim=-1) # [8,4096,4096] [Bh,N,N]
123
+ # out = torch.einsum("b i j, b j d -> b i d", attn, v) # [Bh,N,d] [8,4096,320/h]
124
+
125
+ out = self.batch_to_head_dim(out) # [B, H, N, D]
126
+
127
+ return to_out(out)
128
+
129
+ return forward
130
+
131
+ if controller is None:
132
+ controller = DummyController()
133
+
134
+ def register_recr(net_, count, place_in_unet):
135
+ if net_.__class__.__name__ == 'Attention' and net_.to_k.in_features == unet.ca_dim:
136
+ net_.forward = ca_forward(net_, place_in_unet)
137
+ return count + 1
138
+ elif hasattr(net_, 'children'):
139
+ for net__ in net_.children():
140
+ count = register_recr(net__, count, place_in_unet)
141
+ return count
142
+
143
+ cross_att_count = 0
144
+ sub_nets = unet.named_children()
145
+
146
+ for net in sub_nets:
147
+ if "down" in net[0]:
148
+ down_count = register_recr(net[1], 0, "down")#6
149
+ cross_att_count += down_count
150
+ elif "up" in net[0]:
151
+ up_count = register_recr(net[1], 0, "up") #9
152
+ cross_att_count += up_count
153
+ elif "mid" in net[0]:
154
+ mid_count = register_recr(net[1], 0, "mid") #1
155
+ cross_att_count += mid_count
156
+ controller.num_att_layers = cross_att_count
example2/img.png ADDED
main.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import argparse
5
+ from peft import LoraConfig
6
+ from pipeline_dedit_sdxl import DEditSDXLPipeline
7
+ from pipeline_dedit_sd import DEditSDPipeline
8
+ from utils import load_image, load_mask, load_mask_edit
9
+ from utils_mask import process_mask_move_torch, process_mask_remove_torch, mask_union_torch, mask_substract_torch, create_outer_edge_mask_torch
10
+ from utils_mask import check_mask_overlap_torch, check_cover_all_torch, visualize_mask_list, get_mask_difference_torch, save_mask_list_to_npys
11
+
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--name", type=str,required=True, default=None)
14
+ parser.add_argument("--name_2", type=str,required=False, default=None)
15
+ parser.add_argument("--dpm", type=str,required=True, default="sd")
16
+ parser.add_argument("--resolution", type=int, default=1024)
17
+ parser.add_argument("--seed", type=int, default=42)
18
+ parser.add_argument("--embedding_learning_rate", type=float, default=1e-4)
19
+ parser.add_argument("--max_emb_train_steps", type=int, default=200)
20
+ parser.add_argument("--diffusion_model_learning_rate", type=float, default=5e-5)
21
+ parser.add_argument("--max_diffusion_train_steps", type=int, default=200)
22
+ parser.add_argument("--train_batch_size", type=int, default=1)
23
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
24
+ parser.add_argument("--num_tokens", type=int, default=1)
25
+
26
+
27
+ parser.add_argument("--load_trained", default=False, action="store_true" )
28
+ parser.add_argument("--num_sampling_steps", type=int, default=50)
29
+ parser.add_argument("--guidance_scale", type=float, default = 3 )
30
+ parser.add_argument("--strength", type=float, default=0.8)
31
+
32
+ parser.add_argument("--train_full_lora", default=False, action="store_true" )
33
+ parser.add_argument("--lora_rank", type=int, default=4)
34
+ parser.add_argument("--lora_alpha", type=int, default=4)
35
+
36
+ parser.add_argument("--prompt_auxin_list", nargs="+", type=str, default = None)
37
+ parser.add_argument("--prompt_auxin_idx_list", nargs="+", type=int, default = None)
38
+
39
+ # general editing configs
40
+ parser.add_argument("--load_edited_mask", default=False, action="store_true")
41
+ parser.add_argument("--load_edited_processed_mask", default=False, action="store_true")
42
+ parser.add_argument("--edge_thickness", type=int, default=20)
43
+ parser.add_argument("--num_imgs", type=int, default = 1 )
44
+ parser.add_argument('--active_mask_list', nargs="+", type=int)
45
+ parser.add_argument("--tgt_index", type=int, default=None)
46
+
47
+ # recon
48
+ parser.add_argument("--recon", default=False, action="store_true" )
49
+ parser.add_argument("--recon_an_item", default=False, action="store_true" )
50
+ parser.add_argument("--recon_prompt", type=str, default=None)
51
+
52
+ # text-based editing
53
+ parser.add_argument("--text", default=False, action="store_true")
54
+ parser.add_argument("--tgt_prompt", type=str, default=None)
55
+
56
+ # image-based editing
57
+ parser.add_argument("--image", default=False, action="store_true" )
58
+ parser.add_argument("--src_index", type=int, default=None)
59
+ parser.add_argument("--tgt_name", type=str, default=None)
60
+
61
+ # mask-based move
62
+ parser.add_argument("--move_resize", default=False, action="store_true" )
63
+ parser.add_argument('--tgt_indices_list', nargs="+", type=int)
64
+ parser.add_argument("--delta_x_list", nargs="+", type=int)
65
+ parser.add_argument("--delta_y_list", nargs="+", type=int)
66
+ parser.add_argument("--priority_list", nargs="+", type=int)
67
+ parser.add_argument("--force_mask_remain", type=int, default=None)
68
+ parser.add_argument("--resize_list", nargs="+", type=float)
69
+
70
+ # remove
71
+ parser.add_argument("--remove", default=False, action="store_true" )
72
+ parser.add_argument("--load_edited_removemask", default=False, action="store_true")
73
+
74
+ args = parser.parse_args()
75
+
76
+ torch.cuda.manual_seed_all(args.seed)
77
+ torch.manual_seed(args.seed)
78
+ base_input_folder = "."
79
+ base_output_folder = "."
80
+
81
+ input_folder = os.path.join(base_input_folder, args.name)
82
+
83
+
84
+ mask_list, mask_label_list = load_mask(input_folder)
85
+ assert mask_list[0].shape[0] == args.resolution, "Segmentation should be done on size {}".format(args.resolution)
86
+ try:
87
+ image_gt = load_image(os.path.join(input_folder, "img_{}.png".format(args.resolution) ), size = args.resolution)
88
+ except:
89
+ image_gt = load_image(os.path.join(input_folder, "img_{}.jpg".format(args.resolution) ), size = args.resolution)
90
+
91
+ if args.image:
92
+ input_folder_2 = os.path.join(base_input_folder, args.name_2)
93
+ mask_list_2, mask_label_list_2 = load_mask(input_folder_2)
94
+ assert mask_list_2[0].shape[0] == args.resolution, "Segmentation should be done on size {}".format(args.resolution)
95
+ try:
96
+ image_gt_2 = load_image(os.path.join(input_folder_2, "img_{}.png".format(args.resolution) ), size = args.resolution)
97
+ except:
98
+ image_gt_2 = load_image(os.path.join(input_folder_2, "img_{}.jpg".format(args.resolution) ), size = args.resolution)
99
+ output_dir = os.path.join(base_output_folder, args.name + "_" + args.name_2)
100
+ os.makedirs(output_dir, exist_ok = True)
101
+ else:
102
+ output_dir = os.path.join(base_output_folder, args.name)
103
+ os.makedirs(output_dir, exist_ok = True)
104
+
105
+ if args.dpm == "sd":
106
+ if args.image:
107
+ pipe = DEditSDPipeline(mask_list, mask_label_list, mask_list_2, mask_label_list_2, resolution = args.resolution, num_tokens = args.num_tokens)
108
+ else:
109
+ pipe = DEditSDPipeline(mask_list, mask_label_list, resolution = args.resolution, num_tokens = args.num_tokens)
110
+
111
+ elif args.dpm == "sdxl":
112
+ if args.image:
113
+ pipe = DEditSDXLPipeline(mask_list, mask_label_list, mask_list_2, mask_label_list_2, resolution = args.resolution, num_tokens = args.num_tokens)
114
+ else:
115
+ pipe = DEditSDXLPipeline(mask_list, mask_label_list, resolution = args.resolution, num_tokens = args.num_tokens)
116
+
117
+ else:
118
+ raise NotImplementedError
119
+
120
+ set_string_list = pipe.set_string_list
121
+ if args.prompt_auxin_list is not None:
122
+ for auxin_idx, auxin_prompt in zip(args.prompt_auxin_idx_list, args.prompt_auxin_list):
123
+ set_string_list[auxin_idx] = auxin_prompt.replace("*", set_string_list[auxin_idx] )
124
+ print(set_string_list)
125
+
126
+ if args.image:
127
+ set_string_list_2 = pipe.set_string_list_2
128
+ print(set_string_list_2)
129
+
130
+ if args.load_trained:
131
+ unet_save_path = os.path.join(output_dir, "unet.pt")
132
+ unet_state_dict = torch.load(unet_save_path)
133
+ text_encoder1_save_path = os.path.join(output_dir, "text_encoder1.pt")
134
+ text_encoder1_state_dict = torch.load(text_encoder1_save_path)
135
+ if args.dpm == "sdxl":
136
+ text_encoder2_save_path = os.path.join(output_dir, "text_encoder2.pt")
137
+ text_encoder2_state_dict = torch.load(text_encoder2_save_path)
138
+
139
+ if 'lora' in ''.join(unet_state_dict.keys()):
140
+ unet_lora_config = LoraConfig(
141
+ r=args.lora_rank,
142
+ lora_alpha=args.lora_alpha,
143
+ init_lora_weights="gaussian",
144
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
145
+ )
146
+ pipe.unet.add_adapter(unet_lora_config)
147
+
148
+ pipe.unet.load_state_dict(unet_state_dict)
149
+ pipe.text_encoder.load_state_dict(text_encoder1_state_dict)
150
+ if args.dpm == "sdxl":
151
+ pipe.text_encoder_2.load_state_dict(text_encoder2_state_dict)
152
+ else:
153
+ if args.image:
154
+ pipe.mask_list = [m.cuda() for m in pipe.mask_list]
155
+ pipe.mask_list_2 = [m.cuda() for m in pipe.mask_list_2]
156
+ pipe.train_emb_2imgs(
157
+ image_gt,
158
+ image_gt_2,
159
+ set_string_list,
160
+ set_string_list_2,
161
+ gradient_accumulation_steps = args.gradient_accumulation_steps,
162
+ embedding_learning_rate = args.embedding_learning_rate,
163
+ max_emb_train_steps = args.max_emb_train_steps,
164
+ train_batch_size = args.train_batch_size,
165
+ )
166
+
167
+ pipe.train_model_2imgs(
168
+ image_gt,
169
+ image_gt_2,
170
+ set_string_list,
171
+ set_string_list_2,
172
+ gradient_accumulation_steps = args.gradient_accumulation_steps,
173
+ max_diffusion_train_steps = args.max_diffusion_train_steps,
174
+ diffusion_model_learning_rate = args.diffusion_model_learning_rate ,
175
+ train_batch_size =args.train_batch_size,
176
+ train_full_lora = args.train_full_lora,
177
+ lora_rank = args.lora_rank,
178
+ lora_alpha = args.lora_alpha
179
+ )
180
+
181
+ else:
182
+ pipe.mask_list = [m.cuda() for m in pipe.mask_list]
183
+ pipe.train_emb(
184
+ image_gt,
185
+ set_string_list,
186
+ gradient_accumulation_steps = args.gradient_accumulation_steps,
187
+ embedding_learning_rate = args.embedding_learning_rate,
188
+ max_emb_train_steps = args.max_emb_train_steps,
189
+ train_batch_size = args.train_batch_size,
190
+ )
191
+
192
+ pipe.train_model(
193
+ image_gt,
194
+ set_string_list,
195
+ gradient_accumulation_steps = args.gradient_accumulation_steps,
196
+ max_diffusion_train_steps = args.max_diffusion_train_steps,
197
+ diffusion_model_learning_rate = args.diffusion_model_learning_rate ,
198
+ train_batch_size = args.train_batch_size,
199
+ train_full_lora = args.train_full_lora,
200
+ lora_rank = args.lora_rank,
201
+ lora_alpha = args.lora_alpha
202
+ )
203
+
204
+
205
+ unet_save_path = os.path.join(output_dir, "unet.pt")
206
+ torch.save(pipe.unet.state_dict(),unet_save_path )
207
+ text_encoder1_save_path = os.path.join(output_dir, "text_encoder1.pt")
208
+ torch.save(pipe.text_encoder.state_dict(), text_encoder1_save_path)
209
+ if args.dpm == "sdxl":
210
+ text_encoder2_save_path = os.path.join(output_dir, "text_encoder2.pt")
211
+ torch.save(pipe.text_encoder_2.state_dict(), text_encoder2_save_path )
212
+
213
+
214
+ if args.recon:
215
+ output_dir = os.path.join(output_dir, "recon")
216
+ os.makedirs(output_dir, exist_ok = True)
217
+ if args.recon_an_item:
218
+ mask_list = [torch.from_numpy(np.ones_like(mask_list[0].numpy()))]
219
+ tgt_string = set_string_list[args.tgt_index]
220
+ tgt_string = args.recon_prompt.replace("*", tgt_string)
221
+ set_string_list = [tgt_string]
222
+ print(set_string_list)
223
+ save_path = os.path.join(output_dir, "out_recon.png")
224
+ x_np = pipe.inference_with_mask(
225
+ save_path,
226
+ guidance_scale = args.guidance_scale,
227
+ num_sampling_steps = args.num_sampling_steps,
228
+ seed = args.seed,
229
+ num_imgs = args.num_imgs,
230
+ set_string_list = set_string_list,
231
+ mask_list = mask_list
232
+ )
233
+
234
+ if args.text:
235
+ print("Text-guided editing ")
236
+ output_dir = os.path.join(output_dir, "text")
237
+ os.makedirs(output_dir, exist_ok = True)
238
+ save_path = os.path.join(output_dir, "out_text.png")
239
+ set_string_list[args.tgt_index] = args.tgt_prompt
240
+ mask_active = torch.zeros_like(mask_list[0])
241
+ mask_active = mask_union_torch(mask_active, mask_list[args.tgt_index])
242
+
243
+ if args.active_mask_list is not None:
244
+ for midx in args.active_mask_list:
245
+ mask_active = mask_union_torch(mask_active, mask_list[midx])
246
+
247
+ if args.load_edited_mask:
248
+ mask_list_edited, mask_label_list_edited = load_mask_edit(input_folder)
249
+ mask_diff = get_mask_difference_torch(mask_list_edited, mask_list)
250
+ mask_active = mask_union_torch(mask_active, mask_diff)
251
+ mask_list = mask_list_edited
252
+ save_path = os.path.join(output_dir, "out_textEdited.png")
253
+
254
+ mask_hard = mask_substract_torch(torch.ones_like(mask_list[0]), mask_active)
255
+ mask_soft = create_outer_edge_mask_torch(mask_active, edge_thickness = args.edge_thickness)
256
+ mask_hard = mask_substract_torch(mask_hard, mask_soft)
257
+
258
+ pipe.inference_with_mask(
259
+ save_path,
260
+ orig_image = image_gt,
261
+ set_string_list = set_string_list,
262
+ guidance_scale = args.guidance_scale,
263
+ strength = args.strength,
264
+ num_imgs = args.num_imgs,
265
+ mask_hard= mask_hard,
266
+ mask_soft = mask_soft,
267
+ mask_list = mask_list,
268
+ seed = args.seed,
269
+ num_sampling_steps = args.num_sampling_steps
270
+ )
271
+
272
+ if args.remove:
273
+ output_dir = os.path.join(output_dir, "remove")
274
+ save_path = os.path.join(output_dir, "out_remove.png")
275
+ os.makedirs(output_dir, exist_ok = True)
276
+ mask_active = torch.zeros_like(mask_list[0])
277
+
278
+ if args.load_edited_mask:
279
+ mask_list_edited, _ = load_mask_edit(input_folder)
280
+ mask_diff = get_mask_difference_torch(mask_list_edited, mask_list)
281
+ mask_active = mask_union_torch(mask_active, mask_diff)
282
+ mask_list = mask_list_edited
283
+
284
+ if args.load_edited_processed_mask:
285
+ # manually edit or draw masks after removing one index, then load
286
+ mask_list_processed, _ = load_mask_edit(output_dir)
287
+ mask_remain = get_mask_difference_torch(mask_list_processed, mask_list)
288
+ else:
289
+ # generate masks after removing one index, using nearest neighbor algorithm
290
+ mask_list_processed, mask_remain = process_mask_remove_torch(mask_list, args.tgt_index)
291
+ save_mask_list_to_npys(output_dir, mask_list_processed, mask_label_list, name = "mask")
292
+ visualize_mask_list(mask_list_processed, os.path.join(output_dir, "seg_removed.png"))
293
+ check_cover_all_torch(*mask_list_processed)
294
+ mask_active = mask_union_torch(mask_active, mask_remain)
295
+
296
+ if args.active_mask_list is not None:
297
+ for midx in args.active_mask_list:
298
+ mask_active = mask_union_torch(mask_active, mask_list[midx])
299
+
300
+ mask_hard = 1 - mask_active
301
+ mask_soft = create_outer_edge_mask_torch(mask_remain, edge_thickness = args.edge_thickness)
302
+ mask_hard = mask_substract_torch(mask_hard, mask_soft)
303
+
304
+ pipe.inference_with_mask(
305
+ save_path,
306
+ orig_image = image_gt,
307
+ guidance_scale = args.guidance_scale,
308
+ strength = args.strength,
309
+ num_imgs = args.num_imgs,
310
+ mask_hard= mask_hard,
311
+ mask_soft = mask_soft,
312
+ mask_list = mask_list_processed,
313
+ seed = args.seed,
314
+ num_sampling_steps = args.num_sampling_steps
315
+ )
316
+
317
+ if args.image:
318
+ output_dir = os.path.join(output_dir, "image")
319
+ save_path = os.path.join(output_dir, "out_image.png")
320
+ os.makedirs(output_dir, exist_ok = True)
321
+ mask_active = torch.zeros_like(mask_list[0])
322
+
323
+ if None not in (args.tgt_name, args.src_index, args.tgt_index):
324
+ if args.tgt_name == args.name:
325
+ set_string_list_tgt = set_string_list
326
+ set_string_list_src = set_string_list_2
327
+ image_tgt = image_gt
328
+ if args.load_edited_mask:
329
+ mask_list_edited, _ = load_mask_edit(input_folder)
330
+ mask_diff = get_mask_difference_torch(mask_list_edited, mask_list)
331
+ mask_active = mask_union_torch(mask_active, mask_diff)
332
+ mask_list = mask_list_edited
333
+ save_path = os.path.join(output_dir, "out_imageEdited.png")
334
+ mask_list_tgt = mask_list
335
+
336
+ elif args.tgt_name == args.name_2:
337
+ set_string_list_tgt = set_string_list_2
338
+ set_string_list_src = set_string_list
339
+ image_tgt = image_gt_2
340
+ if args.load_edited_mask:
341
+ mask_list_2_edited, _ = load_mask_edit(input_folder_2)
342
+ mask_diff = get_mask_difference_torch(mask_list_2_edited, mask_list_2)
343
+ mask_active = mask_union_torch(mask_active, mask_diff)
344
+ mask_list_2 = mask_list_2_edited
345
+ save_path = os.path.join(output_dir, "out_imageEdited.png")
346
+ mask_list_tgt = mask_list_2
347
+ else:
348
+ exit("tgt_name should be either name or name_2")
349
+
350
+ set_string_list_tgt[args.tgt_index] = set_string_list_src[args.src_index]
351
+
352
+ mask_active = mask_list_tgt[args.tgt_index]
353
+ mask_frozen = (1-mask_active.float()).to(mask_active.device)
354
+ mask_soft = create_outer_edge_mask_torch(mask_active.cpu(), edge_thickness = args.edge_thickness)
355
+ mask_hard = mask_substract_torch(mask_frozen.cpu(), mask_soft.cpu())
356
+
357
+ mask_list_tgt = [m.cuda() for m in mask_list_tgt]
358
+
359
+ pipe.inference_with_mask(
360
+ save_path,
361
+ set_string_list = set_string_list_tgt,
362
+ mask_list = mask_list_tgt,
363
+ guidance_scale = args.guidance_scale,
364
+ num_sampling_steps = args.num_sampling_steps,
365
+ mask_hard = mask_hard.cuda(),
366
+ mask_soft = mask_soft.cuda(),
367
+ num_imgs = args.num_imgs,
368
+ orig_image = image_tgt,
369
+ strength = args.strength,
370
+ )
371
+
372
+ if args.move_resize:
373
+ output_dir = os.path.join(output_dir, "move_resize")
374
+ os.makedirs(output_dir, exist_ok = True)
375
+ save_path = os.path.join(output_dir, "out_moveresize.png")
376
+ mask_active = torch.zeros_like(mask_list[0])
377
+
378
+ if args.load_edited_mask:
379
+ mask_list_edited, _ = load_mask_edit(input_folder)
380
+ mask_diff = get_mask_difference_torch(mask_list_edited, mask_list)
381
+ mask_active = mask_union_torch(mask_active, mask_diff)
382
+ mask_list = mask_list_edited
383
+ # save_path = os.path.join(output_dir, "out_moveresizeEdited.png")
384
+
385
+ if args.load_edited_processed_mask:
386
+ mask_list_processed, _ = load_mask_edit(output_dir)
387
+ mask_remain = get_mask_difference_torch(mask_list_processed, mask_list)
388
+ else:
389
+ mask_list_processed, mask_remain = process_mask_move_torch(
390
+ mask_list,
391
+ args.tgt_indices_list,
392
+ args.delta_x_list,
393
+ args.delta_y_list, args.priority_list,
394
+ force_mask_remain = args.force_mask_remain,
395
+ resize_list = args.resize_list
396
+ )
397
+ save_mask_list_to_npys(output_dir, mask_list_processed, mask_label_list, name = "mask")
398
+ visualize_mask_list(mask_list_processed, os.path.join(output_dir, "seg_move_resize.png"))
399
+ active_idxs = args.tgt_indices_list
400
+
401
+ mask_active = mask_union_torch(mask_active, *[m for midx, m in enumerate(mask_list_processed) if midx in active_idxs])
402
+ mask_active = mask_union_torch(mask_remain, mask_active)
403
+ if args.active_mask_list is not None:
404
+ for midx in args.active_mask_list:
405
+ mask_active = mask_union_torch(mask_active, mask_list_processed[midx])
406
+
407
+ mask_frozen =(1 - mask_active.float())
408
+ mask_soft = create_outer_edge_mask_torch(mask_active, edge_thickness = args.edge_thickness)
409
+ mask_hard = mask_substract_torch(mask_frozen, mask_soft)
410
+
411
+ check_mask_overlap_torch(mask_hard, mask_soft)
412
+
413
+ pipe.inference_with_mask(
414
+ save_path,
415
+ strength = args.strength,
416
+ orig_image = image_gt,
417
+ guidance_scale = args.guidance_scale,
418
+ num_sampling_steps = args.num_sampling_steps,
419
+ num_imgs = args.num_imgs,
420
+ mask_hard= mask_hard,
421
+ mask_soft = mask_soft,
422
+ mask_list = mask_list_processed,
423
+ seed = args.seed
424
+ )
pipeline_dedit_sd.py ADDED
@@ -0,0 +1,813 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from utils import import_model_class_from_model_name_or_path
3
+ from transformers import AutoTokenizer
4
+ from diffusers import (
5
+ AutoencoderKL,
6
+ DDPMScheduler,
7
+ DDIMScheduler,
8
+ UNet2DConditionModel,
9
+ )
10
+ from accelerate import Accelerator
11
+ from tqdm.auto import tqdm
12
+ from utils import sd_prepare_input_decom, save_images
13
+ import torch.nn.functional as F
14
+ import itertools
15
+ from peft import LoraConfig
16
+ from controller import GroupedCAController, register_attention_disentangled_control, DummyController
17
+ from utils import image2latent, latent2image
18
+ import matplotlib.pyplot as plt
19
+ from utils_mask import check_mask_overlap_torch
20
+
21
+ device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
22
+
23
+ class DEditSDPipeline:
24
+ def __init__(
25
+ self,
26
+ mask_list,
27
+ mask_label_list,
28
+ mask_list_2 = None,
29
+ mask_label_list_2 = None,
30
+ resolution = 1024,
31
+ num_tokens = 1
32
+ ):
33
+ super().__init__()
34
+ model_id = "./stable-diffusion-v1-5"
35
+ self.model_id = model_id
36
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer", use_fast=False)
37
+ text_encoder_cls_one = import_model_class_from_model_name_or_path(model_id, subfolder = "text_encoder")
38
+ self.text_encoder = text_encoder_cls_one.from_pretrained(model_id, subfolder="text_encoder" ).to(device)
39
+
40
+ self.unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
41
+ self.unet.ca_dim = 768
42
+ self.vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
43
+ self.scheduler = DDPMScheduler.from_pretrained(model_id , subfolder="scheduler")
44
+ self.scheduler = DDIMScheduler(
45
+ beta_start=0.00085,
46
+ beta_end=0.012,
47
+ beta_schedule="scaled_linear",
48
+ clip_sample=False,
49
+ set_alpha_to_one=True,
50
+ rescale_betas_zero_snr = False,
51
+ )
52
+ self.mixed_precision = "fp16"
53
+ self.resolution = resolution
54
+ self.num_tokens = num_tokens
55
+
56
+ self.mask_list = mask_list
57
+ self.mask_label_list = mask_label_list
58
+ notation_token_list = [phrase.split(" ")[-1] for phrase in mask_label_list]
59
+ placeholder_token_list = ["#"+word+"{}".format(widx) for widx, word in enumerate(notation_token_list)]
60
+ self.set_string_list, placeholder_token_ids = self.add_tokens(placeholder_token_list)
61
+ self.min_added_id = min(placeholder_token_ids)
62
+ self.max_added_id = max(placeholder_token_ids)
63
+
64
+ if mask_list_2 is not None:
65
+ self.mask_list_2 = mask_list_2
66
+ self.mask_label_list_2 = mask_label_list_2
67
+ notation_token_list_2 = [phrase.split(" ")[-1] for phrase in mask_label_list_2]
68
+
69
+ placeholder_token_list_2 = ["$"+word+"{}".format(widx) for widx, word in enumerate(notation_token_list_2)]
70
+ self.set_string_list_2, placeholder_token_ids_2 = self.add_tokens(placeholder_token_list_2)
71
+ self.max_added_id = max(placeholder_token_ids_2)
72
+
73
+ def add_tokens_text_encoder_random_init(self, placeholder_token, num_tokens=1):
74
+ # Add the placeholder token in tokenizer
75
+ placeholder_tokens = [placeholder_token]
76
+ # add dummy tokens for multi-vector
77
+ additional_tokens = []
78
+ for i in range(1, num_tokens):
79
+ additional_tokens.append(f"{placeholder_token}_{i}")
80
+ placeholder_tokens += additional_tokens
81
+ num_added_tokens = self.tokenizer.add_tokens(placeholder_tokens) # new ids start at 49408, the base CLIP vocab size
82
+
83
+ if num_added_tokens != num_tokens:
84
+ raise ValueError(
85
+ f"The tokenizer already contains the token {placeholder_token}. Please pass a different"
86
+ " `placeholder_token` that is not already in the tokenizer."
87
+ )
88
+ placeholder_token_ids = self.tokenizer.convert_tokens_to_ids(placeholder_tokens)
89
+
90
+ self.text_encoder.resize_token_embeddings(len(self.tokenizer))
91
+ token_embeds = self.text_encoder.get_input_embeddings().weight.data
92
+ std, mean = torch.std_mean(token_embeds)
93
+ with torch.no_grad():
94
+ for token_id in placeholder_token_ids:
95
+ token_embeds[token_id] = torch.randn_like(token_embeds[token_id])*std + mean
96
+
97
+ set_string = " ".join(self.tokenizer.convert_ids_to_tokens(placeholder_token_ids))
98
+
99
+ return set_string, placeholder_token_ids
100
+
101
+ def add_tokens(self, placeholder_token_list):
102
+ set_string_list = []
103
+ placeholder_token_ids_list = []
104
+ for str_idx in range(len(placeholder_token_list)):
105
+ placeholder_token = placeholder_token_list[str_idx]
106
+ set_string, placeholder_token_ids = self.add_tokens_text_encoder_random_init(placeholder_token, num_tokens=self.num_tokens)
107
+ set_string_list.append(set_string)
108
+ placeholder_token_ids_list.append(placeholder_token_ids)
109
+ placeholder_token_ids = list(itertools.chain(*placeholder_token_ids_list))
110
+ return set_string_list, placeholder_token_ids
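+ # Illustration (hypothetical labels, not from this repo): mask_label_list = ["a brown dog", "green grass"]
+ # yields placeholder tokens ["#dog0", "#grass1"]; with num_tokens=1 each set string is just that
+ # placeholder, and its embedding row is randomly initialized from the vocabulary statistics above.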
111
+
112
+ def train_emb(
113
+ self,
114
+ image_gt,
115
+ set_string_list,
116
+ gradient_accumulation_steps = 5,
117
+ embedding_learning_rate = 1e-4,
118
+ max_emb_train_steps = 100,
119
+ train_batch_size = 1,
120
+ ):
121
+ decom_controller = GroupedCAController(mask_list = self.mask_list)
122
+ register_attention_disentangled_control(self.unet, decom_controller)
123
+
124
+ accelerator = Accelerator(mixed_precision=self.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps)
125
+ self.vae.requires_grad_(False)
126
+ self.unet.requires_grad_(False)
127
+
128
+ self.text_encoder.requires_grad_(True)
129
+
130
+ self.text_encoder.text_model.encoder.requires_grad_(False)
131
+ self.text_encoder.text_model.final_layer_norm.requires_grad_(False)
132
+ self.text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
133
+
134
+ weight_dtype = torch.float32
135
+ if accelerator.mixed_precision == "fp16":
136
+ weight_dtype = torch.float16
137
+ elif accelerator.mixed_precision == "bf16":
138
+ weight_dtype = torch.bfloat16
139
+
140
+ self.unet.to(device, dtype=weight_dtype)
141
+ self.vae.to(device, dtype=weight_dtype)
142
+
143
+ trainable_embmat_list_1 = [param for param in self.text_encoder.get_input_embeddings().parameters()]
144
+ optimizer = torch.optim.AdamW(trainable_embmat_list_1, lr=embedding_learning_rate)
145
+
146
+ self.text_encoder, optimizer = accelerator.prepare(self.text_encoder, optimizer)
147
+
148
+ orig_embeds_params_1 = accelerator.unwrap_model(self.text_encoder).get_input_embeddings().weight.data.clone()
149
+
150
+ self.text_encoder.train()
151
+
152
+ effective_emb_train_steps = max_emb_train_steps//gradient_accumulation_steps
153
+
154
+ if accelerator.is_main_process:
155
+ accelerator.init_trackers("DEdit EmbSteps", config={
156
+ "embedding_learning_rate": embedding_learning_rate,
157
+ "text_embedding_optimization_steps": effective_emb_train_steps,
158
+ })
159
+ global_step = 0
160
+ noise_scheduler = self.scheduler
161
+ progress_bar = tqdm(range(0, effective_emb_train_steps), initial = global_step, desc="EmbSteps")
162
+ latents0 = image2latent(image_gt, vae = self.vae, dtype = weight_dtype)
163
+ latents0 = latents0.repeat(train_batch_size, 1, 1, 1)
164
+
165
+ for _ in range(max_emb_train_steps):
166
+ with accelerator.accumulate(self.text_encoder):
167
+ latents = latents0.clone().detach()
168
+ noise = torch.randn_like(latents)
169
+ bsz = latents.shape[0]
170
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
171
+ timesteps = timesteps.long()
172
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
173
+ encoder_hidden_states_list = sd_prepare_input_decom(
174
+ set_string_list,
175
+ self.tokenizer,
176
+ self.text_encoder,
177
+ length = 40,
178
+ bsz = train_batch_size,
179
+ weight_dtype = weight_dtype
180
+ )
181
+
182
+ model_pred = self.unet(
183
+ noisy_latents,
184
+ timesteps,
185
+ encoder_hidden_states = encoder_hidden_states_list,
186
+ ).sample
187
+
188
+ loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")
189
+ accelerator.backward(loss)
190
+ optimizer.step()
191
+ optimizer.zero_grad()
192
+
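+ # After each update, restore all original embedding rows so that only the newly added
+ # placeholder tokens (ids min_added_id..max_added_id) actually get trained.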
193
+ index_no_updates = torch.ones((len(self.tokenizer),), dtype=torch.bool)
194
+ index_no_updates[self.min_added_id : self.max_added_id + 1] = False
195
+ with torch.no_grad():
196
+ accelerator.unwrap_model(self.text_encoder).get_input_embeddings().weight[
197
+ index_no_updates] = orig_embeds_params_1[index_no_updates]
198
+
199
+ logs = {"loss": loss.detach().item(), "lr": embedding_learning_rate}
200
+ progress_bar.set_postfix(**logs)
201
+ accelerator.log(logs, step=global_step)
202
+ if accelerator.sync_gradients:
203
+ progress_bar.update(1)
204
+ global_step += 1
205
+
206
+ if global_step >= max_emb_train_steps:
207
+ break
208
+
209
+ accelerator.wait_for_everyone()
210
+ accelerator.end_training()
211
+ self.text_encoder = accelerator.unwrap_model(self.text_encoder).to(dtype = weight_dtype)
212
+
213
+ def train_model(
214
+ self,
215
+ image_gt,
216
+ set_string_list,
217
+ gradient_accumulation_steps = 5,
218
+ max_diffusion_train_steps = 100,
219
+ diffusion_model_learning_rate = 1e-5,
220
+ train_batch_size = 1,
221
+ train_full_lora = False,
222
+ lora_rank = 4,
223
+ lora_alpha = 4
224
+ ):
225
+ self.unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet").to(device)
226
+ self.unet.ca_dim = 768
227
+ decom_controller = GroupedCAController(mask_list = self.mask_list)
228
+ register_attention_disentangled_control(self.unet, decom_controller)
229
+
230
+ mixed_precision = "fp16"
231
+ accelerator = Accelerator(gradient_accumulation_steps = gradient_accumulation_steps, mixed_precision = mixed_precision)
232
+
233
+ weight_dtype = torch.float32
234
+ if accelerator.mixed_precision == "fp16":
235
+ weight_dtype = torch.float16
236
+ elif accelerator.mixed_precision == "bf16":
237
+ weight_dtype = torch.bfloat16
238
+
239
+ self.vae.requires_grad_(False)
240
+ self.vae.to(device, dtype=weight_dtype)
241
+
242
+ self.unet.requires_grad_(False)
243
+ self.unet.train()
244
+
245
+ self.text_encoder.requires_grad_(False)
246
+
247
+ if not train_full_lora:
248
+ trainable_params_list = []
249
+ for _, module in self.unet.named_modules():
250
+ module_name = type(module).__name__
251
+ if module_name == "Attention":
252
+ if module.to_k.in_features == self.unet.ca_dim: # this is cross attention:
253
+ module.to_k.weight.requires_grad = True
254
+ trainable_params_list.append(module.to_k.weight)
255
+ if module.to_k.bias is not None:
256
+ module.to_k.bias.requires_grad = True
257
+ trainable_params_list.append(module.to_k.bias)
258
+ module.to_v.weight.requires_grad = True
259
+ trainable_params_list.append(module.to_v.weight)
260
+ if module.to_v.bias is not None:
261
+ module.to_v.bias.requires_grad = True
262
+ trainable_params_list.append(module.to_v.bias)
263
+ module.to_q.weight.requires_grad = True
264
+ trainable_params_list.append(module.to_q.weight)
265
+ if module.to_q.bias is not None:
266
+ module.to_q.bias.requires_grad = True
267
+ trainable_params_list.append(module.to_q.bias)
268
+ else:
269
+ unet_lora_config = LoraConfig(
270
+ r=lora_rank,
271
+ lora_alpha=lora_alpha,
272
+ init_lora_weights="gaussian",
273
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
274
+ )
275
+ self.unet.add_adapter(unet_lora_config)
276
+ print("training full parameters using lora!")
277
+ trainable_params_list = list(filter(lambda p: p.requires_grad, self.unet.parameters()))
278
+
279
+ self.text_encoder.to(device, dtype=weight_dtype)
280
+
281
+ optimizer = torch.optim.AdamW(trainable_params_list, lr=diffusion_model_learning_rate)
282
+ self.unet, optimizer = accelerator.prepare(self.unet, optimizer)
283
+ psum2 = sum(p.numel() for p in trainable_params_list)
284
+
285
+ effective_diffusion_train_steps = max_diffusion_train_steps // gradient_accumulation_steps
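+ # e.g. max_diffusion_train_steps=100 with gradient_accumulation_steps=5 yields 20 optimizer
+ # updates; the progress bar below counts these effective steps.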
286
+ if accelerator.is_main_process:
287
+ accelerator.init_trackers("textual_inversion", config={
288
+ "diffusion_model_learning_rate": diffusion_model_learning_rate,
289
+ "diffusion_model_optimization_steps": effective_diffusion_train_steps,
290
+ })
291
+
292
+ global_step = 0
293
+ progress_bar = tqdm( range(0, effective_diffusion_train_steps),initial=global_step, desc="ModelSteps")
294
+
295
+ noise_scheduler = DDPMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0" , subfolder="scheduler")
296
+
297
+ latents0 = image2latent(image_gt, vae = self.vae, dtype=weight_dtype)
298
+ latents0 = latents0.repeat(train_batch_size, 1, 1, 1)
299
+
300
+ with torch.no_grad():
301
+ encoder_hidden_states_list = sd_prepare_input_decom(
302
+ set_string_list,
303
+ self.tokenizer,
304
+ self.text_encoder,
305
+ length = 40,
306
+ bsz = train_batch_size,
307
+ weight_dtype = weight_dtype
308
+ )
309
+
310
+ for _ in range(max_diffusion_train_steps):
311
+ with accelerator.accumulate(self.unet):
312
+ latents = latents0.clone().detach()
313
+ noise = torch.randn_like(latents)
314
+ bsz = latents.shape[0]
315
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
316
+ timesteps = timesteps.long()
317
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
318
+ model_pred = self.unet(
319
+ noisy_latents,
320
+ timesteps,
321
+ encoder_hidden_states=encoder_hidden_states_list,
322
+ ).sample
323
+ loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")
324
+ accelerator.backward(loss)
325
+ optimizer.step()
326
+ optimizer.zero_grad()
327
+
328
+ logs = {"loss": loss.detach().item(), "lr": diffusion_model_learning_rate}
329
+ progress_bar.set_postfix(**logs)
330
+ accelerator.log(logs, step=global_step)
331
+ if accelerator.sync_gradients:
332
+ progress_bar.update(1)
333
+ global_step += 1
334
+ if global_step >=max_diffusion_train_steps:
335
+ break
336
+ accelerator.wait_for_everyone()
337
+ accelerator.end_training()
338
+ self.unet = accelerator.unwrap_model(self.unet).to(dtype = weight_dtype)
339
+
340
+ def train_emb_2imgs(
341
+ self,
342
+ image_gt_1,
343
+ image_gt_2,
344
+ set_string_list_1,
345
+ set_string_list_2,
346
+ gradient_accumulation_steps = 5,
347
+ embedding_learning_rate = 1e-4,
348
+ max_emb_train_steps = 100,
349
+ train_batch_size = 1,
350
+ ):
351
+ decom_controller_1 = GroupedCAController(mask_list = self.mask_list)
352
+ decom_controller_2 = GroupedCAController(mask_list = self.mask_list_2)
353
+ accelerator = Accelerator(mixed_precision=self.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps)
354
+ self.vae.requires_grad_(False)
355
+ self.unet.requires_grad_(False)
356
+
357
+ self.text_encoder.requires_grad_(True)
358
+
359
+ self.text_encoder.text_model.encoder.requires_grad_(False)
360
+ self.text_encoder.text_model.final_layer_norm.requires_grad_(False)
361
+ self.text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
362
+
363
+
364
+ weight_dtype = torch.float32
365
+ if accelerator.mixed_precision == "fp16":
366
+ weight_dtype = torch.float16
367
+ elif accelerator.mixed_precision == "bf16":
368
+ weight_dtype = torch.bfloat16
369
+
370
+ self.unet.to(device, dtype=weight_dtype)
371
+ self.vae.to(device, dtype=weight_dtype)
372
+
373
+
374
+ trainable_embmat_list_1 = [param for param in self.text_encoder.get_input_embeddings().parameters()]
375
+
376
+ optimizer = torch.optim.AdamW(trainable_embmat_list_1, lr=embedding_learning_rate)
377
+ self.text_encoder, optimizer = accelerator.prepare(self.text_encoder, optimizer)
378
+ orig_embeds_params_1 = accelerator.unwrap_model(self.text_encoder).get_input_embeddings().weight.data.clone()
379
+
380
+ self.text_encoder.train()
381
+
382
+ effective_emb_train_steps = max_emb_train_steps//gradient_accumulation_steps
383
+
384
+ if accelerator.is_main_process:
385
+ accelerator.init_trackers("EmbFt", config={
386
+ "embedding_learning_rate": embedding_learning_rate,
387
+ "text_embedding_optimization_steps": effective_emb_train_steps,
388
+ })
389
+
390
+ global_step = 0
391
+
392
+ noise_scheduler = DDPMScheduler.from_pretrained(self.model_id , subfolder="scheduler")
393
+ progress_bar = tqdm(range(0, effective_emb_train_steps),initial=global_step,desc="EmbSteps")
394
+ latents0_1 = image2latent(image_gt_1, vae = self.vae, dtype=weight_dtype)
395
+ latents0_1 = latents0_1.repeat(train_batch_size,1,1,1)
396
+
397
+ latents0_2 = image2latent(image_gt_2, vae = self.vae, dtype=weight_dtype)
398
+ latents0_2 = latents0_2.repeat(train_batch_size,1,1,1)
399
+
400
+ for step in range(max_emb_train_steps):
401
+ with accelerator.accumulate(self.text_encoder):
402
+ latents_1 = latents0_1.clone().detach()
403
+ noise_1 = torch.randn_like(latents_1)
404
+
405
+ latents_2 = latents0_2.clone().detach()
406
+ noise_2 = torch.randn_like(latents_2)
407
+
408
+ bsz = latents_1.shape[0]
409
+
410
+ timesteps_1 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_1.device)
411
+ timesteps_1 = timesteps_1.long()
412
+ noisy_latents_1 = noise_scheduler.add_noise(latents_1, noise_1, timesteps_1)
413
+
414
+ timesteps_2 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_2.device)
415
+ timesteps_2 = timesteps_2.long()
416
+ noisy_latents_2 = noise_scheduler.add_noise(latents_2, noise_2, timesteps_2)
417
+
418
+ register_attention_disentangled_control(self.unet, decom_controller_1)
419
+ encoder_hidden_states_list_1 = sd_prepare_input_decom(
420
+ set_string_list_1,
421
+ self.tokenizer,
422
+ self.text_encoder,
423
+ length = 40,
424
+ bsz = train_batch_size,
425
+ weight_dtype = weight_dtype
426
+ )
427
+
428
+ model_pred_1 = self.unet(
429
+ noisy_latents_1,
430
+ timesteps_1,
431
+ encoder_hidden_states=encoder_hidden_states_list_1,
432
+ ).sample
433
+
434
+ register_attention_disentangled_control(self.unet, decom_controller_2)
435
+ # import pdb; pdb.set_trace()
436
+ encoder_hidden_states_list_2= sd_prepare_input_decom(
437
+ set_string_list_2,
438
+ self.tokenizer,
439
+ self.text_encoder,
440
+ length = 40,
441
+ bsz = train_batch_size,
442
+ weight_dtype = weight_dtype
443
+ )
444
+
445
+ model_pred_2 = self.unet(
446
+ noisy_latents_2,
447
+ timesteps_2,
448
+ encoder_hidden_states = encoder_hidden_states_list_2,
449
+ ).sample
450
+
451
+ loss_1 = F.mse_loss(model_pred_1.float(), noise_1.float(), reduction="mean") /2
452
+ loss_2 = F.mse_loss(model_pred_2.float(), noise_2.float(), reduction="mean") /2
453
+ loss = loss_1 + loss_2
454
+ accelerator.backward(loss)
455
+ optimizer.step()
456
+ optimizer.zero_grad()
457
+
458
+ index_no_updates = torch.ones((len(self.tokenizer),), dtype=torch.bool)
459
+ index_no_updates[self.min_added_id : self.max_added_id + 1] = False
460
+ with torch.no_grad():
461
+ accelerator.unwrap_model(self.text_encoder).get_input_embeddings().weight[
462
+ index_no_updates] = orig_embeds_params_1[index_no_updates]
463
+
464
+ logs = {"loss": loss.detach().item(), "lr": embedding_learning_rate}
465
+ progress_bar.set_postfix(**logs)
466
+ accelerator.log(logs, step=global_step)
467
+ if accelerator.sync_gradients:
468
+ progress_bar.update(1)
469
+ global_step += 1
470
+
471
+ if global_step >= max_emb_train_steps:
472
+ break
473
+ accelerator.wait_for_everyone()
474
+ accelerator.end_training()
475
+ self.text_encoder = accelerator.unwrap_model(self.text_encoder).to(dtype = weight_dtype)
476
+
477
+ def train_model_2imgs(
478
+ self,
479
+ image_gt_1,
480
+ image_gt_2,
481
+ set_string_list_1,
482
+ set_string_list_2,
483
+ gradient_accumulation_steps = 5,
484
+ max_diffusion_train_steps = 100,
485
+ diffusion_model_learning_rate = 1e-5,
486
+ train_batch_size = 1,
487
+ train_full_lora = False,
488
+ lora_rank = 4,
489
+ lora_alpha = 4
490
+ ):
491
+ self.unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet").to(device)
492
+ self.unet.ca_dim = 768
493
+ decom_controller_1 = GroupedCAController(mask_list = self.mask_list)
494
+ decom_controller_2 = GroupedCAController(mask_list = self.mask_list_2)
495
+
496
+ mixed_precision = "fp16"
497
+ accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps,mixed_precision=mixed_precision)
498
+
499
+ weight_dtype = torch.float32
500
+ if accelerator.mixed_precision == "fp16":
501
+ weight_dtype = torch.float16
502
+ elif accelerator.mixed_precision == "bf16":
503
+ weight_dtype = torch.bfloat16
504
+
505
+
506
+ self.vae.requires_grad_(False)
507
+ self.vae.to(device, dtype=weight_dtype)
508
+ self.unet.requires_grad_(False)
509
+ self.unet.train()
510
+
511
+ self.text_encoder.requires_grad_(False)
512
+
513
+ if not train_full_lora:
514
+ trainable_params_list = []
515
+ for name, module in self.unet.named_modules():
516
+ module_name = type(module).__name__
517
+ if module_name == "Attention":
518
+ if module.to_k.in_features == self.unet.ca_dim: # this is cross attention:
519
+ module.to_k.weight.requires_grad = True
520
+ trainable_params_list.append(module.to_k.weight)
521
+ if module.to_k.bias is not None:
522
+ module.to_k.bias.requires_grad = True
523
+ trainable_params_list.append(module.to_k.bias)
524
+
525
+ module.to_v.weight.requires_grad = True
526
+ trainable_params_list.append(module.to_v.weight)
527
+ if module.to_v.bias is not None:
528
+ module.to_v.bias.requires_grad = True
529
+ trainable_params_list.append(module.to_v.bias)
530
+ module.to_q.weight.requires_grad = True
531
+ trainable_params_list.append(module.to_q.weight)
532
+ if module.to_q.bias is not None:
533
+ module.to_q.bias.requires_grad = True
534
+ trainable_params_list.append(module.to_q.bias)
535
+ else:
536
+ unet_lora_config = LoraConfig(
537
+ r = lora_rank,
538
+ lora_alpha = lora_alpha,
539
+ init_lora_weights="gaussian",
540
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
541
+ )
542
+ self.unet.add_adapter(unet_lora_config)
543
+ print("training full parameters using lora!")
544
+ trainable_params_list = list(filter(lambda p: p.requires_grad, self.unet.parameters()))
545
+
546
+ self.text_encoder.to(device, dtype=weight_dtype)
547
+ optimizer = torch.optim.AdamW(trainable_params_list, lr=diffusion_model_learning_rate)
548
+ self.unet, optimizer = accelerator.prepare(self.unet, optimizer)
549
+ psum2 = sum(p.numel() for p in trainable_params_list)
550
+
551
+ effective_diffusion_train_steps = max_diffusion_train_steps // gradient_accumulation_steps
552
+ if accelerator.is_main_process:
553
+ accelerator.init_trackers("ModelFt", config={
554
+ "diffusion_model_learning_rate": diffusion_model_learning_rate,
555
+ "diffusion_model_optimization_steps": effective_diffusion_train_steps,
556
+ })
557
+
558
+ global_step = 0
559
+ progress_bar = tqdm(range(0, effective_diffusion_train_steps),initial=global_step, desc="ModelSteps")
560
+ noise_scheduler = DDPMScheduler.from_pretrained(self.model_id, subfolder="scheduler")
561
+
562
+ latents0_1 = image2latent(image_gt_1, vae = self.vae, dtype=weight_dtype)
563
+ latents0_1 = latents0_1.repeat(train_batch_size, 1, 1, 1)
564
+
565
+ latents0_2 = image2latent(image_gt_2, vae = self.vae, dtype=weight_dtype)
566
+ latents0_2 = latents0_2.repeat(train_batch_size,1, 1, 1)
567
+
568
+ with torch.no_grad():
569
+ encoder_hidden_states_list_1 = sd_prepare_input_decom(
570
+ set_string_list_1,
571
+ self.tokenizer,
572
+ self.text_encoder,
573
+ length = 40,
574
+ bsz = train_batch_size,
575
+ weight_dtype = weight_dtype
576
+ )
577
+ encoder_hidden_states_list_2 = sd_prepare_input_decom(
578
+ set_string_list_2,
579
+ self.tokenizer,
580
+ self.text_encoder,
581
+ length = 40,
582
+ bsz = train_batch_size,
583
+ weight_dtype = weight_dtype
584
+ )
585
+
586
+ for _ in range(max_diffusion_train_steps):
587
+ with accelerator.accumulate(self.unet):
588
+ latents_1 = latents0_1.clone().detach()
589
+ noise_1 = torch.randn_like(latents_1)
590
+ bsz = latents_1.shape[0]
591
+ timesteps_1 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_1.device)
592
+ timesteps_1 = timesteps_1.long()
593
+ noisy_latents_1 = noise_scheduler.add_noise(latents_1, noise_1, timesteps_1)
594
+
595
+ latents_2 = latents0_2.clone().detach()
596
+ noise_2 = torch.randn_like(latents_2)
597
+ bsz = latents_2.shape[0]
598
+ timesteps_2 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_2.device)
599
+ timesteps_2 = timesteps_2.long()
600
+ noisy_latents_2 = noise_scheduler.add_noise(latents_2, noise_2, timesteps_2)
601
+
602
+ register_attention_disentangled_control(self.unet, decom_controller_1)
603
+ model_pred_1 = self.unet(
604
+ noisy_latents_1,
605
+ timesteps_1,
606
+ encoder_hidden_states = encoder_hidden_states_list_1,
607
+ ).sample
608
+
609
+ register_attention_disentangled_control(self.unet, decom_controller_2)
610
+ model_pred_2 = self.unet(
611
+ noisy_latents_2,
612
+ timesteps_2,
613
+ encoder_hidden_states = encoder_hidden_states_list_2,
614
+ ).sample
615
+
616
+ loss_1 = F.mse_loss(model_pred_1.float(), noise_1.float(), reduction="mean")
617
+ loss_2 = F.mse_loss(model_pred_2.float(), noise_2.float(), reduction="mean")
618
+ loss = loss_1 + loss_2
619
+ accelerator.backward(loss)
620
+ optimizer.step()
621
+ optimizer.zero_grad()
622
+
623
+
624
+ logs = {"loss": loss.detach().item(), "lr": diffusion_model_learning_rate}
625
+ progress_bar.set_postfix(**logs)
626
+ accelerator.log(logs, step=global_step)
627
+ if accelerator.sync_gradients:
628
+ progress_bar.update(1)
629
+ global_step += 1
630
+
631
+ if global_step >=max_diffusion_train_steps:
632
+ break
633
+ accelerator.wait_for_everyone()
634
+ accelerator.end_training()
635
+ self.unet = accelerator.unwrap_model(self.unet).to(dtype = weight_dtype)
636
+
637
+ @torch.no_grad()
638
+ def backward_zT_to_z0_euler_decom(
639
+ self,
640
+ zT,
641
+ cond_emb_list,
642
+ uncond_emb=None,
643
+ guidance_scale = 1,
644
+ num_sampling_steps = 20,
645
+ cond_controller = None,
646
+ uncond_controller = None,
647
+ mask_hard = None,
648
+ mask_soft = None,
649
+ orig_image = None,
650
+ return_intermediate = False,
651
+ strength = 1
652
+ ):
653
+ latent_cur = zT
654
+ if uncond_emb is None:
655
+ uncond_emb = torch.zeros(zT.shape[0], 77, self.unet.ca_dim).to(dtype = zT.dtype, device = zT.device)
656
+
657
+ if mask_soft is not None:
658
+ init_latents_orig = image2latent(orig_image, self.vae, dtype=self.vae.dtype)
659
+ length = init_latents_orig.shape[-1]
660
+ noise = torch.randn_like(init_latents_orig)
661
+ mask_soft = torch.nn.functional.interpolate(mask_soft.float().unsqueeze(0).unsqueeze(0), (length, length)).to(self.vae.dtype)
662
+
663
+ if mask_hard is not None:
664
+ init_latents_orig = image2latent(orig_image, self.vae, dtype=self.vae.dtype)
665
+ length = init_latents_orig.shape[-1]
666
+ noise = torch.randn_like(init_latents_orig)
667
+ mask_hard = torch.nn.functional.interpolate(mask_hard.float().unsqueeze(0).unsqueeze(0), (length, length)).to(self.vae.dtype)
668
+
669
+ intermediate_list = [latent_cur.detach()]
670
+ for i in tqdm(range(num_sampling_steps)):
671
+ t = self.scheduler.timesteps[i]
672
+ latent_input = self.scheduler.scale_model_input(latent_cur, t)
673
+
674
+ register_attention_disentangled_control(self.unet, uncond_controller)
675
+ noise_pred_uncond = self.unet(
676
+ latent_input,
677
+ t,
678
+ encoder_hidden_states=uncond_emb,
679
+ ).sample
680
+
681
+ register_attention_disentangled_control(self.unet, cond_controller)
682
+ noise_pred_cond = self.unet(
683
+ latent_input,
684
+ t,
685
+ encoder_hidden_states=cond_emb_list,
686
+ ).sample
687
+
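+ # Classifier-free guidance: move the prediction away from the unconditional branch by
+ # guidance_scale; guidance_scale = 1 reduces to the purely conditional prediction.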
688
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
689
+ latent_cur = self.scheduler.step(noise_pred, t, latent_cur, generator = None, return_dict=False)[0]
690
+
691
+ if return_intermediate is True:
692
+ intermediate_list.append(latent_cur)
693
+
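+ # Inpainting-style blending: where the combined mask is 1, the latent is replaced by a
+ # re-noised copy of the original image, so those regions are preserved; mask_soft is only
+ # enforced for the first strength * num_sampling_steps steps, while mask_hard is kept throughout.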
694
+ if mask_hard is not None and mask_soft is not None and i <= strength *num_sampling_steps:
695
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
696
+ mask = mask_soft.to(latent_cur.device, latent_cur.dtype) + mask_hard.to(latent_cur.device, latent_cur.dtype)
697
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
698
+
699
+ elif mask_hard is not None and mask_soft is not None and i > strength *num_sampling_steps:
700
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
701
+ mask = mask_hard.to(latent_cur.device, latent_cur.dtype)
702
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
703
+
704
+ elif mask_hard is None and mask_soft is not None and i <= strength *num_sampling_steps:
705
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
706
+ mask = mask_soft.to(latent_cur.device, latent_cur.dtype)
707
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
708
+
709
+ elif mask_hard is None and mask_soft is not None and i > strength *num_sampling_steps:
710
+ pass
711
+
712
+ elif mask_hard is not None and mask_soft is None:
713
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
714
+ mask = mask_hard.to(latent_cur.dtype)
715
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
716
+
717
+ else: # hard and soft are both none
718
+ pass
719
+
720
+ if return_intermediate is True:
721
+ return latent_cur, intermediate_list
722
+ else:
723
+ return latent_cur
724
+
725
+ @torch.no_grad()
726
+ def sampling(
727
+ self,
728
+ set_string_list,
729
+ cond_controller = None,
730
+ uncond_controller = None,
731
+ guidance_scale = 7,
732
+ num_sampling_steps = 20,
733
+ mask_hard = None,
734
+ mask_soft = None,
735
+ orig_image = None,
736
+ strength = 1.,
737
+ num_imgs = 1,
738
+ normal_token_id_list = [],
739
+ seed = 1
740
+ ):
741
+ weight_dtype = torch.float16
742
+ self.scheduler.set_timesteps(num_sampling_steps)
743
+ self.unet.to(device, dtype=weight_dtype)
744
+ self.vae.to(device, dtype=weight_dtype)
745
+ self.text_encoder.to(device, dtype=weight_dtype)
746
+
747
+ torch.manual_seed(seed)
748
+ torch.cuda.manual_seed(seed)
749
+
750
+ vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
751
+ zT = torch.randn(num_imgs, 4, self.resolution//vae_scale_factor,self.resolution//vae_scale_factor).to(device,dtype=weight_dtype)
752
+ zT = zT * self.scheduler.init_noise_sigma
753
+
754
+ cond_emb_list = sd_prepare_input_decom(
755
+ set_string_list,
756
+ self.tokenizer,
757
+ self.text_encoder,
758
+ length = 40,
759
+ bsz = num_imgs,
760
+ weight_dtype = weight_dtype,
761
+ normal_token_id_list = normal_token_id_list
762
+ )
763
+
764
+ z0 = self.backward_zT_to_z0_euler_decom(zT, cond_emb_list,
765
+ guidance_scale = guidance_scale, num_sampling_steps = num_sampling_steps,
766
+ cond_controller = cond_controller, uncond_controller = uncond_controller,
767
+ mask_hard = mask_hard, mask_soft = mask_soft, orig_image = orig_image, strength = strength
768
+ )
769
+ x0 = latent2image(z0, vae = self.vae)
770
+ return x0
771
+
772
+ @torch.no_grad()
773
+ def inference_with_mask(
774
+ self,
775
+ save_path,
776
+ guidance_scale = 3,
777
+ num_sampling_steps = 50,
778
+ strength = 1,
779
+ mask_soft = None,
780
+ mask_hard= None,
781
+ orig_image=None,
782
+ mask_list = None,
783
+ num_imgs = 1,
784
+ seed = 1,
785
+ set_string_list = None
786
+ ):
787
+ if mask_list is not None:
788
+ mask_list = [m.to(device) for m in mask_list]
789
+ else:
790
+ mask_list = self.mask_list
791
+ if set_string_list is not None:
792
+ self.set_string_list = set_string_list
793
+
794
+ if mask_hard is not None and mask_soft is not None:
795
+ check_mask_overlap_torch(mask_hard, mask_soft)
796
+ null_controller = DummyController()
797
+ decom_controller = GroupedCAController(mask_list = mask_list)
798
+
799
+ x0 = self.sampling(
800
+ self.set_string_list,
801
+ guidance_scale = guidance_scale,
802
+ num_sampling_steps = num_sampling_steps,
803
+ strength = strength,
804
+ cond_controller = decom_controller,
805
+ uncond_controller = null_controller,
806
+ mask_soft = mask_soft,
807
+ mask_hard = mask_hard,
808
+ orig_image = orig_image,
809
+ num_imgs = num_imgs,
810
+ seed = seed
811
+ )
812
+ save_images(x0, save_path)
813
+ return x0
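A minimal usage sketch for the pipeline above (the module name pipeline_dedit_sd, the masks, labels, image path, and step counts are hypothetical placeholders, not values taken from this repo):

import torch
from PIL import Image
from pipeline_dedit_sd import DEditSDPipeline  # assumed module name for the file above

# Hypothetical inputs: two non-overlapping binary masks plus one text label per mask.
h = w = 512
top = torch.zeros(h, w)
top[: h // 2] = 1
mask_list = [top, 1 - top]
mask_label_list = ["a brown dog", "green grass"]
image_gt = Image.open("example.png").convert("RGB").resize((w, h))

pipe = DEditSDPipeline(mask_list, mask_label_list, resolution=512)

# Stage 1: optimize the placeholder-token embeddings; Stage 2: finetune the UNet cross-attention.
pipe.train_emb(image_gt, pipe.set_string_list, max_emb_train_steps=200)
pipe.train_model(image_gt, pipe.set_string_list, max_diffusion_train_steps=200)

# Regenerate the image from the learned mask/token decomposition.
pipe.inference_with_mask("out.png", guidance_scale=3, num_sampling_steps=50)

Whether the mask tensors need to be pre-moved to the GPU or resized to the latent resolution depends on GroupedCAController (defined in controller.py), which is not shown here.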
pipeline_dedit_sdxl.py ADDED
@@ -0,0 +1,875 @@
1
+ import torch
2
+ from utils import import_model_class_from_model_name_or_path
3
+ from transformers import AutoTokenizer
4
+ from diffusers import (
5
+ AutoencoderKL,
6
+ DDPMScheduler,
7
+ StableDiffusionXLPipeline,
8
+ UNet2DConditionModel,
9
+ )
10
+ from accelerate import Accelerator
11
+ from tqdm.auto import tqdm
12
+ from utils import sdxl_prepare_input_decom, save_images
13
+ import torch.nn.functional as F
14
+ import itertools
15
+ from peft import LoraConfig
16
+ from controller import GroupedCAController, register_attention_disentangled_control, DummyController
17
+ from utils import image2latent, latent2image
18
+ import matplotlib.pyplot as plt
19
+ from utils_mask import check_mask_overlap_torch
20
+
21
+ device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
22
+ max_length = 40
23
+ class DEditSDXLPipeline:
24
+ def __init__(
25
+ self,
26
+ mask_list,
27
+ mask_label_list,
28
+ mask_list_2 = None,
29
+ mask_label_list_2 = None,
30
+ resolution = 1024,
31
+ num_tokens = 1
32
+ ):
33
+ super().__init__()
34
+ model_id = "stabilityai/stable-diffusion-xl-base-1.0"
35
+ self.model_id = model_id
36
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer", use_fast=False)
37
+ self.tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2", use_fast=False)
38
+ text_encoder_cls_one = import_model_class_from_model_name_or_path(model_id, subfolder = "text_encoder")
39
+ text_encoder_cls_two = import_model_class_from_model_name_or_path(model_id, subfolder="text_encoder_2")
40
+ self.text_encoder = text_encoder_cls_one.from_pretrained(model_id, subfolder="text_encoder" ).to(device)
41
+ self.text_encoder_2 = text_encoder_cls_two.from_pretrained(model_id, subfolder="text_encoder_2").to(device)
42
+ self.unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet" )
43
+ self.unet.ca_dim = 2048
44
+ self.vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
45
+ self.scheduler = DDPMScheduler.from_pretrained(model_id , subfolder="scheduler")
46
+
47
+ self.mixed_precision = "fp16"
48
+ self.resolution = resolution
49
+ self.num_tokens = num_tokens
50
+
51
+ self.mask_list = mask_list
52
+ self.mask_label_list = mask_label_list
53
+ notation_token_list = [phrase.split(" ")[-1] for phrase in mask_label_list]
54
+ placeholder_token_list = ["#"+word+"{}".format(widx) for widx, word in enumerate(notation_token_list)]
55
+ self.set_string_list, placeholder_token_ids = self.add_tokens(placeholder_token_list)
56
+ self.min_added_id = min(placeholder_token_ids)
57
+ self.max_added_id = max(placeholder_token_ids)
58
+
59
+ if mask_list_2 is not None:
60
+ self.mask_list_2 = mask_list_2
61
+ self.mask_label_list_2 = mask_label_list_2
62
+ notation_token_list_2 = [phrase.split(" ")[-1] for phrase in mask_label_list_2]
63
+
64
+ placeholder_token_list_2 = ["$"+word+"{}".format(widx) for widx, word in enumerate(notation_token_list_2)]
65
+ self.set_string_list_2, placeholder_token_ids_2 = self.add_tokens(placeholder_token_list_2)
66
+ self.max_added_id = max(placeholder_token_ids_2)
67
+
68
+ def add_tokens_text_encoder_random_init(self, placeholder_token, num_tokens=1):
69
+ # Add the placeholder token in tokenizer
70
+ placeholder_tokens = [placeholder_token]
71
+ # add dummy tokens for multi-vector
72
+ additional_tokens = []
73
+ for i in range(1, num_tokens):
74
+ additional_tokens.append(f"{placeholder_token}_{i}")
75
+ placeholder_tokens += additional_tokens
76
+ num_added_tokens = self.tokenizer.add_tokens(placeholder_tokens)
77
+ num_added_tokens_2 = self.tokenizer_2.add_tokens(placeholder_tokens)
78
+
79
+ if num_added_tokens != num_tokens or num_added_tokens_2 != num_tokens:
80
+ raise ValueError(
81
+ f"The tokenizer already contains the token {placeholder_token}. Please pass a different"
82
+ " `placeholder_token` that is not already in the tokenizer."
83
+ )
84
+ placeholder_token_ids = self.tokenizer.convert_tokens_to_ids(placeholder_tokens)
85
+ placeholder_token_ids_2 = self.tokenizer_2.convert_tokens_to_ids(placeholder_tokens)
86
+ assert placeholder_token_ids == placeholder_token_ids_2, "Two text encoders are expected to have same vocabs"
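+ # SDXL conditions on two text encoders, so the placeholder tokens are added to both
+ # tokenizers and both embedding tables; the assert above relies on the two vocabularies
+ # assigning identical ids to the new tokens.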
87
+
88
+ self.text_encoder.resize_token_embeddings(len(self.tokenizer))
89
+ token_embeds = self.text_encoder.get_input_embeddings().weight.data
90
+ std, mean = torch.std_mean(token_embeds)
91
+ with torch.no_grad():
92
+ for token_id in placeholder_token_ids:
93
+ token_embeds[token_id] = torch.randn_like(token_embeds[token_id])*std + mean
94
+
95
+ self.text_encoder_2.resize_token_embeddings(len(self.tokenizer_2))
96
+ token_embeds = self.text_encoder_2.get_input_embeddings().weight.data
97
+ std, mean = torch.std_mean(token_embeds)
98
+ with torch.no_grad():
99
+ for token_id in placeholder_token_ids:
100
+ token_embeds[token_id] = torch.randn_like(token_embeds[token_id])*std + mean
101
+
102
+ set_string = " ".join(self.tokenizer.convert_ids_to_tokens(placeholder_token_ids))
103
+
104
+ return set_string, placeholder_token_ids
105
+
106
+ def add_tokens(self, placeholder_token_list):
107
+ set_string_list = []
108
+ placeholder_token_ids_list = []
109
+ for str_idx in range(len(placeholder_token_list)):
110
+ placeholder_token = placeholder_token_list[str_idx]
111
+ set_string, placeholder_token_ids = self.add_tokens_text_encoder_random_init(placeholder_token, num_tokens=self.num_tokens)
112
+ set_string_list.append(set_string)
113
+ placeholder_token_ids_list.append(placeholder_token_ids)
114
+ placeholder_token_ids = list(itertools.chain(*placeholder_token_ids_list))
115
+ return set_string_list, placeholder_token_ids
116
+
117
+ def train_emb(
118
+ self,
119
+ image_gt,
120
+ set_string_list,
121
+ gradient_accumulation_steps = 5,
122
+ embedding_learning_rate = 1e-4,
123
+ max_emb_train_steps = 100,
124
+ train_batch_size = 1,
125
+ train_full_lora = False
126
+ ):
127
+ decom_controller = GroupedCAController(mask_list = self.mask_list)
128
+ register_attention_disentangled_control(self.unet, decom_controller)
129
+
130
+ accelerator = Accelerator(mixed_precision=self.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps)
131
+ self.vae.requires_grad_(False)
132
+ self.unet.requires_grad_(False)
133
+
134
+ self.text_encoder.requires_grad_(True)
135
+ self.text_encoder_2.requires_grad_(True)
136
+
137
+ self.text_encoder.text_model.encoder.requires_grad_(False)
138
+ self.text_encoder.text_model.final_layer_norm.requires_grad_(False)
139
+ self.text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
140
+
141
+ self.text_encoder_2.text_model.encoder.requires_grad_(False)
142
+ self.text_encoder_2.text_model.final_layer_norm.requires_grad_(False)
143
+ self.text_encoder_2.text_model.embeddings.position_embedding.requires_grad_(False)
144
+
145
+ weight_dtype = torch.float32
146
+ if accelerator.mixed_precision == "fp16":
147
+ weight_dtype = torch.float16
148
+ elif accelerator.mixed_precision == "bf16":
149
+ weight_dtype = torch.bfloat16
150
+
151
+ self.unet.to(device, dtype=weight_dtype)
152
+ self.vae.to(device, dtype=weight_dtype)
153
+
154
+ trainable_embmat_list_1 = [param for param in self.text_encoder.get_input_embeddings().parameters()]
155
+ trainable_embmat_list_2 = [param for param in self.text_encoder_2.get_input_embeddings().parameters()]
156
+
157
+ optimizer = torch.optim.AdamW(trainable_embmat_list_1 + trainable_embmat_list_2, lr=embedding_learning_rate)
158
+
159
+ self.text_encoder, self.text_encoder_2, optimizer = accelerator.prepare(self.text_encoder, self.text_encoder_2, optimizer)
160
+
161
+ orig_embeds_params_1 = accelerator.unwrap_model(self.text_encoder) .get_input_embeddings().weight.data.clone()
162
+ orig_embeds_params_2 = accelerator.unwrap_model(self.text_encoder_2).get_input_embeddings().weight.data.clone()
163
+
164
+ self.text_encoder.train()
165
+ self.text_encoder_2.train()
166
+
167
+ effective_emb_train_steps = max_emb_train_steps//gradient_accumulation_steps
168
+
169
+ if accelerator.is_main_process:
170
+ accelerator.init_trackers("DEdit EmbSteps", config={
171
+ "embedding_learning_rate": embedding_learning_rate,
172
+ "text_embedding_optimization_steps": effective_emb_train_steps,
173
+ })
174
+ global_step = 0
175
+ noise_scheduler = DDPMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0" , subfolder="scheduler")
176
+ progress_bar = tqdm(range(0, effective_emb_train_steps), initial = global_step, desc="EmbSteps")
177
+ latents0 = image2latent(image_gt, vae = self.vae, dtype=weight_dtype)
178
+ latents0 = latents0.repeat(train_batch_size, 1, 1, 1)
179
+
180
+ for _ in range(max_emb_train_steps):
181
+ with accelerator.accumulate(self.text_encoder, self.text_encoder_2):
182
+ latents = latents0.clone().detach()
183
+ noise = torch.randn_like(latents)
184
+ bsz = latents.shape[0]
185
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
186
+ timesteps = timesteps.long()
187
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
188
+ encoder_hidden_states_list, add_text_embeds, add_time_ids = sdxl_prepare_input_decom(
189
+ set_string_list,
190
+ self.tokenizer,
191
+ self.tokenizer_2,
192
+ self.text_encoder,
193
+ self.text_encoder_2,
194
+ length = max_length,
195
+ bsz = train_batch_size,
196
+ weight_dtype = weight_dtype
197
+ )
198
+
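+ # SDXL additionally conditions on pooled text embeddings and size/crop "time ids",
+ # passed via added_cond_kwargs alongside the per-mask encoder hidden states.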
199
+ model_pred = self.unet(
200
+ noisy_latents,
201
+ timesteps,
202
+ encoder_hidden_states = encoder_hidden_states_list,
203
+ cross_attention_kwargs = None,
204
+ added_cond_kwargs={"text_embeds": add_text_embeds, "time_ids": add_time_ids},
205
+ return_dict=False
206
+ )[0]
207
+ loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")
208
+ accelerator.backward(loss)
209
+ optimizer.step()
210
+ optimizer.zero_grad()
211
+
212
+ index_no_updates = torch.ones((len(self.tokenizer),), dtype=torch.bool)
213
+ index_no_updates[self.min_added_id : self.max_added_id + 1] = False
214
+ with torch.no_grad():
215
+ accelerator.unwrap_model(self.text_encoder).get_input_embeddings().weight[
216
+ index_no_updates] = orig_embeds_params_1[index_no_updates]
217
+
218
+ index_no_updates = torch.ones((len(self.tokenizer_2),), dtype=torch.bool)
219
+ index_no_updates[self.min_added_id : self.max_added_id + 1] = False
220
+ with torch.no_grad():
221
+ accelerator.unwrap_model(self.text_encoder_2).get_input_embeddings().weight[
222
+ index_no_updates] = orig_embeds_params_2[index_no_updates]
223
+
224
+ logs = {"loss": loss.detach().item(), "lr": embedding_learning_rate}
225
+ progress_bar.set_postfix(**logs)
226
+ accelerator.log(logs, step=global_step)
227
+ if accelerator.sync_gradients:
228
+ progress_bar.update(1)
229
+ global_step += 1
230
+
231
+ if global_step >= max_emb_train_steps:
232
+ break
233
+ accelerator.wait_for_everyone()
234
+ accelerator.end_training()
235
+ self.text_encoder = accelerator.unwrap_model(self.text_encoder).to(dtype = weight_dtype)
236
+ self.text_encoder_2 = accelerator.unwrap_model(self.text_encoder_2).to(dtype = weight_dtype)
237
+
238
+ def train_model(
239
+ self,
240
+ image_gt,
241
+ set_string_list,
242
+ gradient_accumulation_steps = 5,
243
+ max_diffusion_train_steps = 100,
244
+ diffusion_model_learning_rate = 1e-5,
245
+ train_batch_size = 1,
246
+ train_full_lora = False,
247
+ lora_rank = 4,
248
+ lora_alpha = 4
249
+ ):
250
+ self.unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet").to(device)
251
+ self.unet.ca_dim = 2048
252
+ decom_controller = GroupedCAController(mask_list = self.mask_list)
253
+ register_attention_disentangled_control(self.unet, decom_controller)
254
+
255
+ mixed_precision = "fp16"
256
+ accelerator = Accelerator(gradient_accumulation_steps = gradient_accumulation_steps, mixed_precision = mixed_precision)
257
+
258
+ weight_dtype = torch.float32
259
+ if accelerator.mixed_precision == "fp16":
260
+ weight_dtype = torch.float16
261
+ elif accelerator.mixed_precision == "bf16":
262
+ weight_dtype = torch.bfloat16
263
+
264
+ self.vae.requires_grad_(False)
265
+ self.vae.to(device, dtype=weight_dtype)
266
+
267
+ self.unet.requires_grad_(False)
268
+ self.unet.train()
269
+
270
+ self.text_encoder.requires_grad_(False)
271
+ self.text_encoder_2.requires_grad_(False)
272
+
273
+ if not train_full_lora:
274
+ trainable_params_list = []
275
+ for _, module in self.unet.named_modules():
276
+ module_name = type(module).__name__
277
+ if module_name == "Attention":
278
+ if module.to_k.in_features == 2048: # this is cross attention:
279
+ module.to_k.weight.requires_grad = True
280
+ trainable_params_list.append(module.to_k.weight)
281
+ if module.to_k.bias is not None:
282
+ module.to_k.bias.requires_grad = True
283
+ trainable_params_list.append(module.to_k.bias)
284
+ module.to_v.weight.requires_grad = True
285
+ trainable_params_list.append(module.to_v.weight)
286
+ if module.to_v.bias is not None:
287
+ module.to_v.bias.requires_grad = True
288
+ trainable_params_list.append(module.to_v.bias)
289
+ module.to_q.weight.requires_grad = True
290
+ trainable_params_list.append(module.to_q.weight)
291
+ if module.to_q.bias is not None:
292
+ module.to_q.bias.requires_grad = True
293
+ trainable_params_list.append(module.to_q.bias)
294
+ else:
295
+ unet_lora_config = LoraConfig(
296
+ r=lora_rank,
297
+ lora_alpha=lora_alpha,
298
+ init_lora_weights="gaussian",
299
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
300
+ )
301
+ self.unet.add_adapter(unet_lora_config)
302
+ print("training full parameters using lora!")
303
+ trainable_params_list = list(filter(lambda p: p.requires_grad, self.unet.parameters()))
304
+
305
+ self.text_encoder.to(device, dtype=weight_dtype)
306
+ self.text_encoder_2.to(device, dtype=weight_dtype)
307
+ optimizer = torch.optim.AdamW(trainable_params_list, lr=diffusion_model_learning_rate)
308
+ self.unet, optimizer = accelerator.prepare(self.unet, optimizer)
309
+ psum2 = sum(p.numel() for p in trainable_params_list)
310
+
311
+ effective_diffusion_train_steps = max_diffusion_train_steps // gradient_accumulation_steps
312
+ if accelerator.is_main_process:
313
+ accelerator.init_trackers("textual_inversion", config={
314
+ "diffusion_model_learning_rate": diffusion_model_learning_rate,
315
+ "diffusion_model_optimization_steps": effective_diffusion_train_steps,
316
+ })
317
+
318
+ global_step = 0
319
+ progress_bar = tqdm( range(0, effective_diffusion_train_steps),initial=global_step, desc="ModelSteps")
320
+
321
+ noise_scheduler = DDPMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0" , subfolder="scheduler")
322
+
323
+ latents0 = image2latent(image_gt, vae = self.vae, dtype=weight_dtype)
324
+ latents0 = latents0.repeat(train_batch_size, 1, 1, 1)
325
+
326
+ with torch.no_grad():
327
+ encoder_hidden_states_list, add_text_embeds, add_time_ids = sdxl_prepare_input_decom(
328
+ set_string_list,
329
+ self.tokenizer,
330
+ self.tokenizer_2,
331
+ self.text_encoder,
332
+ self.text_encoder_2,
333
+ length = max_length,
334
+ bsz = train_batch_size,
335
+ weight_dtype = weight_dtype
336
+ )
337
+
338
+ for _ in range(max_diffusion_train_steps):
339
+ with accelerator.accumulate(self.unet):
340
+ latents = latents0.clone().detach()
341
+ noise = torch.randn_like(latents)
342
+ bsz = latents.shape[0]
343
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
344
+ timesteps = timesteps.long()
345
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
346
+ model_pred = self.unet(
347
+ noisy_latents,
348
+ timesteps,
349
+ encoder_hidden_states=encoder_hidden_states_list,
350
+ cross_attention_kwargs=None, return_dict=False,
351
+ added_cond_kwargs={"text_embeds": add_text_embeds, "time_ids": add_time_ids}
352
+ )[0]
353
+ loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")
354
+ accelerator.backward(loss)
355
+ optimizer.step()
356
+ optimizer.zero_grad()
357
+
358
+ logs = {"loss": loss.detach().item(), "lr": diffusion_model_learning_rate}
359
+ progress_bar.set_postfix(**logs)
360
+ accelerator.log(logs, step=global_step)
361
+ if accelerator.sync_gradients:
362
+ progress_bar.update(1)
363
+ global_step += 1
364
+ if global_step >=max_diffusion_train_steps:
365
+ break
366
+ accelerator.wait_for_everyone()
367
+ accelerator.end_training()
368
+ self.unet = accelerator.unwrap_model(self.unet).to(dtype = weight_dtype)
369
+
370
+ def train_emb_2imgs(
371
+ self,
372
+ image_gt_1,
373
+ image_gt_2,
374
+ set_string_list_1,
375
+ set_string_list_2,
376
+ gradient_accumulation_steps = 5,
377
+ embedding_learning_rate = 1e-4,
378
+ max_emb_train_steps = 100,
379
+ train_batch_size = 1,
380
+ train_full_lora = False
381
+ ):
382
+ decom_controller_1 = GroupedCAController(mask_list = self.mask_list)
383
+ decom_controller_2 = GroupedCAController(mask_list = self.mask_list_2)
384
+ accelerator = Accelerator(mixed_precision=self.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps)
385
+ self.vae.requires_grad_(False)
386
+ self.unet.requires_grad_(False)
387
+
388
+ self.text_encoder.requires_grad_(True)
389
+ self.text_encoder_2.requires_grad_(True)
390
+
391
+ self.text_encoder.text_model.encoder.requires_grad_(False)
392
+ self.text_encoder.text_model.final_layer_norm.requires_grad_(False)
393
+ self.text_encoder.text_model.embeddings.position_embedding.requires_grad_(False)
394
+
395
+ self.text_encoder_2.text_model.encoder.requires_grad_(False)
396
+ self.text_encoder_2.text_model.final_layer_norm.requires_grad_(False)
397
+ self.text_encoder_2.text_model.embeddings.position_embedding.requires_grad_(False)
398
+
399
+ weight_dtype = torch.float32
400
+ if accelerator.mixed_precision == "fp16":
401
+ weight_dtype = torch.float16
402
+ elif accelerator.mixed_precision == "bf16":
403
+ weight_dtype = torch.bfloat16
404
+
405
+ self.unet.to(device, dtype=weight_dtype)
406
+ self.vae.to(device, dtype=weight_dtype)
407
+
408
+
409
+ trainable_embmat_list_1 = [param for param in self.text_encoder.get_input_embeddings().parameters()]
410
+ trainable_embmat_list_2 = [param for param in self.text_encoder_2.get_input_embeddings().parameters()]
411
+
412
+ optimizer = torch.optim.AdamW(trainable_embmat_list_1 + trainable_embmat_list_2, lr=embedding_learning_rate)
413
+ self.text_encoder, self.text_encoder_2, optimizer= accelerator.prepare(self.text_encoder, self.text_encoder_2, optimizer) ###
414
+ orig_embeds_params_1 = accelerator.unwrap_model(self.text_encoder) .get_input_embeddings().weight.data.clone()
415
+ orig_embeds_params_2 = accelerator.unwrap_model(self.text_encoder_2).get_input_embeddings().weight.data.clone()
416
+
417
+ self.text_encoder.train()
418
+ self.text_encoder_2.train()
419
+
420
+ effective_emb_train_steps = max_emb_train_steps//gradient_accumulation_steps
421
+
422
+ if accelerator.is_main_process:
423
+ accelerator.init_trackers("EmbFt", config={
424
+ "embedding_learning_rate": embedding_learning_rate,
425
+ "text_embedding_optimization_steps": effective_emb_train_steps,
426
+ })
427
+
428
+ global_step = 0
429
+
430
+ noise_scheduler = DDPMScheduler.from_pretrained(self.model_id , subfolder="scheduler")
431
+ progress_bar = tqdm(range(0, effective_emb_train_steps),initial=global_step,desc="EmbSteps")
432
+ latents0_1 = image2latent(image_gt_1, vae = self.vae, dtype=weight_dtype)
433
+ latents0_1 = latents0_1.repeat(train_batch_size,1,1,1)
434
+
435
+ latents0_2 = image2latent(image_gt_2, vae = self.vae, dtype=weight_dtype)
436
+ latents0_2 = latents0_2.repeat(train_batch_size,1,1,1)
437
+
438
+ for step in range(max_emb_train_steps):
439
+ with accelerator.accumulate(self.text_encoder, self.text_encoder_2):
440
+ latents_1 = latents0_1.clone().detach()
441
+ noise_1 = torch.randn_like(latents_1)
442
+
443
+ latents_2 = latents0_2.clone().detach()
444
+ noise_2 = torch.randn_like(latents_2)
445
+
446
+ bsz = latents_1.shape[0]
447
+
448
+ timesteps_1 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_1.device)
449
+ timesteps_1 = timesteps_1.long()
450
+ noisy_latents_1 = noise_scheduler.add_noise(latents_1, noise_1, timesteps_1)
451
+
452
+ timesteps_2 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_2.device)
453
+ timesteps_2 = timesteps_2.long()
454
+ noisy_latents_2 = noise_scheduler.add_noise(latents_2, noise_2, timesteps_2)
455
+
456
+ register_attention_disentangled_control(self.unet, decom_controller_1)
457
+ encoder_hidden_states_list_1, add_text_embeds_1, add_time_ids_1 = sdxl_prepare_input_decom(
458
+ set_string_list_1,
459
+ self.tokenizer,
460
+ self.tokenizer_2,
461
+ self.text_encoder,
462
+ self.text_encoder_2,
463
+ length = max_length,
464
+ bsz = train_batch_size,
465
+ weight_dtype = weight_dtype
466
+ )
467
+
468
+ model_pred_1 = self.unet(
469
+ noisy_latents_1,
470
+ timesteps_1,
471
+ encoder_hidden_states=encoder_hidden_states_list_1,
472
+ cross_attention_kwargs=None,
473
+ added_cond_kwargs={"text_embeds": add_text_embeds_1, "time_ids": add_time_ids_1},
474
+ return_dict=False
475
+ )[0]
476
+
477
+ register_attention_disentangled_control(self.unet, decom_controller_2)
478
+ # import pdb; pdb.set_trace()
479
+ encoder_hidden_states_list_2, add_text_embeds_2, add_time_ids_2 = sdxl_prepare_input_decom(
480
+ set_string_list_2,
481
+ self.tokenizer,
482
+ self.tokenizer_2,
483
+ self.text_encoder,
484
+ self.text_encoder_2,
485
+ length = max_length,
486
+ bsz = train_batch_size,
487
+ weight_dtype = weight_dtype
488
+ )
489
+
490
+ model_pred_2 = self.unet(
491
+ noisy_latents_2,
492
+ timesteps_2,
493
+ encoder_hidden_states = encoder_hidden_states_list_2,
494
+ cross_attention_kwargs=None,
495
+ added_cond_kwargs={"text_embeds": add_text_embeds_2, "time_ids": add_time_ids_2},
496
+ return_dict=False
497
+ )[0]
498
+
499
+ loss_1 = F.mse_loss(model_pred_1.float(), noise_1.float(), reduction="mean") /2
500
+ loss_2 = F.mse_loss(model_pred_2.float(), noise_2.float(), reduction="mean") /2
501
+ loss = loss_1 + loss_2
502
+ accelerator.backward(loss)
503
+ optimizer.step()
504
+ optimizer.zero_grad()
505
+
506
+ index_no_updates = torch.ones((len(self.tokenizer),), dtype=torch.bool)
507
+ index_no_updates[self.min_added_id : self.max_added_id + 1] = False
508
+ with torch.no_grad():
509
+ accelerator.unwrap_model(self.text_encoder).get_input_embeddings().weight[
510
+ index_no_updates] = orig_embeds_params_1[index_no_updates]
511
+ index_no_updates = torch.ones((len(self.tokenizer_2),), dtype=torch.bool)
512
+ index_no_updates[self.min_added_id : self.max_added_id + 1] = False
513
+ with torch.no_grad():
514
+ accelerator.unwrap_model(self.text_encoder_2).get_input_embeddings().weight[
515
+ index_no_updates] = orig_embeds_params_2[index_no_updates]
516
+
517
+ logs = {"loss": loss.detach().item(), "lr": embedding_learning_rate}
518
+ progress_bar.set_postfix(**logs)
519
+ accelerator.log(logs, step=global_step)
520
+ if accelerator.sync_gradients:
521
+ progress_bar.update(1)
522
+ global_step += 1
523
+
524
+ if global_step >= max_emb_train_steps:
525
+ break
526
+ accelerator.wait_for_everyone()
527
+ accelerator.end_training()
528
+ self.text_encoder = accelerator.unwrap_model(self.text_encoder) .to(dtype = weight_dtype)
529
+ self.text_encoder_2 = accelerator.unwrap_model(self.text_encoder_2).to(dtype = weight_dtype)
530
+
531
+ def train_model_2imgs(
532
+ self,
533
+ image_gt_1,
534
+ image_gt_2,
535
+ set_string_list_1,
536
+ set_string_list_2,
537
+ gradient_accumulation_steps = 5,
538
+ max_diffusion_train_steps = 100,
539
+ diffusion_model_learning_rate = 1e-5,
540
+ train_batch_size = 1,
541
+ train_full_lora = False,
542
+ lora_rank = 4,
543
+ lora_alpha = 4
544
+ ):
545
+ self.unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet").to(device)
546
+ self.unet.ca_dim = 2048
547
+ decom_controller_1 = GroupedCAController(mask_list = self.mask_list)
548
+ decom_controller_2 = GroupedCAController(mask_list = self.mask_list_2)
549
+
550
+ mixed_precision = "fp16"
551
+ accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps,mixed_precision=mixed_precision)
552
+
553
+ weight_dtype = torch.float32
554
+ if accelerator.mixed_precision == "fp16":
555
+ weight_dtype = torch.float16
556
+ elif accelerator.mixed_precision == "bf16":
557
+ weight_dtype = torch.bfloat16
558
+
559
+
560
+ self.vae.requires_grad_(False)
561
+ self.vae.to(device, dtype=weight_dtype)
562
+ self.unet.requires_grad_(False)
563
+ self.unet.train()
564
+
565
+ self.text_encoder.requires_grad_(False)
566
+ self.text_encoder_2.requires_grad_(False)
567
+ if not train_full_lora:
568
+ trainable_params_list = []
569
+ for name, module in self.unet.named_modules():
570
+ module_name = type(module).__name__
571
+ if module_name == "Attention":
572
+ if module.to_k.in_features == 2048: # this is cross attention:
573
+ module.to_k.weight.requires_grad = True
574
+ trainable_params_list.append(module.to_k.weight)
575
+ if module.to_k.bias is not None:
576
+ module.to_k.bias.requires_grad = True
577
+ trainable_params_list.append(module.to_k.bias)
578
+
579
+ module.to_v.weight.requires_grad = True
580
+ trainable_params_list.append(module.to_v.weight)
581
+ if module.to_v.bias is not None:
582
+ module.to_v.bias.requires_grad = True
583
+ trainable_params_list.append(module.to_v.bias)
584
+ module.to_q.weight.requires_grad = True
585
+ trainable_params_list.append(module.to_q.weight)
586
+ if module.to_q.bias is not None:
587
+ module.to_q.bias.requires_grad = True
588
+ trainable_params_list.append(module.to_q.bias)
589
+ else:
590
+ unet_lora_config = LoraConfig(
591
+ r = lora_rank,
592
+ lora_alpha = lora_alpha,
593
+ init_lora_weights="gaussian",
594
+ target_modules=["to_k", "to_q", "to_v", "to_out.0"],
595
+ )
596
+ self.unet.add_adapter(unet_lora_config)
597
+ print("training full parameters using lora!")
598
+ trainable_params_list = list(filter(lambda p: p.requires_grad, self.unet.parameters()))
599
+
600
+ self.text_encoder.to(device, dtype=weight_dtype)
601
+ self.text_encoder_2.to(device, dtype=weight_dtype)
602
+ optimizer = torch.optim.AdamW(trainable_params_list, lr=diffusion_model_learning_rate)
603
+ self.unet, optimizer = accelerator.prepare(self.unet, optimizer)
604
+ psum2 = sum(p.numel() for p in trainable_params_list)  # total number of trainable parameters (useful for logging)
605
+
606
+ effective_diffusion_train_steps = max_diffusion_train_steps // gradient_accumulation_steps
607
+ if accelerator.is_main_process:
608
+ accelerator.init_trackers("ModelFt", config={
609
+ "diffusion_model_learning_rate": diffusion_model_learning_rate,
610
+ "diffusion_model_optimization_steps": effective_diffusion_train_steps,
611
+ })
612
+
613
+ global_step = 0
614
+ progress_bar = tqdm(range(0, effective_diffusion_train_steps),initial=global_step, desc="ModelSteps")
615
+ noise_scheduler = DDPMScheduler.from_pretrained(self.model_id, subfolder="scheduler")
616
+
617
+ latents0_1 = image2latent(image_gt_1, vae = self.vae, dtype=weight_dtype)
618
+ latents0_1 = latents0_1.repeat(train_batch_size, 1, 1, 1)
619
+
620
+ latents0_2 = image2latent(image_gt_2, vae = self.vae, dtype=weight_dtype)
621
+ latents0_2 = latents0_2.repeat(train_batch_size,1, 1, 1)
622
+
623
+ with torch.no_grad():
624
+ encoder_hidden_states_list_1, add_text_embeds_1, add_time_ids_1 = sdxl_prepare_input_decom(
625
+ set_string_list_1,
626
+ self.tokenizer,
627
+ self.tokenizer_2,
628
+ self.text_encoder,
629
+ self.text_encoder_2,
630
+ length = max_length,
631
+ bsz = train_batch_size,
632
+ weight_dtype = weight_dtype
633
+ )
634
+ encoder_hidden_states_list_2, add_text_embeds_2, add_time_ids_2 = sdxl_prepare_input_decom(
635
+ set_string_list_2,
636
+ self.tokenizer,
637
+ self.tokenizer_2,
638
+ self.text_encoder,
639
+ self.text_encoder_2,
640
+ length = max_length,
641
+ bsz = train_batch_size,
642
+ weight_dtype = weight_dtype
643
+ )
644
+
645
+ for _ in range(max_diffusion_train_steps):
646
+ with accelerator.accumulate(self.unet):
647
+ latents_1 = latents0_1.clone().detach()
648
+ noise_1 = torch.randn_like(latents_1)
649
+ bsz = latents_1.shape[0]
650
+ timesteps_1 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_1.device)
651
+ timesteps_1 = timesteps_1.long()
652
+ noisy_latents_1 = noise_scheduler.add_noise(latents_1, noise_1, timesteps_1)
653
+
654
+ latents_2 = latents0_2.clone().detach()
655
+ noise_2 = torch.randn_like(latents_2)
656
+ bsz = latents_2.shape[0]
657
+ timesteps_2 = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents_2.device)
658
+ timesteps_2 = timesteps_2.long()
659
+ noisy_latents_2 = noise_scheduler.add_noise(latents_2, noise_2, timesteps_2)
660
+
661
+ register_attention_disentangled_control(self.unet, decom_controller_1)
662
+ model_pred_1 = self.unet(
663
+ noisy_latents_1,
664
+ timesteps_1,
665
+ encoder_hidden_states = encoder_hidden_states_list_1,
666
+ cross_attention_kwargs = None,
667
+ return_dict = False,
668
+ added_cond_kwargs = {"text_embeds": add_text_embeds_1, "time_ids": add_time_ids_1}
669
+ )[0]
670
+
671
+ register_attention_disentangled_control(self.unet, decom_controller_2)
672
+ model_pred_2 = self.unet(
673
+ noisy_latents_2,
674
+ timesteps_2,
675
+ encoder_hidden_states = encoder_hidden_states_list_2,
676
+ cross_attention_kwargs = None,
677
+ return_dict=False,
678
+ added_cond_kwargs={"text_embeds": add_text_embeds_2, "time_ids": add_time_ids_2}
679
+ )[0]
680
+
681
+ loss_1 = F.mse_loss(model_pred_1.float(), noise_1.float(), reduction="mean")
682
+ loss_2 = F.mse_loss(model_pred_2.float(), noise_2.float(), reduction="mean")
683
+ loss = loss_1 + loss_2
684
+ accelerator.backward(loss)
685
+ optimizer.step()
686
+ optimizer.zero_grad()
687
+
688
+
689
+ logs = {"loss": loss.detach().item(), "lr": diffusion_model_learning_rate}
690
+ progress_bar.set_postfix(**logs)
691
+ accelerator.log(logs, step=global_step)
692
+ if accelerator.sync_gradients:
693
+ progress_bar.update(1)
694
+ global_step += 1
695
+
696
+ if global_step >=max_diffusion_train_steps:
697
+ break
698
+ accelerator.wait_for_everyone()
699
+ accelerator.end_training()
700
+ self.unet = accelerator.unwrap_model(self.unet).to(dtype = weight_dtype)
701
+
702
+ @torch.no_grad()
703
+ def backward_zT_to_z0_euler_decom(
704
+ self,
705
+ zT,
706
+ cond_emb_list,
707
+ cond_add_text_embeds,
708
+ add_time_ids,
709
+ uncond_emb=None,
710
+ guidance_scale = 1,
711
+ num_sampling_steps = 20,
712
+ cond_controller = None,
713
+ uncond_controller = None,
714
+ mask_hard = None,
715
+ mask_soft = None,
716
+ orig_image = None,
717
+ return_intermediate = False,
718
+ strength = 1
719
+ ):
720
+ latent_cur = zT
721
+ if uncond_emb is None:
722
+ uncond_emb = torch.zeros(zT.shape[0], 77, 2048).to(dtype = zT.dtype, device = zT.device)
723
+ uncond_add_text_embeds = torch.zeros(1, 1280).to(dtype = zT.dtype, device = zT.device)
724
+ if mask_soft is not None:
725
+ init_latents_orig = image2latent(orig_image, self.vae, dtype=self.vae.dtype)
726
+ length = init_latents_orig.shape[-1]
727
+ noise = torch.randn_like(init_latents_orig)
728
+ mask_soft = torch.nn.functional.interpolate(mask_soft.float().unsqueeze(0).unsqueeze(0), (length, length)).to(self.vae.dtype) ###
729
+ if mask_hard is not None:
730
+ init_latents_orig = image2latent(orig_image, self.vae, dtype=self.vae.dtype)
731
+ length = init_latents_orig.shape[-1]
732
+ noise = torch.randn_like(init_latents_orig)
733
+ mask_hard = torch.nn.functional.interpolate(mask_hard.float().unsqueeze(0).unsqueeze(0), (length, length)).to(self.vae.dtype) ###
734
+
735
+ intermediate_list = [latent_cur.detach()]
736
+ for i in tqdm(range(num_sampling_steps)):
737
+ t = self.scheduler.timesteps[i]
738
+ latent_input = self.scheduler.scale_model_input(latent_cur, t)
739
+
740
+ register_attention_disentangled_control(self.unet, uncond_controller)
741
+ noise_pred_uncond = self.unet(latent_input, t,
742
+ encoder_hidden_states=uncond_emb,
743
+ added_cond_kwargs={"text_embeds": uncond_add_text_embeds, "time_ids": add_time_ids},
744
+ return_dict=False,)[0]
745
+
746
+ register_attention_disentangled_control(self.unet, cond_controller)
747
+ noise_pred_cond = self.unet(latent_input, t,
748
+ encoder_hidden_states=cond_emb_list,
749
+ added_cond_kwargs={"text_embeds": cond_add_text_embeds, "time_ids": add_time_ids},
750
+ return_dict=False,)[0]
751
+
752
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
753
+ latent_cur = self.scheduler.step(noise_pred, t, latent_cur, generator = None, return_dict=False)[0]
754
+ if return_intermediate is True:
755
+ intermediate_list.append(latent_cur)
756
+ if mask_hard is not None and mask_soft is not None and i <= strength *num_sampling_steps:
757
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
758
+ mask = mask_soft.to(latent_cur.device, latent_cur.dtype) + mask_hard.to(latent_cur.device, latent_cur.dtype)
759
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
760
+
761
+ elif mask_hard is not None and mask_soft is not None and i > strength *num_sampling_steps:
762
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
763
+ mask = mask_hard.to(latent_cur.device, latent_cur.dtype)
764
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
765
+
766
+ elif mask_hard is None and mask_soft is not None and i <= strength *num_sampling_steps:
767
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
768
+ mask = mask_soft.to(latent_cur.device, latent_cur.dtype)
769
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
770
+
771
+ elif mask_hard is None and mask_soft is not None and i > strength *num_sampling_steps:
772
+ pass
773
+
774
+ elif mask_hard is not None and mask_soft is None:
775
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
776
+ mask = mask_hard.to(latent_cur.dtype)
777
+ latent_cur = (init_latents_proper * mask) + (latent_cur * (1 - mask))
778
+
779
+ else: # hard and soft are both none
780
+ pass
781
+
782
+ if return_intermediate is True:
783
+ return latent_cur, intermediate_list
784
+ else:
785
+ return latent_cur
786
+
787
+ @torch.no_grad()
788
+ def sampling(
789
+ self,
790
+ set_string_list,
791
+ cond_controller = None,
792
+ uncond_controller = None,
793
+ guidance_scale = 7,
794
+ num_sampling_steps = 20,
795
+ mask_hard = None,
796
+ mask_soft = None,
797
+ orig_image = None,
798
+ strength = 1.,
799
+ num_imgs = 1,
800
+ normal_token_id_list = [],
801
+ seed = 1
802
+ ):
803
+ weight_dtype = torch.float16
804
+ self.scheduler.set_timesteps(num_sampling_steps)
805
+ self.unet.to(device, dtype=weight_dtype)
806
+ self.vae.to(device, dtype=weight_dtype)
807
+ self.text_encoder.to(device, dtype=weight_dtype)
808
+ self.text_encoder_2.to(device, dtype=weight_dtype)
809
+ torch.manual_seed(seed)
810
+ torch.cuda.manual_seed(seed)
811
+
812
+ vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
813
+ zT = torch.randn(num_imgs, 4, self.resolution//vae_scale_factor,self.resolution//vae_scale_factor).to(device,dtype=weight_dtype)
814
+ zT = zT * self.scheduler.init_noise_sigma
815
+
816
+ cond_emb_list, cond_add_text_embeds, add_time_ids = sdxl_prepare_input_decom(
817
+ set_string_list,
818
+ self.tokenizer,
819
+ self.tokenizer_2,
820
+ self.text_encoder,
821
+ self.text_encoder_2,
822
+ length = max_length,
823
+ bsz = num_imgs,
824
+ weight_dtype = weight_dtype,
825
+ normal_token_id_list = normal_token_id_list
826
+ )
827
+
828
+ z0 = self.backward_zT_to_z0_euler_decom(zT, cond_emb_list, cond_add_text_embeds, add_time_ids,
829
+ guidance_scale = guidance_scale, num_sampling_steps = num_sampling_steps,
830
+ cond_controller = cond_controller, uncond_controller = uncond_controller,
831
+ mask_hard = mask_hard, mask_soft = mask_soft, orig_image =orig_image, strength = strength
832
+ )
833
+ x0 = latent2image(z0, vae = self.vae)
834
+ return x0
835
+
836
+ @torch.no_grad()
837
+ def inference_with_mask(
838
+ self,
839
+ save_path,
840
+ guidance_scale = 3,
841
+ num_sampling_steps = 50,
842
+ strength = 1,
843
+ mask_soft = None,
844
+ mask_hard= None,
845
+ orig_image=None,
846
+ mask_list = None,
847
+ num_imgs = 1,
848
+ seed = 1,
849
+ set_string_list = None
850
+ ):
851
+ if mask_list is not None:
852
+ mask_list = [m.to(device) for m in mask_list]
853
+ else:
854
+ mask_list = self.mask_list
855
+ if set_string_list is not None:
856
+ self.set_string_list = set_string_list
857
+
858
+ if mask_hard is not None and mask_soft is not None:
859
+ check_mask_overlap_torch(mask_hard, mask_soft)
860
+ null_controller = DummyController()
861
+ decom_controller = GroupedCAController(mask_list = mask_list)
862
+ x0 = self.sampling(
863
+ self.set_string_list,
864
+ guidance_scale = guidance_scale,
865
+ num_sampling_steps = num_sampling_steps,
866
+ strength = strength,
867
+ cond_controller = decom_controller,
868
+ uncond_controller = null_controller,
869
+ mask_soft = mask_soft,
870
+ mask_hard = mask_hard,
871
+ orig_image = orig_image,
872
+ num_imgs = num_imgs,
873
+ seed = seed
874
+ )
875
+ save_images(x0, save_path)
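Note on the sampling loop above: backward_zT_to_z0_euler_decom combines classifier-free guidance on the noise prediction with a latent blending step, in which the region covered by the (hard or soft) mask is replaced by a re-noised copy of the original image's latents while the rest keeps the edited latent. A minimal, self-contained sketch of those two update rules on dummy tensors (no UNet, VAE, or scheduler assumed; all shapes are placeholders):

import torch

# dummy stand-ins for the quantities used in backward_zT_to_z0_euler_decom
latent_cur = torch.randn(1, 4, 64, 64)            # current edited latent at step i
init_latents_proper = torch.randn(1, 4, 64, 64)   # original-image latents re-noised to timestep t
mask = (torch.rand(1, 1, 64, 64) > 0.5).float()   # 1 = keep original content, 0 = keep the edit

# classifier-free guidance on the noise prediction
noise_pred_uncond = torch.randn(1, 4, 64, 64)
noise_pred_cond = torch.randn(1, 4, 64, 64)
guidance_scale = 3.0
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

# latent blending: original content inside the mask, edited content outside
latent_cur = init_latents_proper * mask + latent_cur * (1 - mask)
print(noise_pred.shape, latent_cur.shape)

In the real loop, init_latents_proper comes from scheduler.add_noise(init_latents_orig, noise, t) and mask is the interpolated mask_hard / mask_soft tensor.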
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.2.0
2
+ torchvision==0.17.0
3
+ transformers==4.37.2
4
+ accelerate==0.23.0
5
+ gradio==3.41.1
6
+ xformers==0.0.24
7
+ diffusers==0.26.3
8
+ scipy
9
+ tqdm
10
+ numpy
11
+ safetensors
12
+ peft
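A small optional sanity check (not part of the repository) that the pinned stack above imports cleanly, assuming the packages were installed from this requirements.txt:

# optional environment check for the pinned dependencies above
import torch, torchvision, transformers, accelerate, diffusers, peft

print("torch", torch.__version__, "| cuda available:", torch.cuda.is_available())
print("torchvision", torchvision.__version__, "| accelerate", accelerate.__version__)
print("diffusers", diffusers.__version__, "| transformers", transformers.__version__)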
scripts/run_segment.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ # python segment.py --name=$IMAGE_NAME --size=512
3
+ python segment.py --name=$IMAGE_NAME --size=1024
scripts/run_segmentSAM.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python segment_sam.py --name=$IMAGE_NAME --text_prompt="bag"
scripts/sd/run_ft_sd_512.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sd" \
4
+ --resolution=512 \
5
+ --num_tokens=5 \
6
+ --embedding_learning_rate=1e-4 \
7
+ --diffusion_model_learning_rate=5e-5 \
8
+ --max_emb_train_steps=500 \
9
+ --max_diffusion_train_steps=500 \
10
+ --train_batch_size=5 \
11
+ --gradient_accumulation_steps=5
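For reference, with gradient accumulation the trainers above only perform an optimizer update every gradient_accumulation_steps iterations, so the number of actual parameter updates is max_*_train_steps // gradient_accumulation_steps (this mirrors the effective_diffusion_train_steps computation in the training code). A quick arithmetic sketch for the settings in this script:

# effective optimizer updates for the flags used above
max_emb_train_steps = 500
max_diffusion_train_steps = 500
gradient_accumulation_steps = 5
print(max_emb_train_steps // gradient_accumulation_steps)        # 100 embedding updates
print(max_diffusion_train_steps // gradient_accumulation_steps)  # 100 diffusion-model updates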
scripts/sd/run_ft_sd_512_2imgs.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ export IMAGE_NAME_2="example2"
3
+ python main.py --name=$IMAGE_NAME \
4
+ --dpm="sd" \
5
+ --resolution=512 \
6
+ --image \
7
+ --name_2=$IMAGE_NAME_2 \
8
+ --embedding_learning_rate=1e-4 \
9
+ --diffusion_model_learning_rate=5e-5 \
10
+ --max_emb_train_steps=500 \
11
+ --max_diffusion_train_steps=500 \
12
+ --train_batch_size=5 \
13
+ --gradient_accumulation_steps=5
14
+
scripts/sd/run_image.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ export IMAGE_NAME_2="example2"
3
+ python main.py --name=$IMAGE_NAME \
4
+ --name_2=$IMAGE_NAME_2 \
5
+ --dpm="sd" \
6
+ --resolution=512 \
7
+ --image \
8
+ --load_trained \
9
+ --guidance_scale=2 \
10
+ --num_imgs=2 \
11
+ --seed=2024 \
12
+ --strength=0.5 \
13
+ --edge_thickness=10 \
14
+ --src_index=1 --tgt_index=0 \
15
+ --tgt_name=$IMAGE_NAME
scripts/sd/run_move_resize.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ CUDA_VISIBLE_DEVICES=1 python main.py --name=$IMAGE_NAME \
3
+ --dpm="sd" \
4
+ --resolution=512 \
5
+ --num_tokens=5 \
6
+ --load_trained \
7
+ --move_resize \
8
+ --seed=2023 \
9
+ --num_sampling_step=50 \
10
+ --strength=0.6 \
11
+ --edge_thickness=10 \
12
+ --guidance_scale=2 \
13
+ --num_imgs=1 \
14
+ --tgt_indices_list 0 \
15
+ --active_mask_list 2 \
16
+ --delta_x 100 --delta_y 60 \
17
+ --resize_list 0.6 \
18
+ --priority_list 1
scripts/sd/run_recon.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sd" \
4
+ --resolution=512 \
5
+ --num_tokens=5 \
6
+ --load_trained \
7
+ --recon \
8
+ --seed=2024 \
9
+ --guidance_scale=2 \
10
+ --num_sampling_step=20 \
11
+ --num_imgs=1 \
scripts/sd/run_remove.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # export IMAGE_NAME="example1"
2
+ # CUDA_VISIBLE_DEVICES=1 python main.py --name=$IMAGE_NAME \
3
+ # --dpm="sd" \
4
+ # --resolution=512 \
5
+ # --num_tokens=5 \
6
+ # --load_trained \
7
+ # --load_edited_mask \
8
+ # --remove \
9
+ # --seed=2023 \
10
+ # --num_sampling_step=50 \
11
+ # --strength=0.7 \
12
+ # --edge_thickness=10 \
13
+ # --guidance_scale=2 \
14
+ # --num_imgs=1 \
15
+ # --tgt_index=0
16
+
17
+
18
+
19
+ # export IMAGE_NAME="example1"
20
+ # CUDA_VISIBLE_DEVICES=1 python main.py --name=$IMAGE_NAME \
21
+ # --dpm="sd" \
22
+ # --resolution=512 \
23
+ # --num_tokens=5 \
24
+ # --load_trained \
25
+ # --load_edited_processed_mask \
26
+ # --remove \
27
+ # --seed=2024 \
28
+ # --num_sampling_step=50 \
29
+ # --strength=0.5 \
30
+ # --edge_thickness=10 \
31
+ # --guidance_scale=7 \
32
+ # --num_imgs=1 \
33
+ # --tgt_index=2
34
+
35
+
36
+
37
+
38
+ export IMAGE_NAME="example1"
39
+ CUDA_VISIBLE_DEVICES=1 python main.py --name=$IMAGE_NAME \
40
+ --dpm="sd" \
41
+ --resolution=512 \
42
+ --num_tokens=5 \
43
+ --load_trained \
44
+ --load_edited_processed_mask \
45
+ --remove \
46
+ --seed=1 \
47
+ --num_sampling_step=50 \
48
+ --strength=0.6 \
49
+ --edge_thickness=10 \
50
+ --guidance_scale=7 \
51
+ --num_imgs=1 \
52
+ --tgt_index=2
53
+
54
+
55
+
scripts/sd/run_text.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # export IMAGE_NAME="example1"
2
+ # python main.py --name=$IMAGE_NAME \
3
+ # --dpm="sd" \
4
+ # --resolution=512 \
5
+ # --load_trained \
6
+ # --text \
7
+ # --num_tokens=5 \
8
+ # --seed=2024 \
9
+ # --guidance_scale=7 \
10
+ # --num_sampling_step=50 \
11
+ # --strength=0.7 \
12
+ # --edge_thickness=15 \
13
+ # --num_imgs=1 \
14
+ # --tgt_prompt="a red bag" \
15
+ # --tgt_index=0
16
+
17
+
18
+ export IMAGE_NAME="example1"
19
+ python main.py --name=$IMAGE_NAME \
20
+ --dpm="sd" \
21
+ --resolution=512 \
22
+ --load_trained \
23
+ --text \
24
+ --num_tokens=5 \
25
+ --seed=2024 \
26
+ --guidance_scale=6 \
27
+ --num_sampling_step=50 \
28
+ --strength=0.5 \
29
+ --edge_thickness=15 \
30
+ --num_imgs=2 \
31
+ --tgt_prompt="a black bag" \
32
+ --tgt_index=0
33
+
scripts/sdxl/run_ft_sdxl_1024.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --num_tokens=5 \
6
+ --embedding_learning_rate=1e-4 \
7
+ --diffusion_model_learning_rate=5e-5 \
8
+ --max_emb_train_steps=500 \
9
+ --max_diffusion_train_steps=500 \
10
+ --train_batch_size=2 \
11
+ --gradient_accumulation_steps=5
12
+
scripts/sdxl/run_ft_sdxl_1024_2imgs.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ export IMAGE_NAME_2="example2"
3
+ python main.py --name=$IMAGE_NAME \
4
+ --dpm="sdxl" \
5
+ --image \
6
+ --name_2=$IMAGE_NAME_2 \
7
+ --resolution=1024 \
8
+ --embedding_learning_rate=1e-4 \
9
+ --diffusion_model_learning_rate=5e-5 \
10
+ --max_emb_train_steps=500 \
11
+ --max_diffusion_train_steps=500 \
12
+ --train_batch_size=1 \
13
+ --gradient_accumulation_steps=5
14
+
scripts/sdxl/run_ft_sdxl_1024_auxin_todo.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --train_full_lora \
6
+ --embedding_learning_rate=1e-4 \
7
+ --diffusion_model_learning_rate=1e-3 \
8
+ --max_emb_train_steps=500 \
9
+ --max_diffusion_train_steps=500 \
10
+ --train_batch_size=2 \
11
+ --gradient_accumulation_steps=5 \
12
+ --prompt_auxin_idx_list 0 2 \
13
+ --prompt_auxin_list "a photo of * handbag" "a photo of * model"
14
+
15
+ export IMAGE_NAME="example1"
16
+ python main.py --name=$IMAGE_NAME \
17
+ --dpm="sdxl" \
18
+ --resolution=1024 \
19
+ --train_full_lora \
20
+ --load_trained \
21
+ --recon \
22
+ --seed=23 \
23
+ --guidance_scale=7 \
24
+ --num_sampling_step=20 \
25
+ --num_imgs=2 \
26
+ --prompt_auxin_idx_list 0 2 \
27
+ --prompt_auxin_list "a photo of * handbag" "a photo of * model"
28
+
29
+
scripts/sdxl/run_ft_sdxl_1024_fulllora.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --train_full_lora \
5
+ --resolution=1024 \
6
+ --embedding_learning_rate=1e-4 \
7
+ --diffusion_model_learning_rate=1e-4 \
8
+ --max_emb_train_steps=500 \
9
+ --max_diffusion_train_steps=500 \
10
+ --train_batch_size=2 \
11
+ --gradient_accumulation_steps=5
12
+
scripts/sdxl/run_ft_sdxl_1024_fulllora_2imgs.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ export IMAGE_NAME_2="example2"
3
+ # python main.py --name=$IMAGE_NAME \
4
+ # --dpm="sdxl" \
5
+ # --image \
6
+ # --name_2=$IMAGE_NAME_2 \
7
+ # --resolution=1024 \
8
+ # --embedding_learning_rate=1e-4 \
9
+ # --diffusion_model_learning_rate=5e-5 \
10
+ # --max_emb_train_steps=500 \
11
+ # --max_diffusion_train_steps=500 \
12
+ # --train_batch_size=1 \
13
+ # --gradient_accumulation_steps=5
14
+
15
+ python main.py --name=$IMAGE_NAME \
16
+ --dpm="sdxl" \
17
+ --image \
18
+ --train_full_lora \
19
+ --name_2=$IMAGE_NAME_2 \
20
+ --resolution=1024 \
21
+ --embedding_learning_rate=1e-4 \
22
+ --diffusion_model_learning_rate=5e-4 \
23
+ --max_emb_train_steps=500 \
24
+ --max_diffusion_train_steps=500 \
25
+ --train_batch_size=1 \
26
+ --gradient_accumulation_steps=5
27
+
28
+
29
+ # python main.py --load_trained \
30
+ # --dpm="sdxl" \
31
+ # --image \
32
+ # --name=$IMAGE_NAME \
33
+ # --name_2=$IMAGE_NAME_2 \
34
+ # --tgt_name=$IMAGE_NAME \
35
+ # --guidance_scale 2.5 \
36
+ # --edge_thickness 40 \
37
+ # --strength 0.5 \
38
+ # --seed 29 \
39
+ # --num_imgs 4 \
40
+ # --tgt_index=0 \
41
+ # --src_index=2
scripts/sdxl/run_image.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ export IMAGE_NAME_2="example2"
3
+ python main.py --name=$IMAGE_NAME \
4
+ --name_2=$IMAGE_NAME_2 \
5
+ --dpm="sdxl" \
6
+ --image \
7
+ --load_trained \
8
+ --resolution=1024 \
9
+ --guidance_scale=2.8 \
10
+ --num_imgs=2 \
11
+ --seed=2023 \
12
+ --strength=0.5 \
13
+ --edge_thickness=20 \
14
+ --src_index=2 --tgt_index=0 \
15
+ --tgt_name=$IMAGE_NAME
scripts/sdxl/run_image_w_edited_mask.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ export IMAGE_NAME_2="example2"
3
+ python main.py --name=$IMAGE_NAME \
4
+ --name_2=$IMAGE_NAME_2 \
5
+ --dpm="sdxl" \
6
+ --image \
7
+ --load_trained \
8
+ --load_edited_mask \
9
+ --resolution=1024 \
10
+ --guidance_scale=2.8 \
11
+ --num_imgs=2 \
12
+ --seed=2023 \
13
+ --strength=0.5 \
14
+ --edge_thickness=20 \
15
+ --src_index=2 --tgt_index=0 \
16
+ --tgt_name=$IMAGE_NAME
scripts/sdxl/run_move_resize.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ CUDA_VISIBLE_DEVICES=1 python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --num_tokens=5 \
6
+ --load_edited_mask \
7
+ --load_trained \
8
+ --move_resize \
9
+ --seed=2023 \
10
+ --num_sampling_step=20 \
11
+ --strength=0.5 \
12
+ --edge_thickness=20 \
13
+ --guidance_scale=2.8 \
14
+ --num_imgs=2 \
15
+ --tgt_indices_list 0 \
16
+ --active_mask_list 2 \
17
+ --delta_x 200 --delta_y 140 \
18
+ --resize_list 0.5 \
19
+ --priority_list 1
20
+
21
+ # --load_edited_processed_mask
scripts/sdxl/run_recon.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --num_tokens=5 \
6
+ --load_trained \
7
+ --recon \
8
+ --seed=20 \
9
+ --guidance_scale=3 \
10
+ --num_sampling_step=20 \
11
+ --num_imgs=2 \
scripts/sdxl/run_recon_item_todo.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # export IMAGE_NAME="example1"
2
+ # python main.py --name=$IMAGE_NAME \
3
+ # --dpm="sdxl" \
4
+ # --resolution=1024 \
5
+ # --load_trained \
6
+ # --recon \
7
+ # --recon_an_item \
8
+ # --seed=23 \
9
+ # --guidance_scale=6 \
10
+ # --num_sampling_step=20 \
11
+ # --num_imgs=2 \
12
+ # --tgt_index=0 \
13
+ # --recon_prompt="a photo of a * handbag on a table"
14
+
15
+ export IMAGE_NAME="example1"
16
+ python main.py --name=$IMAGE_NAME \
17
+ --dpm="sdxl" \
18
+ --resolution=1024 \
19
+ --load_trained \
20
+ --recon \
21
+ --recon_an_item \
22
+ --seed=23 \
23
+ --guidance_scale=6 \
24
+ --num_sampling_step=20 \
25
+ --num_imgs=2 \
26
+ --tgt_index=2 \
27
+ --recon_prompt="a photo of a * model on a chair"
scripts/sdxl/run_remove.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --num_tokens=5 \
6
+ --load_edited_mask \
7
+ --load_trained \
8
+ --remove \
9
+ --seed=0 \
10
+ --num_sampling_step=20 \
11
+ --strength=0.4 \
12
+ --edge_thickness=20 \
13
+ --guidance_scale=3 \
14
+ --num_imgs=1 \
15
+ --tgt_index=0
16
+
17
+ # --load_edited_processed_mask
scripts/sdxl/run_text.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --load_trained \
6
+ --num_tokens=5 \
7
+ --text \
8
+ --seed=23 \
9
+ --num_sampling_step=20 \
10
+ --strength=0.6 \
11
+ --edge_thickness=30 \
12
+ --num_imgs=2 \
13
+ --tgt_prompt="a white handbag" \
14
+ --tgt_index=0
scripts/sdxl/run_text_w_edited_mask.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export IMAGE_NAME="example1"
2
+ python main.py --name=$IMAGE_NAME \
3
+ --dpm="sdxl" \
4
+ --resolution=1024 \
5
+ --num_tokens=5 \
6
+ --load_trained \
7
+ --load_edited_mask \
8
+ --text \
9
+ --seed=23 \
10
+ --num_sampling_step=50 \
11
+ --strength=0.7 \
12
+ --edge_thickness=30 \
13
+ --num_imgs=2 \
14
+ --tgt_prompt="a white handbag" \
15
+ --tgt_index=0
segment.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
3
+ from PIL import Image
4
+ import torch
5
+ from collections import defaultdict
6
+ import matplotlib.pyplot as plt
7
+ from matplotlib import cm
8
+ import matplotlib.patches as mpatches
9
+ import os
10
+ import numpy as np
11
+ import argparse
12
+ import matplotlib
13
+
14
+ def load_image(image_path, left=0, right=0, top=0, bottom=0, size = 512):
15
+ if type(image_path) is str:
16
+ image = np.array(Image.open(image_path))[:, :, :3]
17
+ else:
18
+ image = image_path
19
+ h, w, c = image.shape
20
+ left = min(left, w-1)
21
+ right = min(right, w - left - 1)
22
+ top = min(top, h - left - 1)
23
+ bottom = min(bottom, h - top - 1)
24
+ image = image[top:h-bottom, left:w-right]
25
+ h, w, c = image.shape
26
+ if h < w:
27
+ offset = (w - h) // 2
28
+ image = image[:, offset:offset + h]
29
+ elif w < h:
30
+ offset = (h - w) // 2
31
+ image = image[offset:offset + w]
32
+ image = np.array(Image.fromarray(image).resize((size, size)))
33
+ return image
34
+
35
+ def draw_panoptic_segmentation(segmentation, segments_info,save_folder=None, noseg = False):
36
+ if torch.max(segmentation)==torch.min(segmentation)==-1:
37
+ print("nothing is detected!")
38
+ noseg=True
39
+ viridis = matplotlib.colormaps['viridis'].resampled(1)
40
+ else:
41
+ viridis = matplotlib.colormaps['viridis'].resampled(torch.max(segmentation)-torch.min(segmentation)+1)
42
+ fig, ax = plt.subplots()
43
+ ax.imshow(segmentation)
44
+ instances_counter = defaultdict(int)
45
+ handles = []
46
+ label_list = []
47
+ if not noseg:
48
+ if torch.min(segmentation) == 0:
49
+ mask = segmentation==0
50
+ mask = mask.cpu().detach().numpy() # [512,512] bool
51
+ segment_label = "rest"
52
+ np.save( os.path.join(save_folder, "mask{}_{}.npy".format(0,"rest")) , mask)
53
+ color = viridis(0)
54
+ label = f"{segment_label}-{0}"
55
+ handles.append(mpatches.Patch(color=color, label=label))
56
+ label_list.append(label)
57
+
58
+ for segment in segments_info:
59
+ segment_id = segment['id']
60
+ mask = segmentation==segment_id
61
+ if torch.min(segmentation) != 0:
62
+ segment_id -= 1
63
+ mask = mask.cpu().detach().numpy() # [512,512] bool
64
+
65
+ segment_label = model.config.id2label[segment['label_id']]
66
+ instances_counter[segment['label_id']] += 1
67
+ np.save( os.path.join(save_folder, "mask{}_{}.npy".format(segment_id,segment_label)) , mask)
68
+ color = viridis(segment_id)
69
+
70
+ label = f"{segment_label}-{segment_id}"
71
+ handles.append(mpatches.Patch(color=color, label=label))
72
+ label_list.append(label)
73
+ else:
74
+ mask = np.full(segmentation.shape, True)
75
+ segment_label = "all"
76
+ np.save( os.path.join(save_folder, "mask{}_{}.npy".format(0,"all")) , mask)
77
+ color = viridis(0)
78
+ label = f"{segment_label}-{0}"
79
+ handles.append(mpatches.Patch(color=color, label=label))
80
+ label_list.append(label)
81
+
82
+ plt.xticks([])
83
+ plt.yticks([])
84
+ # plt.savefig(os.path.join(save_folder, 'mask_clear.png'), dpi=500)
85
+ ax.legend(handles=handles)
86
+ plt.savefig(os.path.join(save_folder, 'seg_init.png'), dpi=500 )
87
+ print("; ".join(label_list))
88
+
89
+
90
+
91
+ parser = argparse.ArgumentParser()
92
+ parser.add_argument("--name", type=str, default="obama")
93
+ parser.add_argument("--size", type=int, default=512)
94
+ parser.add_argument("--noseg", default=False, action="store_true" )
95
+ args = parser.parse_args()
96
+ base_folder_path = "."
97
+
98
+ processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-base-coco-panoptic")
99
+ model = Mask2FormerForUniversalSegmentation.from_pretrained("facebook/mask2former-swin-base-coco-panoptic")
100
+ input_folder = os.path.join(base_folder_path, args.name )
101
+ try:
102
+ image = load_image(os.path.join(input_folder, "img.png" ), size = args.size)
103
+ except FileNotFoundError:  # fall back to img.jpg when img.png is absent
104
+ image = load_image(os.path.join(input_folder, "img.jpg" ), size = args.size)
105
+
106
+ image =Image.fromarray(image)
107
+ image.save(os.path.join(input_folder,"img_{}.png".format(args.size)))
108
+ inputs = processor(image, return_tensors="pt")
109
+ with torch.no_grad():
110
+ outputs = model(**inputs)
111
+
112
+ panoptic_segmentation = processor.post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
113
+ save_folder = os.path.join(base_folder_path, args.name)
114
+ os.makedirs(save_folder, exist_ok=True)
115
+ draw_panoptic_segmentation(**panoptic_segmentation, save_folder = save_folder, noseg = args.noseg)
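segment.py stores one boolean array per panoptic segment as mask{idx}_{label}.npy next to the resized image. A minimal sketch of reading those files back into torch masks, in the same spirit as load_mask in utils.py; the folder name "example1" is only an assumed example, and files are taken in simple filename order:

import os
import numpy as np
import torch

folder = "example1"  # assumed example folder produced by segment.py
masks, labels = [], []
for fname in sorted(f for f in os.listdir(folder)
                    if f.startswith("mask") and f.endswith(".npy") and "_" in f):
    masks.append(torch.from_numpy(np.load(os.path.join(folder, fname))))  # boolean HxW mask
    labels.append(fname.split("_", 1)[1][:-4])                            # label part of the filename
print(labels, [m.shape for m in masks])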
segment_sam.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import copy
4
+ import shutil
5
+
6
+ import numpy as np
7
+ import json
8
+ import torch
9
+ from PIL import Image, ImageDraw, ImageFont
10
+
11
+ # Grounding DINO
12
+ import sys
13
+
14
+ sys.path.append("/path/to/Grounded-Segment-Anything")
15
+ # change to your "Grounded-Segment-Anything" installation folder!!!!!
16
+ import GroundingDINO.groundingdino.datasets.transforms as T
17
+ from GroundingDINO.groundingdino.models import build_model
18
+ from GroundingDINO.groundingdino.util import box_ops
19
+ from GroundingDINO.groundingdino.util.slconfig import SLConfig
20
+ from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
21
+
22
+ # segment anything
23
+ from segment_anything import (
24
+ sam_model_registry,
25
+ sam_hq_model_registry,
26
+ SamPredictor
27
+ )
28
+ import cv2
29
+ import numpy as np
30
+ import matplotlib.pyplot as plt
31
+ def load_image_to_resize(image_path, left=0, right=0, top=0, bottom=0, size = 512):
32
+ if type(image_path) is str:
33
+ image = np.array(Image.open(image_path))[:, :, :3]
34
+ else:
35
+ image = image_path
36
+ h, w, c = image.shape
37
+ left = min(left, w-1)
38
+ right = min(right, w - left - 1)
39
+ top = min(top, h - 1)
40
+ bottom = min(bottom, h - top - 1)
41
+ image = image[top:h-bottom, left:w-right]
42
+ h, w, c = image.shape
43
+ if h < w:
44
+ offset = (w - h) // 2
45
+ image = image[:, offset:offset + h]
46
+ elif w < h:
47
+ offset = (h - w) // 2
48
+ image = image[offset:offset + w]
49
+ image = np.array(Image.fromarray(image).resize((size, size)))
50
+ return image
51
+
52
+
53
+ def load_image(image_path):
54
+ # load image
55
+ image_pil = Image.open(image_path).convert("RGB") # load image
56
+
57
+ transform = T.Compose(
58
+ [
59
+ T.RandomResize([800], max_size=1333),
60
+ T.ToTensor(),
61
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
62
+ ]
63
+ )
64
+ image, _ = transform(image_pil, None) # 3, h, w
65
+ return image_pil, image
66
+
67
+
68
+ def load_model(model_config_path, model_checkpoint_path, device):
69
+ args = SLConfig.fromfile(model_config_path)
70
+ args.device = device
71
+ model = build_model(args)
72
+ checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
73
+ load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
74
+ model.eval()
75
+ return model
76
+
77
+
78
+ def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
79
+ caption = caption.lower()
80
+ caption = caption.strip()
81
+ if not caption.endswith("."):
82
+ caption = caption + "."
83
+ model = model.to(device)
84
+ image = image.to(device)
85
+ with torch.no_grad():
86
+ outputs = model(image[None], captions=[caption])
87
+ logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
88
+ boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
89
+ logits.shape[0]
90
+
91
+ # filter output
92
+ logits_filt = logits.clone()
93
+ boxes_filt = boxes.clone()
94
+ filt_mask = logits_filt.max(dim=1)[0] > box_threshold
95
+ logits_filt = logits_filt[filt_mask] # num_filt, 256
96
+ boxes_filt = boxes_filt[filt_mask] # num_filt, 4
97
+ logits_filt.shape[0]
98
+
99
+ # get phrase
100
+ tokenlizer = model.tokenizer
101
+ tokenized = tokenlizer(caption)
102
+ # build pred
103
+ pred_phrases = []
104
+ for logit, box in zip(logits_filt, boxes_filt):
105
+ pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
106
+ if with_logits:
107
+ pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
108
+ else:
109
+ pred_phrases.append(pred_phrase)
110
+
111
+ return boxes_filt, pred_phrases
112
+
113
+ def show_mask(mask, ax, random_color=False):
114
+ if random_color:
115
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
116
+ else:
117
+ color = np.array([30/255, 144/255, 255/255, 0.6])
118
+ h, w = mask.shape[-2:]
119
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
120
+ ax.imshow(mask_image)
121
+
122
+
123
+ def show_box(box, ax, label):
124
+ x0, y0 = box[0], box[1]
125
+ w, h = box[2] - box[0], box[3] - box[1]
126
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
127
+ ax.text(x0, y0, label)
128
+
129
+
130
+ def save_mask_data(output_dir, mask_list, box_list, label_list):
131
+ value = 0 # 0 for background
132
+
133
+ mask_img = torch.zeros(mask_list.shape[-2:])
134
+ for idx, mask in enumerate(mask_list):
135
+ mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
136
+ plt.figure(figsize=(10, 10))
137
+ plt.imshow(mask_img.numpy())
138
+ plt.axis('off')
139
+ plt.savefig(os.path.join(output_dir, 'mask.jpg'), bbox_inches="tight", dpi=300, pad_inches=0.0)
140
+
141
+ json_data = [{
142
+ 'value': value,
143
+ 'label': 'background'
144
+ }]
145
+ for label, box in zip(label_list, box_list):
146
+ value += 1
147
+ name, logit = label.split('(')
148
+ logit = logit[:-1] # the last is ')'
149
+ json_data.append({
150
+ 'value': value,
151
+ 'label': name,
152
+ 'logit': float(logit),
153
+ 'box': box.numpy().tolist(),
154
+ })
155
+ with open(os.path.join(output_dir, 'mask.json'), 'w') as f:
156
+ json.dump(json_data, f)
157
+
158
+
159
+ if __name__ == "__main__":
160
+
161
+ parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
162
+ parser.add_argument("--sam_version", type=str, default="vit_h", required=False, help="SAM ViT version: vit_b / vit_l / vit_h")
163
+ parser.add_argument("--sam_checkpoint", type=str, required=False, help="path to sam checkpoint file")
164
+ parser.add_argument("--sam_hq_checkpoint", type=str, default=None, help="path to sam-hq checkpoint file")
165
+ parser.add_argument("--use_sam_hq", action="store_true", help="using sam-hq for prediction")
166
+ parser.add_argument("--text_prompt", type=str, required=True, help="text prompt")
167
+
168
+ parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
169
+ parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
170
+ parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
171
+ parser.add_argument("--name", type=str, default="", help="name of the input image folder")
172
+ parser.add_argument("--size", type=int, default=1024, help="image size")
173
+
174
+ args = parser.parse_args()
175
+ args.base_folder = "/path/to/Grounded-Segment-Anything"
176
+ # change to your "Grounded-Segment-Anything" installation folder!!!!!
177
+ input_folder = os.path.join(".", args.name)
178
+
179
+ args.config = os.path.join(args.base_folder,"GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
180
+ args.grounded_checkpoint = "groundingdino_swint_ogc.pth"
181
+ args.sam_checkpoint="sam_vit_h_4b8939.pth"
182
+ args.box_threshold = 0.3
183
+ args.text_threshold = 0.25
184
+ args.device = "cuda"
185
+ # cfg
186
+
187
+ config_file = args.config # change the path of the model config file
188
+ grounded_checkpoint = os.path.join(args.base_folder,args.grounded_checkpoint) # change the path of the model
189
+ sam_version = args.sam_version
190
+ sam_checkpoint = os.path.join(args.base_folder,args.sam_checkpoint)
191
+ if args.sam_hq_checkpoint is not None:
192
+ sam_hq_checkpoint = os.path.join(args.base_folder,args.sam_hq_checkpoint)
193
+ use_sam_hq = args.use_sam_hq
194
+ # image_path = args.input_image
195
+ text_prompt = args.text_prompt
196
+ # output_dir = args.output_dir
197
+ box_threshold = args.box_threshold
198
+ text_threshold = args.text_threshold
199
+ device = args.device
200
+
201
+ output_dir = input_folder
202
+ os.makedirs(output_dir, exist_ok=True)
203
+
204
+ # unify names
205
+
206
+ if len(os.listdir(input_folder)) == 1:
207
+ for filename in os.listdir(input_folder):
208
+ imgtype = "." + filename.split(".")[-1]
209
+ shutil.move(os.path.join(input_folder, filename), os.path.join(input_folder, "img"+imgtype))
210
+
211
+
212
+
213
+ ### resizing and save
214
+ if os.path.exists(os.path.join(input_folder, "img.jpg")):
215
+ image_path = os.path.join(input_folder, "img.jpg")
216
+ else:
217
+ image_path = os.path.join(input_folder, "img.png")
218
+ image = load_image_to_resize(image_path, size = args.size)
219
+ image =Image.fromarray(image)
220
+ resized_image_path = os.path.join(input_folder, "img_{}.png".format(args.size))
221
+ image.save(resized_image_path)
222
+
223
+ image_path = resized_image_path
224
+ # load image
225
+ image_pil, image = load_image(image_path)
226
+ # load model
227
+ model = load_model(config_file, grounded_checkpoint, device=device)
228
+
229
+ # # visualize raw image
230
+ # image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
231
+
232
+ # run grounding dino model
233
+ boxes_filt, pred_phrases = get_grounding_output(
234
+ model, image, text_prompt, box_threshold, text_threshold, device=device
235
+ )
236
+
237
+ # initialize SAM
238
+ if use_sam_hq:
239
+ predictor = SamPredictor(sam_hq_model_registry[sam_version](checkpoint=sam_hq_checkpoint).to(device))
240
+ else:
241
+ predictor = SamPredictor(sam_model_registry[sam_version](checkpoint=sam_checkpoint).to(device))
242
+ image = cv2.imread(image_path)
243
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
244
+ predictor.set_image(image)
245
+
246
+ size = image_pil.size
247
+ H, W = size[1], size[0]
248
+ for i in range(boxes_filt.size(0)):
249
+ boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
250
+ boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
251
+ boxes_filt[i][2:] += boxes_filt[i][:2]
252
+
253
+ boxes_filt = boxes_filt.cpu()
254
+ transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device)
255
+
256
+ masks, _, _ = predictor.predict_torch(
257
+ point_coords = None,
258
+ point_labels = None,
259
+ boxes = transformed_boxes.to(device),
260
+ multimask_output = False,
261
+ )
262
+
263
+ tot_detect = len(masks)
264
+ # draw output image
265
+ plt.figure(figsize=(10, 10))
266
+ plt.imshow(image)
267
+ for idx, (mask,label) in enumerate(zip(masks,pred_phrases)):
268
+ show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
269
+ np.save( os.path.join(output_dir, "maskSAM{}_{}.npy".format(idx, label)) ,mask[0].cpu().numpy())
270
+
271
+ for idx, (box, label) in enumerate(zip(boxes_filt, pred_phrases)):
272
+ label = label + "_{}".format(idx)
273
+ show_box(box.numpy(), plt.gca(), label)
274
+
275
+ rec_mask = np.zeros_like(mask[0].cpu().numpy()).astype(np.bool_)
276
+ for idx, box in enumerate(boxes_filt):
277
+ x0 = box[0].numpy().astype(np.int32)
278
+ x1 = box[2].numpy().astype(np.int32)
279
+ y0 = box[1].numpy().astype(np.int32)
280
+ y1 = box[3].numpy().astype(np.int32)
281
+ rec_mask[y0:y1, x0:x1] = True  # rows are the y range, columns are the x range
282
+
283
+ plt.axis('off')
284
+ plt.savefig(
285
+ os.path.join(output_dir, "seg_init_SAM.png"),
286
+ bbox_inches="tight", dpi=300, pad_inches=0.0
287
+ )
288
+
289
+ mask_detected = np.logical_or.reduce([mask[0].cpu().numpy() for mask in masks ])
290
+ mask_undetected = np.logical_not(mask_detected)
291
+ np.save( os.path.join(output_dir, "SAM_detected.npy") ,mask_detected)
292
+ np.save( os.path.join(output_dir, "maskSAM{}_rest.npy".format(len(masks))) ,mask_undetected)
293
+ plt.imsave( os.path.join(output_dir,"mask_SAM-detected.png"), np.repeat(np.expand_dims( mask_detected.astype(float), axis=2), 3, axis = 2))
294
+
utils.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+ from PIL import Image
3
+ import torch
4
+ import numpy as np
5
+ import PIL
6
+ import os
7
+ from tqdm.auto import tqdm
8
+ from diffusers.models.attention_processor import (
9
+ AttnProcessor2_0,
10
+ LoRAAttnProcessor2_0,
11
+ LoRAXFormersAttnProcessor,
12
+ XFormersAttnProcessor,
13
+ )
14
+
15
+ device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
16
+
17
+ def myroll2d(a, delta_x, delta_y):
18
+ h, w = a.shape[0], a.shape[1]
19
+ delta_x = -delta_x
20
+ delta_y = -delta_y
21
+ if isinstance(a, np.ndarray):
22
+ b = np.zeros ([h,w]).astype(np.uint8)
23
+ elif isinstance(a, torch.Tensor):
24
+ b = torch.zeros([h,w]).to(torch.uint8)
25
+ if delta_x > 0:
26
+ left_a = delta_x
27
+ right_a = w
28
+ left_b = 0
29
+ right_b = w - delta_x
30
+ else:
31
+ left_a = 0
32
+ right_a = w + delta_x
33
+ left_b = -delta_x
34
+ right_b = w
35
+ if delta_y > 0:
36
+ top_a = delta_y
37
+ bot_a = h
38
+ top_b = 0
39
+ bot_b = h-delta_y
40
+ else:
41
+ top_a = 0
42
+ bot_a = h + delta_y
43
+ top_b = -delta_y
44
+ bot_b = h
45
+ b[left_b: right_b, top_b: bot_b] = a[left_a: right_a, top_a: bot_a]
46
+ return b
47
+
48
+ def import_model_class_from_model_name_or_path(
49
+ pretrained_model_name_or_path: str, revision = None, subfolder: str = "text_encoder"
50
+ ):
51
+ text_encoder_config = PretrainedConfig.from_pretrained(
52
+ pretrained_model_name_or_path, subfolder=subfolder, revision=revision
53
+ )
54
+ model_class = text_encoder_config.architectures[0]
55
+
56
+ if model_class == "CLIPTextModel":
57
+ from transformers import CLIPTextModel
58
+ return CLIPTextModel
59
+ elif model_class == "CLIPTextModelWithProjection":
60
+ from transformers import CLIPTextModelWithProjection
61
+ return CLIPTextModelWithProjection
62
+ else:
63
+ raise ValueError(f"{model_class} is not supported.")
64
+
65
+ @torch.no_grad()
66
+ def image2latent(image, vae = None, dtype=None):
67
+ with torch.no_grad():
68
+ if isinstance(image, Image.Image):  # covers PNG/JPEG and other PIL image types
69
+ image = np.array(image)
70
+ if type(image) is torch.Tensor and image.dim() == 4:
71
+ latents = image
72
+ else:
73
+ image = torch.from_numpy(image).float() / 127.5 - 1
74
+ image = image.permute(2, 0, 1).unsqueeze(0).to(device, dtype= dtype)
75
+ latents = vae.encode(image).latent_dist.sample()
76
+ latents = latents * vae.config.scaling_factor
77
+ return latents
78
+
79
+ @torch.no_grad()
80
+ def latent2image(latents, return_type = 'np', vae = None):
81
+ # needs_upcasting = vae.dtype == torch.float16 and vae.config.force_upcast
82
+ needs_upcasting = True
83
+ if needs_upcasting:
84
+ upcast_vae(vae)
85
+ latents = latents.to(next(iter(vae.post_quant_conv.parameters())).dtype)
86
+ image = vae.decode(latents /vae.config.scaling_factor, return_dict=False)[0]
87
+
88
+ if return_type == 'np':
89
+ image = (image / 2 + 0.5).clamp(0, 1)
90
+ image = image.cpu().permute(0, 2, 3, 1).numpy()#[0]
91
+ image = (image * 255).astype(np.uint8)
92
+ if needs_upcasting:
93
+ vae.to(dtype=torch.float16)
94
+ return image
95
+
96
+ def upcast_vae(vae):
97
+ dtype = vae.dtype
98
+ vae.to(dtype=torch.float32)
99
+ use_torch_2_0_or_xformers = isinstance(
100
+ vae.decoder.mid_block.attentions[0].processor,
101
+ (
102
+ AttnProcessor2_0,
103
+ XFormersAttnProcessor,
104
+ LoRAXFormersAttnProcessor,
105
+ LoRAAttnProcessor2_0,
106
+ ),
107
+ )
108
+ # if xformers or torch_2_0 is used attention block does not need
109
+ # to be in float32 which can save lots of memory
110
+ if use_torch_2_0_or_xformers:
111
+ vae.post_quant_conv.to(dtype)
112
+ vae.decoder.conv_in.to(dtype)
113
+ vae.decoder.mid_block.to(dtype)
114
+
115
+ def prompt_to_emb_length_sdxl(prompt, tokenizer, text_encoder, length = None):
116
+ text_input = tokenizer(
117
+ [prompt],
118
+ padding="max_length",
119
+ max_length=length,
120
+ truncation=True,
121
+ return_tensors="pt",
122
+ )
123
+ prompt_embeds = text_encoder(text_input.input_ids.to(device),output_hidden_states=True)
124
+ pooled_prompt_embeds = prompt_embeds[0]
125
+
126
+ prompt_embeds = prompt_embeds.hidden_states[-2]
127
+ bs_embed, seq_len, _ = prompt_embeds.shape
128
+ prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
129
+ pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
130
+
131
+ return {"prompt_embeds": prompt_embeds, "pooled_prompt_embeds": pooled_prompt_embeds}
132
+
133
+
134
+
135
+
136
+ def prompt_to_emb_length_sd(prompt, tokenizer, text_encoder, length = None):
137
+ text_input = tokenizer(
138
+ [prompt],
139
+ padding="max_length",
140
+ max_length=length,
141
+ truncation=True,
142
+ return_tensors="pt",
143
+ )
144
+ emb = text_encoder(text_input.input_ids.to(device))[0]
145
+ return emb
146
+
147
+ def sdxl_prepare_input_decom(
148
+ set_string_list,
149
+ tokenizer,
150
+ tokenizer_2,
151
+ text_encoder_1,
152
+ text_encoder_2,
153
+ length = 20,
154
+ bsz = 1,
155
+ weight_dtype = torch.float32,
156
+ resolution = 1024,
157
+ normal_token_id_list = []
158
+ ):
159
+ encoder_hidden_states_list = []
160
+ pooled_prompt_embeds = 0
161
+
162
+ for m_idx in range(len(set_string_list)):
163
+ prompt_embeds_list = []
164
+ if ("#" in set_string_list[m_idx] or "$" in set_string_list[m_idx]) and m_idx not in normal_token_id_list : ###
165
+ out = prompt_to_emb_length_sdxl(
166
+ set_string_list[m_idx], tokenizer, text_encoder_1, length = length
167
+ )
168
+ else:
169
+ out = prompt_to_emb_length_sdxl(
170
+ set_string_list[m_idx], tokenizer, text_encoder_1, length = 77
171
+ )
172
+ print(m_idx, set_string_list[m_idx])
173
+ prompt_embeds, _ = out["prompt_embeds"].to(dtype=weight_dtype), out["pooled_prompt_embeds"].to(dtype=weight_dtype)
174
+ prompt_embeds = prompt_embeds.repeat(bsz, 1, 1)
175
+ prompt_embeds_list.append(prompt_embeds)
176
+ if ("#" in set_string_list[m_idx] or "$" in set_string_list[m_idx]) and m_idx not in normal_token_id_list:
177
+ out = prompt_to_emb_length_sdxl(
178
+ set_string_list[m_idx], tokenizer_2, text_encoder_2, length = length
179
+ )
180
+ else:
181
+ out = prompt_to_emb_length_sdxl(
182
+ set_string_list[m_idx], tokenizer_2, text_encoder_2, length = 77
183
+ )
184
+ print(m_idx, set_string_list[m_idx])
185
+
186
+ prompt_embeds = out["prompt_embeds"].to(dtype=weight_dtype)
187
+ pooled_prompt_embeds += out["pooled_prompt_embeds"].to(dtype=weight_dtype)
188
+ prompt_embeds = prompt_embeds.repeat(bsz, 1, 1)
189
+ prompt_embeds_list.append(prompt_embeds)
190
+
191
+ encoder_hidden_states_list.append(torch.concat(prompt_embeds_list, dim=-1))
192
+
193
+ add_text_embeds = pooled_prompt_embeds /len(set_string_list)
194
+ target_size, original_size,crops_coords_top_left = (resolution,resolution),(resolution,resolution),(0,0)
195
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
196
+
197
+ add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype,device = pooled_prompt_embeds.device) #[B,6]
198
+ return encoder_hidden_states_list, add_text_embeds, add_time_ids
199
+
200
+ def sd_prepare_input_decom(
201
+ set_string_list,
202
+ tokenizer,
203
+ text_encoder_1,
204
+ length = 20,
205
+ bsz = 1,
206
+ weight_dtype = torch.float32,
207
+ normal_token_id_list = []
208
+ ):
209
+ encoder_hidden_states_list = []
210
+ for m_idx in range(len(set_string_list)):
211
+ if ("#" in set_string_list[m_idx] or "$" in set_string_list[m_idx]) and m_idx not in normal_token_id_list : ###
212
+ encoder_hidden_states = prompt_to_emb_length_sd(
213
+ set_string_list[m_idx], tokenizer, text_encoder_1, length = length
214
+ )
215
+ else:
216
+ encoder_hidden_states = prompt_to_emb_length_sd(
217
+ set_string_list[m_idx], tokenizer, text_encoder_1, length = 77
218
+ )
219
+ print(m_idx, set_string_list[m_idx])
220
+ encoder_hidden_states = encoder_hidden_states.repeat(bsz, 1, 1)
221
+ encoder_hidden_states_list.append(encoder_hidden_states.to(dtype=weight_dtype))
222
+ return encoder_hidden_states_list
223
+
224
+
225
+ def load_mask (input_folder):
226
+ np_mask_dtype = 'uint8'
227
+ mask_np_list = []
228
+ mask_label_list = []
229
+ files = [
230
+ file_name for file_name in os.listdir(input_folder) \
231
+ if "mask" in file_name and ".npy" in file_name \
232
+ and "_" in file_name and "Edited" not in file_name
233
+ ]
234
+ files = sorted(files, key = lambda x: int(x.split("_")[0][4:]))
235
+
236
+ for idx, file_name in enumerate(files):
237
+ if "mask" in file_name and ".npy" in file_name and "_" in file_name \
238
+ and "Edited" not in file_name:
239
+ mask_np = np.load(os.path.join(input_folder, file_name)).astype(np_mask_dtype)
240
+ mask_np_list.append(mask_np)
241
+ mask_label = file_name.split("_")[1][:-4]
242
+ mask_label_list.append(mask_label)
243
+ mask_list = []
244
+ for mask_np in mask_np_list:
245
+ mask = torch.from_numpy(mask_np)
246
+ mask_list.append(mask)
247
+ try:
248
+ assert torch.all(sum(mask_list)==1)
249
+ except:
250
+ print("please check mask")
251
+ # plt.imsave( "out_mask.png", mask_list_edit[0])
252
+ import pdb; pdb.set_trace()
253
+ return mask_list, mask_label_list
254
+
255
+ def load_image(image_path, left=0, right=0, top=0, bottom=0, size = 512):
256
+ if type(image_path) is str:
257
+ image = np.array(Image.open(image_path))[:, :, :3]
258
+ else:
259
+ image = image_path
260
+ h, w, c = image.shape
261
+ left = min(left, w-1)
262
+ right = min(right, w - left - 1)
263
+ top = min(top, h - 1)
264
+ bottom = min(bottom, h - top - 1)
265
+ image = image[top:h-bottom, left:w-right]
266
+ h, w, c = image.shape
267
+ if h < w:
268
+ offset = (w - h) // 2
269
+ image = image[:, offset:offset + h]
270
+ elif w < h:
271
+ offset = (h - w) // 2
272
+ image = image[offset:offset + w]
273
+ image = np.array(Image.fromarray(image).resize((size, size)))
274
+ return image
275
+
276
+ def mask_union_torch(*masks):
277
+ masks = [m.to(torch.float) for m in masks]
278
+ res = sum(masks)>0
279
+ return res
280
+
281
+ def load_mask_edit(input_folder):
282
+ np_mask_dtype = 'uint8'
283
+ mask_np_list = []
284
+ mask_label_list = []
285
+
286
+ files = [file_name for file_name in os.listdir(input_folder) if "mask" in file_name and ".npy" in file_name and "_" in file_name and "Edited" in file_name and "-1" not in file_name]
287
+ files = sorted(files, key = lambda x: int(x.split("_")[0][10:]))
288
+
289
+ for idx, file_name in enumerate(files):
290
+ if "mask" in file_name and ".npy" in file_name and "_" in file_name and "Edited" in file_name and "-1" not in file_name:
291
+ mask_np = np.load(os.path.join(input_folder, file_name)).astype(np_mask_dtype)
292
+ mask_np_list.append(mask_np)
293
+ mask_label = file_name.split("_")[1][:-4]
294
+ # mask_label = mask_label.split("-")[0]
295
+ mask_label_list.append(mask_label)
296
+ mask_list = []
297
+ for mask_np in mask_np_list:
298
+ mask = torch.from_numpy(mask_np)
299
+ mask_list.append(mask)
300
+ try:
301
+ assert torch.all(sum(mask_list)==1)
302
+ except:
303
+ print("Make sure maskEdited is in the folder, if not, generate using the UI")
304
+ import pdb; pdb.set_trace()
305
+ return mask_list, mask_label_list
306
+
307
+ def save_images(images,filename, num_rows=1, offset_ratio=0.02):
308
+ if type(images) is list:
309
+ num_empty = len(images) % num_rows
310
+ elif images.ndim == 4:
311
+ num_empty = images.shape[0] % num_rows
312
+ else:
313
+ images = [images]
314
+ num_empty = 0
315
+
316
+ empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
317
+ images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
318
+ num_items = len(images)
319
+
320
+ folder = os.path.dirname(filename)
321
+ for i, image in enumerate(images):
322
+ pil_img = Image.fromarray(image)
323
+ name = filename.split("/")[-1]
324
+ name = name.split(".")[-2]+"_{}".format(i) +"."+filename.split(".")[-1]
325
+ pil_img.save(os.path.join(folder, name))
326
+ print("saved to ", os.path.join(folder, name))
utils_mask.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from matplotlib import cm
4
+ import matplotlib.patches as mpatches
5
+ import matplotlib.pyplot as plt
6
+ import torch
7
+ from utils import myroll2d
8
+
9
+ def create_outer_edge_mask_torch(mask, edge_thickness = 20):
10
+ mask_down = myroll2d(mask, edge_thickness, 0 )
11
+ mask_edge_down = (mask_down.to(torch.float) -mask.to(torch.float))>0
12
+
13
+ mask_up = myroll2d(mask, -edge_thickness, 0)
14
+ mask_edge_up = (mask_up.to(torch.float) -mask.to(torch.float))>0
15
+
16
+ mask_left = myroll2d(mask, 0, -edge_thickness)
17
+ mask_edge_left = (mask_left.to(torch.float) -mask.to(torch.float))>0
18
+
19
+ mask_right = myroll2d(mask, 0, edge_thickness)
20
+ mask_edge_right = (mask_right.to(torch.float) -mask.to(torch.float))>0
21
+
22
+ mask_ur = myroll2d(mask, -edge_thickness,edge_thickness)
23
+ mask_edge_ur = (mask_ur.to(torch.float) -mask.to(torch.float))>0
24
+
25
+ mask_ul = myroll2d(mask, -edge_thickness,-edge_thickness)
26
+ mask_edge_ul = (mask_ul.to(torch.float) -mask.to(torch.float))>0
27
+
28
+ mask_dr = myroll2d(mask, edge_thickness,edge_thickness )
29
+ mask_edge_dr = (mask_dr.to(torch.float) -mask.to(torch.float))>0
30
+
31
+ mask_dl = myroll2d(mask, edge_thickness,-edge_thickness)
32
+ mask_edge_dl = (mask_dl.to(torch.float) - mask.to(torch.float)) > 0
33
+
34
+ mask_edge = mask_union_torch(mask_edge_down, mask_edge_up, mask_edge_left, mask_edge_right,
35
+ mask_edge_ur, mask_edge_ul, mask_edge_dr, mask_edge_dl)
36
+ return mask_edge
37
+
38
+ def mask_substract_torch(mask1, mask2):
39
+ return ((mask1.cpu().to(torch.float)-mask2.cpu().to(torch.float))>0).to(torch.uint8)
40
+
41
+ def check_mask_overlap_torch(*masks):
42
+ assert torch.all(sum([m.float() for m in masks]) <= 1)
43
+
44
+ def check_mask_overlap_numpy(*masks):
45
+ assert np.all(sum([m.astype(float) for m in masks])<=1 )
46
+
47
+ def check_cover_all_torch (*masks):
48
+ assert torch.all(sum([m.cpu().float() for m in masks])==1)
49
+
50
+ def process_mask_to_follow_priority(mask_list, priority_list):
51
+ for idx1, (m1 , p1) in enumerate(zip(mask_list, priority_list)):
52
+ for idx2, (m2 , p2) in enumerate(zip(mask_list, priority_list)):
53
+ if p2 > p1:
54
+ mask_list[idx1] = ((m1.astype(float)-m2.astype(float))>0).astype(np.uint8)
55
+ return mask_list
56
+
57
+ def mask_union(*masks):
58
+ masks = [m.astype(float) for m in masks]
59
+ res = sum(masks)>0
60
+ return res.astype(np.uint8)
61
+
62
+ def mask_intersection(mask1, mask2):
63
+ mask_uni = mask_union(mask1, mask2)
64
+ mask_intersec = ((mask1.astype(float)-mask2.astype(float))==0) * mask_uni
65
+ return mask_intersec
66
+
67
+ def mask_union_torch(*masks):
68
+ masks = [m.float() for m in masks]
69
+ res = sum(masks)>0
70
+ return res.to(torch.uint8)
71
+
72
+ def mask_intersection_torch(mask1, mask2):
73
+ mask_uni = mask_union_torch(mask1, mask2)
74
+ mask_intersec = ((mask1.float()-mask2.float())==0) * mask_uni
75
+ return mask_intersec.cpu().to(torch.uint8)
76
+
77
+
78
+ def visualize_mask_list(mask_list, savepath):
79
+ mask = 0
80
+ for midx, m in enumerate(mask_list):
81
+ try:
82
+ mask += m.astype(float)* midx
83
+ except AttributeError:  # torch tensors use .float() instead of .astype
84
+ mask += m.float()*midx
85
+ viridis = cm.get_cmap('viridis', len(mask_list))
86
+ fig, ax = plt.subplots()
87
+ ax.imshow( mask)
88
+
89
+ handles = []
90
+ label_list = []
91
+ for idx , _ in enumerate(mask_list):
92
+ color = viridis(idx)
93
+ label = f"{idx}"
94
+ handles.append(mpatches.Patch(color=color, label=label))
95
+ label_list.append(label)
96
+ ax.legend(handles=handles)
97
+ plt.savefig(savepath)
98
+
99
+ def visualize_mask_list_clean(mask_list, savepath):
100
+ mask = 0
101
+ for midx, m in enumerate(mask_list):
102
+ try:
103
+ mask += m.astype(float)* midx
104
+ except AttributeError:  # torch tensors use .float() instead of .astype
105
+ mask += m.float()*midx
106
+ viridis = cm.get_cmap('viridis', len(mask_list))
107
+ fig, ax = plt.subplots()
108
+ ax.imshow( mask)
109
+
110
+ handles = []
111
+ label_list = []
112
+ for idx , _ in enumerate(mask_list):
113
+ color = viridis(idx)
114
+ label = f"{idx}"
115
+ handles.append(mpatches.Patch(color=color, label=label))
116
+ label_list.append(label)
117
+ # ax.legend(handles=handles)
118
+ plt.savefig(savepath, dpi=500)
119
+
120
+
121
+ def move_mask(mask_select, delta_x, delta_y):
122
+ mask_edit = myroll2d(mask_select, delta_y, delta_x)
123
+ return mask_edit
124
+
125
+ def stack_mask_with_priority(mask_list_np, priority_list, edit_idx_list):
126
+ mask_sel = mask_union(*[mask_list_np[eid] for eid in edit_idx_list])
127
+ for midx, mask in enumerate(mask_list_np):
128
+ if midx not in edit_idx_list:
129
+ if priority_list[edit_idx_list[0]] >= priority_list[midx]:
130
+ mask = mask.astype(float) - np.logical_and(mask.astype(bool) , mask_sel.astype(bool)).astype(float)
131
+ mask_list_np[midx] = mask.astype("uint8")
132
+ for midx in edit_idx_list:
133
+ for midx_1 in edit_idx_list:
134
+ if midx != midx_1:
135
+ if priority_list[midx] <= priority_list[midx_1]:
136
+ mask = mask_list_np[midx].astype(float) - np.logical_and(mask_list_np[midx].astype(bool), mask_list_np[midx_1].astype(bool)).astype(float)
137
+ mask_list_np[midx] = mask.astype("uint8")
138
+ return mask_list_np
139
+
140
+ def process_remain_mask(mask_list, edit_idx_list = None, force_mask_remain = None):
141
+ print("Processing the remaining mask with nearest-neighbor assignment")
142
+ width = mask_list[0].shape[0]
143
+ height = mask_list[0].shape[1]
144
+ pixel_ind = np.arange( width* height)
145
+
146
+ y_axis = np.arange(width)
147
+ ymesh = np.repeat(y_axis[:,np.newaxis], height, axis = 1) #N, N
148
+ ymesh_vec = ymesh.reshape(-1) #N *N
149
+
150
+ x_axis = np.arange(height)
151
+ xmesh = np.repeat(x_axis[np.newaxis, : ], width, axis = 0)
152
+ xmesh_vec = xmesh.reshape(-1)
153
+
154
+ mask_remain = (1 - sum([m.astype(float) for m in mask_list])).astype(np.uint8)
155
+ if force_mask_remain is not None:
156
+ mask_list[force_mask_remain] = (mask_list[force_mask_remain].astype(float) + mask_remain.astype(float)).astype(np.uint8)
157
+ else:
158
+ if edit_idx_list is not None:
159
+ a = [mask_list[eidx] for eidx in edit_idx_list]
160
+ mask_edit = mask_union(*a)
161
+ else:
162
+ mask_edit = np.zeros_like(mask_remain).astype(np.uint8)
163
+ mask_feasible = (1 - mask_remain.astype(float) - mask_edit.astype(float)).astype(np.uint8)
164
+
165
+ edge_width = 2
166
+
167
+ mask_feasible_down = myroll2d(mask_feasible, edge_width, 0)
168
+ mask_edge_down = (mask_feasible_down.astype(float) -mask_feasible.astype(float))<0
169
+
170
+ mask_feasible_up = myroll2d(mask_feasible, -edge_width, 0)
171
+ mask_edge_up = (mask_feasible_up.astype(float) -mask_feasible.astype(float))<0
172
+
173
+ mask_feasible_left = myroll2d(mask_feasible, 0, -edge_width)
174
+ mask_edge_left = (mask_feasible_left.astype(float) -mask_feasible.astype(float))<0
175
+
176
+ mask_feasible_right = myroll2d(mask_feasible, 0, edge_width)
177
+ mask_edge_right = (mask_feasible_right.astype(float) -mask_feasible.astype(float))<0
178
+
179
+ mask_feasible_ur = myroll2d(mask_feasible, -edge_width,edge_width)
180
+ mask_edge_ur = (mask_feasible_ur.astype(float) -mask_feasible.astype(float))<0
181
+
182
+ mask_feasible_ul = myroll2d(mask_feasible, -edge_width,-edge_width )
183
+ mask_edge_ul = (mask_feasible_ul.astype(float) -mask_feasible.astype(float))<0
184
+
185
+ mask_feasible_dr = myroll2d(mask_feasible, edge_width,edge_width )
186
+ mask_edge_dr = (mask_feasible_dr.astype(float) -mask_feasible.astype(float))<0
187
+
188
+ mask_feasible_dl = myroll2d(mask_feasible, edge_width,-edge_width)
189
+ mask_edge_dl = (mask_feasible_dl.astype(float) - mask_feasible.astype(float)) < 0
190
+
191
+ mask_edge = mask_union(
192
+ mask_edge_down, mask_edge_up, mask_edge_left, mask_edge_right, mask_edge_ur, mask_edge_ul, mask_edge_dr, mask_edge_dl
193
+ )
194
+
195
+ mask_feasible_edge = mask_intersection(mask_edge, mask_feasible)
196
+
197
+ vec_mask_feasible_edge = mask_feasible_edge.reshape(-1)
198
+ vec_mask_remain = mask_remain.reshape(-1)
199
+
200
+ indvec_all = np.arange(width*height)
201
+ vec_region_partition= 0
202
+ for mask_idx, mask in enumerate(mask_list):
203
+ vec_region_partition += mask.reshape(-1) * mask_idx
204
+ vec_region_partition += mask_remain.reshape(-1) * mask_idx
205
+ # assert 0 in vec_region_partition
206
+
207
+ vec_ind_remain = np.nonzero(vec_mask_remain)[0]
208
+ vec_ind_feasible_edge = np.nonzero(vec_mask_feasible_edge)[0]
209
+
210
+ vec_x_remain = xmesh_vec[vec_ind_remain]
211
+ vec_y_remain = ymesh_vec[vec_ind_remain]
212
+
213
+ vec_x_feasible_edge = xmesh_vec[vec_ind_feasible_edge]
214
+ vec_y_feasible_edge = ymesh_vec[vec_ind_feasible_edge]
215
+
216
+ x_dis = vec_x_remain[:,np.newaxis] - vec_x_feasible_edge[np.newaxis,:]
217
+ y_dis = vec_y_remain[:,np.newaxis] - vec_y_feasible_edge[np.newaxis,:]
218
+ dis = x_dis **2 + y_dis **2
219
+ pos = np.argmin(dis, axis = 1)
220
+ nearest_point = vec_ind_feasible_edge[pos] # closest point to target point
221
+
222
+ nearest_region = vec_region_partition[nearest_point]
223
+ nearest_region_set = set(nearest_region)
224
+ if edit_idx_list is not None:
225
+ for edit_idx in edit_idx_list:
226
+ assert edit_idx not in nearest_region
227
+
228
+ for midx, m in enumerate(mask_list):
229
+ if midx in nearest_region_set:
230
+ vec_newmask = np.zeros_like(indvec_all)
231
+ add_ind = vec_ind_remain [np.argwhere(nearest_region==midx)]
232
+ vec_newmask[add_ind] = 1
233
+
234
+ mask_list[midx] = mask_list[midx].astype(float)+ vec_newmask.reshape( mask_list[midx].shape).astype(float)
235
+ mask_list[midx] = mask_list[midx] > 0
236
+
237
+ print("Finished processing the remaining mask; to edit it further, launch the UI")
238
+ return mask_list, mask_remain
239
+
240
+ def resize_mask(mask_np, resize_ratio = 1):
241
+ w, h = mask_np.shape[0], mask_np.shape[1]
242
+ resized_w, resized_h = int(w*resize_ratio),int(h*resize_ratio)
243
+ mask_resized = torch.nn.functional.interpolate(torch.from_numpy(mask_np).unsqueeze(0).unsqueeze(0), (resized_w, resized_h)).squeeze()
244
+
245
+ mask = torch.zeros(w, h)
246
+ if w > resized_w:
247
+ mask[:resized_w, :resized_h] = mask_resized
248
+ else:
249
+ assert h <= resized_h
250
+ mask = mask_resized[resized_w//2-w//2: resized_w//2-w//2+w, resized_h//2-h//2: resized_h//2-h//2+h]
251
+ return mask.cpu().numpy().astype(np.uint8)
252
+
253
+ def process_mask_move_torch(
254
+ mask_list,
255
+ move_index_list,
256
+ delta_x_list = None,
257
+ delta_y_list = None,
258
+ edit_priority_list = None,
259
+ force_mask_remain = None,
260
+ resize_list = None
261
+ ):
262
+ mask_list_np = [m.cpu().numpy() for m in mask_list]
263
+ priority_list = [0 for _ in range(len(mask_list_np))]
264
+ for idx, (move_index, delta_x, delta_y, priority) in enumerate(zip(move_index_list, delta_x_list, delta_y_list, edit_priority_list)):
265
+ priority_list[move_index] = priority
266
+ if resize_list is not None:
267
+ mask = resize_mask (mask_list_np[move_index], resize_list[idx])
268
+ else:
269
+ mask = mask_list_np[move_index]
270
+ mask_list_np[move_index] = move_mask(mask, delta_x = delta_x, delta_y = delta_y)
271
+ mask_list_np = stack_mask_with_priority(mask_list_np, priority_list, move_index_list)  # uncovered pixels may remain; filled in below
272
+ check_mask_overlap_numpy(*mask_list_np)
273
+ mask_list_np, mask_remain = process_remain_mask(mask_list_np, move_index_list,force_mask_remain)
274
+ mask_list = [torch.from_numpy(m).to( dtype=torch.uint8) for m in mask_list_np]
275
+ mask_remain = torch.from_numpy(mask_remain).to(dtype=torch.uint8)
276
+ return mask_list, mask_remain
277
+
278
+ def process_mask_remove_torch(mask_list, remove_idx):
279
+ mask_list_np = [m.cpu().numpy() for m in mask_list]
280
+ mask_list_np[remove_idx] = np.zeros_like(mask_list_np[0])
281
+ mask_list_np, mask_remain = process_remain_mask(mask_list_np)
282
+ mask_list = [torch.from_numpy(m).to(dtype=torch.uint8) for m in mask_list_np]
283
+ mask_remain = torch.from_numpy(mask_remain).to(dtype=torch.uint8)
284
+ return mask_list, mask_remain
285
+
286
+ def get_mask_difference_torch(mask_list1, mask_list2):
287
+ assert len(mask_list1) == len(mask_list2)
288
+ mask_diff = torch.zeros_like(mask_list1[0])
289
+ for mask1 , mask2 in zip(mask_list1, mask_list2):
290
+ diff = ((mask1.float() - mask2.float())!=0).to(torch.uint8)
291
+ mask_diff = mask_union_torch(mask_diff, diff)
292
+ return mask_diff
293
+
294
+ def save_mask_list_to_npys(folder, mask_list, mask_label_list, name = "mask"):
295
+ for midx, (mask, mask_label) in enumerate(zip(mask_list, mask_label_list)):
296
+ np.save(os.path.join(folder, "{}{}_{}.npy".format(name, midx, mask_label)), mask)
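A minimal usage sketch for the utils_mask.py helpers above, assuming that myroll2d (imported from utils) shifts a 2D mask down/right by the given row/column offsets and that all masks are same-shape binary uint8 tensors partitioning the canvas; the toy canvas size, offsets, and output filename are hypothetical and not part of the commit.

import torch
from utils_mask import (
    get_mask_difference_torch,
    process_mask_move_torch,
    visualize_mask_list,
)

# Toy segmentation of a 64x64 canvas: mask 0 is a square object, mask 1 is
# the background; together they cover every pixel exactly once.
h = w = 64
obj = torch.zeros(h, w, dtype=torch.uint8)
obj[10:30, 10:30] = 1
mask_list = [obj, 1 - obj]

# Move mask 0 by (delta_x=5, delta_y=3) with priority 1; the pixels it vacates
# are reassigned to the nearest remaining region (here, the background).
moved_list, mask_remain = process_mask_move_torch(
    [m.clone() for m in mask_list],  # clone so the originals stay untouched
    move_index_list=[0],
    delta_x_list=[5],
    delta_y_list=[3],
    edit_priority_list=[1],
)

# Report which pixels changed label and save a quick visualization.
diff = get_mask_difference_torch(mask_list, moved_list)
print("changed pixels:", int(diff.sum()))
visualize_mask_list([m.numpy() for m in moved_list], "moved_masks.png")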