Commit 5f58b04
Parent(s): 7589ff6
init

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitattributes +16 -0
- .gitignore +7 -0
- Dockerfile +56 -0
- README.md +4 -4
- app.py +80 -0
- app_3d.py +21 -0
- app_canny.py +83 -0
- app_matnet.py +83 -0
- app_sd.py +154 -0
- app_texnet.py +259 -0
- cv_utils.py +17 -0
- depth_estimator.py +25 -0
- environment.yml +7 -0
- examples/bunny/frame_0001.png +3 -0
- examples/bunny/mesh.obj +3 -0
- examples/bunny/uv_normal.png +3 -0
- examples/fighter/frame_0001.png +3 -0
- examples/fighter/mesh.obj +3 -0
- examples/fighter/uv_normal.png +3 -0
- examples/highheel/frame_0001.png +3 -0
- examples/highheel/mesh.obj +3 -0
- examples/highheel/uv_normal.png +3 -0
- examples/monkey/frame_0001.png +3 -0
- examples/monkey/mesh.obj +3 -0
- examples/monkey/uv_normal.png +3 -0
- examples/tank/frame_0001.png +3 -0
- examples/tank/mesh.obj +3 -0
- examples/tank/uv_normal.png +3 -0
- examples/tshirt/frame_0001.png +3 -0
- examples/tshirt/mesh.obj +3 -0
- examples/tshirt/uv_normal.png +3 -0
- image_segmentor.py +33 -0
- install.sh +18 -0
- model.py +959 -0
- preprocessor.py +120 -0
- push_dataset.py +9 -0
- rgb2x/generate_blend.py +142 -0
- rgb2x/gradio_demo_rgb2x.py +157 -0
- rgb2x/load_image.py +119 -0
- rgb2x/pipeline_rgb2x.py +821 -0
- run.sh +14 -0
- settings.py +23 -0
- text2tex/lib/__init__.py +0 -0
- text2tex/lib/camera_helper.py +231 -0
- text2tex/lib/constants.py +648 -0
- text2tex/lib/diffusion_helper.py +189 -0
- text2tex/lib/io_helper.py +78 -0
- text2tex/lib/mesh_helper.py +148 -0
- text2tex/lib/projection_helper.py +464 -0
- text2tex/lib/render_helper.py +108 -0
.gitattributes
CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+uv_normal.png filter=lfs diff=lfs merge=lfs -text
+mesh.obj filter=lfs diff=lfs merge=lfs -text
+frame_0001.png filter=lfs diff=lfs merge=lfs -text
+examples/bunny/uv_normal.png filter=lfs diff=lfs merge=lfs -text
+examples/fighter/frame_0001.png filter=lfs diff=lfs merge=lfs -text
+examples/fighter/uv_normal.png filter=lfs diff=lfs merge=lfs -text
+examples/highheel/frame_0001.png filter=lfs diff=lfs merge=lfs -text
+examples/highheel/uv_normal.png filter=lfs diff=lfs merge=lfs -text
+examples/monkey/frame_0001.png filter=lfs diff=lfs merge=lfs -text
+examples/monkey/uv_normal.png filter=lfs diff=lfs merge=lfs -text
+examples/tank/frame_0001.png filter=lfs diff=lfs merge=lfs -text
+examples/tank/uv_normal.png filter=lfs diff=lfs merge=lfs -text
+examples/tshirt/frame_0001.png filter=lfs diff=lfs merge=lfs -text
+examples/tshirt/mesh.obj filter=lfs diff=lfs merge=lfs -text
+examples/tshirt/uv_normal.png filter=lfs diff=lfs merge=lfs -text
+examples/bunny/frame_0001.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,7 @@
__pycache__
data
# examples
.gradio
model_cache
output
test.png
Dockerfile
ADDED
@@ -0,0 +1,56 @@
FROM continuumio/anaconda3:main

# make sure cv2 can be loaded
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y

WORKDIR /code
COPY ./environment.yml /code/environment.yml

# Create the environment using the environment.yml file
RUN conda env create -f /code/environment.yml

# install pip packages to the gradio environment
# when adjusting the Dockerfile on huggingface:
# - if the Dockerfile builds successfully, a new Space needs to be initialized and the changes pushed accordingly
# - otherwise, you can keep committing to the failed-build Dockerfile for debugging
RUN conda run -n gradio pip install --upgrade pip
# RUN conda run -n gradio pip install diffusers["torch"] transformers accelerate xformers
# RUN conda run -n gradio pip install gradio
# RUN conda run -n gradio pip install controlnet-aux
# RUN conda install -n gradio pytorch3d -c pytorch3d -c conda-forge
# RUN conda install -n gradio -c conda-forge open-clip-torch pytorch-lightning
# RUN conda run -n gradio pip install trimesh xatlas scikit-learn opencv-python omegaconf

# Set the environment variable to use the gradio environment by default
# RUN echo "source activate gradio" > ~/.bashrc
# ENV PATH /opt/conda/envs/gradio/bin:$PATH

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
RUN conda create -n gradio-user python=3.11
RUN conda run -n gradio-user pip install --upgrade pip
# RUN conda install -n gradio-user pytorch3d=0.7.7 -c pytorch3d -c conda-forge
# RUN conda install -n gradio-user -c conda-forge open-clip-torch pytorch-lightning
RUN conda run -n gradio-user pip install diffusers transformers accelerate xformers controlnet-aux gradio spaces trimesh xatlas scikit-learn opencv-python matplotlib omegaconf

# Set home to the user's home directory
ENV HOME=/home/user \
    PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces
# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app, setting the owner to the user
COPY --chown=user . $HOME/app

# download https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_depth.pth?download=true to $HOME/app/text2tex/models/ControlNet/models/control_sd15_depth.pth
RUN mkdir -p $HOME/app/text2tex/models/ControlNet/models && \
    wget -O $HOME/app/text2tex/models/ControlNet/models/control_sd15_depth.pth https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_depth.pth?download=true

CMD ["./run.sh"]
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Docker
-emoji:
-colorFrom:
-colorTo:
+title: Docker Test6
+emoji: 👀
+colorFrom: pink
+colorTo: pink
 sdk: docker
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,80 @@
#!/usr/bin/env python

import gradio as gr
import torch

import sys
pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
version_str="".join([
    f"py3{sys.version_info.minor}_cu",
    torch.version.cuda.replace(".",""),
    f"_pyt{pyt_version_str}"
])
print(f"Using version: {version_str}") # used to locate pytorch3d version in the requirements.txt for huggingface


from app_canny import create_demo as create_demo_canny
from app_texnet import create_demo as create_demo_texnet

from model import Model
from settings import ALLOW_CHANGING_BASE_MODEL, DEFAULT_MODEL_ID, SHOW_DUPLICATE_BUTTON

DESCRIPTION = "# Material Authoring Demo v0.3"

if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p> Check if the 'CUDA_VISIBLE_DEVICES' are set incorrectly in settings.py"

# model = Model(base_model_id=DEFAULT_MODEL_ID, task_name="Canny")
model = Model(base_model_id=DEFAULT_MODEL_ID, task_name="texnet")

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=SHOW_DUPLICATE_BUTTON,
    )

    with gr.Tabs():
        with gr.Tab("Texnet+Matnet"):
            create_demo_texnet(model.process_texnet)

    with gr.Accordion(label="Base model", open=False):
        with gr.Row():
            with gr.Column(scale=5):
                current_base_model = gr.Text(label="Current base model")
            with gr.Column(scale=1):
                check_base_model_button = gr.Button("Check current base model")
        with gr.Row():
            with gr.Column(scale=5):
                new_base_model_id = gr.Text(
                    label="New base model",
                    max_lines=1,
                    placeholder="stable-diffusion-v1-5/stable-diffusion-v1-5",
                    info="The base model must be compatible with Stable Diffusion v1.5.",
                    interactive=ALLOW_CHANGING_BASE_MODEL,
                )
            with gr.Column(scale=1):
                change_base_model_button = gr.Button("Change base model", interactive=ALLOW_CHANGING_BASE_MODEL)
        if not ALLOW_CHANGING_BASE_MODEL:
            gr.Markdown(
                """The base model is not allowed to be changed in this Space so as not to slow down the demo, but it can be changed if you duplicate the Space."""
            )

    check_base_model_button.click(
        fn=lambda: model.base_model_id,
        outputs=current_base_model,
        queue=False,
        api_name="check_base_model",
    )
    gr.on(
        triggers=[new_base_model_id.submit, change_base_model_button.click],
        fn=model.set_base_model,
        inputs=new_base_model_id,
        outputs=current_base_model,
        api_name=False,
        concurrency_id="main",
    )

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
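For context (not a file in this commit): the version_str printed at the top of app.py mirrors the naming scheme of the prebuilt pytorch3d wheels. A minimal sketch of how such a tag is typically turned into an install command, assuming the public wheel index layout at dl.fbaipublicfiles.com (an assumption, not something this repo pins):

import sys
import torch

# Rebuild the same tag app.py prints, e.g. "py311_cu121_pyt221".
pyt_version_str = torch.__version__.split("+")[0].replace(".", "")
version_str = "".join([
    f"py3{sys.version_info.minor}_cu",
    torch.version.cuda.replace(".", ""),
    f"_pyt{pyt_version_str}",
])

# Hypothetical install line built from the tag; adjust if the index layout differs.
print(
    "pip install --no-index pytorch3d "
    f"-f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html"
)
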
app_3d.py
ADDED
@@ -0,0 +1,21 @@
import gradio as gr
import os

def load_mesh(mesh_file_name):
    return mesh_file_name

demo = gr.Interface(
    fn=load_mesh,
    inputs=gr.Model3D(),
    outputs=gr.Model3D(
        clear_color=(255.0, 0.0, 0.0, 0.0), label="3D Model", display_mode="wireframe"),
    examples=[
        [os.path.join(os.path.dirname(__file__), "examples/bunny/mesh.obj")],
        [os.path.join(os.path.dirname(__file__), "examples/monkey/mesh.obj")],
        [os.path.join(os.path.dirname(__file__), "examples/Bunny.obj")],
    ],
    cache_examples=True
)

if __name__ == "__main__":
    demo.launch()
app_canny.py
ADDED
@@ -0,0 +1,83 @@
#!/usr/bin/env python

import gradio as gr

from settings import (
    DEFAULT_IMAGE_RESOLUTION,
    DEFAULT_NUM_IMAGES,
    MAX_IMAGE_RESOLUTION,
    MAX_NUM_IMAGES,
    MAX_SEED,
)
from utils import randomize_seed_fn


def create_demo(process):
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                image = gr.Image()
                prompt = gr.Textbox(label="Prompt", submit_btn=True)
                with gr.Accordion("Advanced options", open=False):
                    num_samples = gr.Slider(
                        label="Number of images", minimum=1, maximum=MAX_NUM_IMAGES, value=DEFAULT_NUM_IMAGES, step=1
                    )
                    image_resolution = gr.Slider(
                        label="Image resolution",
                        minimum=256,
                        maximum=MAX_IMAGE_RESOLUTION,
                        value=DEFAULT_IMAGE_RESOLUTION,
                        step=256,
                    )
                    canny_low_threshold = gr.Slider(
                        label="Canny low threshold", minimum=1, maximum=255, value=100, step=1
                    )
                    canny_high_threshold = gr.Slider(
                        label="Canny high threshold", minimum=1, maximum=255, value=200, step=1
                    )
                    num_steps = gr.Slider(label="Number of steps", minimum=1, maximum=100, value=20, step=1)
                    guidance_scale = gr.Slider(label="Guidance scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
                    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    a_prompt = gr.Textbox(label="Additional prompt", value="best quality, extremely detailed")
                    n_prompt = gr.Textbox(
                        label="Negative prompt",
                        value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
                    )
            with gr.Column():
                result = gr.Gallery(label="Output", show_label=False, columns=2, object_fit="scale-down")
        inputs = [
            image,
            prompt,
            a_prompt,
            n_prompt,
            num_samples,
            image_resolution,
            num_steps,
            guidance_scale,
            seed,
            canny_low_threshold,
            canny_high_threshold,
        ]
        prompt.submit(
            fn=randomize_seed_fn,
            inputs=[seed, randomize_seed],
            outputs=seed,
            queue=False,
            api_name=False,
        ).then(
            fn=process,
            inputs=inputs,
            outputs=result,
            api_name="canny",
            concurrency_id="main",
        )
    return demo


if __name__ == "__main__":
    from model import Model

    model = Model(task_name="Canny")
    demo = create_demo(model.process_canny)
    demo.queue().launch()
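utils.py (and its randomize_seed_fn) sits outside this 50-file view. A plausible minimal implementation, assuming it only draws a fresh seed when the checkbox is ticked (illustrative, not the repo's actual code):

import random

MAX_SEED = 2**31 - 1  # assumption; the real bound lives in settings.py as MAX_SEED


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    # Keep the slider value unless the user asked for a random seed.
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed
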
app_matnet.py
ADDED
@@ -0,0 +1,83 @@
#!/usr/bin/env python

import gradio as gr

from settings import (
    DEFAULT_IMAGE_RESOLUTION,
    DEFAULT_NUM_IMAGES,
    MAX_IMAGE_RESOLUTION,
    MAX_NUM_IMAGES,
    MAX_SEED,
)
from utils import randomize_seed_fn


def create_demo(process):
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                image = gr.Image()
                prompt = gr.Textbox(label="Prompt", submit_btn=True)
                with gr.Accordion("Advanced options", open=False):
                    num_samples = gr.Slider(
                        label="Number of images", minimum=1, maximum=MAX_NUM_IMAGES, value=DEFAULT_NUM_IMAGES, step=1
                    )
                    image_resolution = gr.Slider(
                        label="Image resolution",
                        minimum=256,
                        maximum=MAX_IMAGE_RESOLUTION,
                        value=DEFAULT_IMAGE_RESOLUTION,
                        step=256,
                    )
                    canny_low_threshold = gr.Slider(
                        label="Canny low threshold", minimum=1, maximum=255, value=100, step=1
                    )
                    canny_high_threshold = gr.Slider(
                        label="Canny high threshold", minimum=1, maximum=255, value=200, step=1
                    )
                    num_steps = gr.Slider(label="Number of steps", minimum=1, maximum=100, value=20, step=1)
                    guidance_scale = gr.Slider(label="Guidance scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
                    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    a_prompt = gr.Textbox(label="Additional prompt", value="best quality, extremely detailed")
                    n_prompt = gr.Textbox(
                        label="Negative prompt",
                        value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
                    )
            with gr.Column():
                result = gr.Gallery(label="Output", show_label=False, columns=2, object_fit="scale-down")
        inputs = [
            image,
            prompt,
            a_prompt,
            n_prompt,
            num_samples,
            image_resolution,
            num_steps,
            guidance_scale,
            seed,
            canny_low_threshold,
            canny_high_threshold,
        ]
        prompt.submit(
            fn=randomize_seed_fn,
            inputs=[seed, randomize_seed],
            outputs=seed,
            queue=False,
            api_name=False,
        ).then(
            fn=process,
            inputs=inputs,
            outputs=result,
            api_name="canny",
            concurrency_id="main",
        )
    return demo


if __name__ == "__main__":
    from model import Model

    model = Model(task_name="Canny")
    demo = create_demo(model.process_canny)
    demo.queue().launch()
app_sd.py
ADDED
@@ -0,0 +1,154 @@
import gradio as gr
import numpy as np
import random

import spaces #[uncomment to use ZeroGPU]
from diffusers import DiffusionPipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use

if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024


@spaces.GPU #[uncomment to use ZeroGPU]
def infer(
    prompt,
    negative_prompt,
    seed,
    randomize_seed,
    width,
    height,
    guidance_scale,
    num_inference_steps,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator().manual_seed(seed)

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        generator=generator,
    ).images[0]

    return image, seed


examples = [
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    "An astronaut riding a green horse",
    "A delicious ceviche cheesecake slice",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(" # Text-to-Image Gradio Template")

        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )

            run_button = gr.Button("Run", scale=0, variant="primary")

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
                visible=False,
            )

            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            with gr.Row():
                width = gr.Slider(
                    label="Width",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=1024,  # Replace with defaults that work for your model
                )

                height = gr.Slider(
                    label="Height",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=1024,  # Replace with defaults that work for your model
                )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.0,
                    maximum=10.0,
                    step=0.1,
                    value=0.0,  # Replace with defaults that work for your model
                )

                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=2,  # Replace with defaults that work for your model
                )

        gr.Examples(examples=examples, inputs=[prompt])
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            negative_prompt,
            seed,
            randomize_seed,
            width,
            height,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, seed],
    )

if __name__ == "__main__":
    demo.launch()
app_texnet.py
ADDED
@@ -0,0 +1,259 @@
#!/usr/bin/env python

import os
import shutil
import tempfile
import gradio as gr
from PIL import Image
import numpy as np

from settings import (
    DEFAULT_IMAGE_RESOLUTION,
    DEFAULT_NUM_IMAGES,
    MAX_IMAGE_RESOLUTION,
    MAX_NUM_IMAGES,
    MAX_SEED,
)
from utils import randomize_seed_fn

# ---- helper to build a quick textured copy of the mesh ---------------
def apply_texture(src_mesh:str, texture:str, tag:str)->str:
    """
    Writes a copy of `src_mesh` and a tiny .mtl that points to `texture`.
    Returns the new OBJ/GLB path for viewing.
    """
    tmp_dir = tempfile.mkdtemp()
    mesh_copy = os.path.join(tmp_dir, f"{tag}.obj")
    mtl_name = f"{tag}.mtl"

    # copy geometry
    shutil.copy(src_mesh, mesh_copy)

    # write minimal MTL
    with open(os.path.join(tmp_dir, mtl_name), "w") as f:
        f.write(f"newmtl material_0\nmap_Kd {os.path.basename(texture)}\n")

    # ensure texture lives next to OBJ
    shutil.copy(texture, os.path.join(tmp_dir, os.path.basename(texture)))

    # patch OBJ to reference our new MTL
    with open(mesh_copy, "r+") as f:
        lines = f.readlines()
        if not lines[0].startswith("mtllib"):
            lines.insert(0, f"mtllib {mtl_name}\n")
        f.seek(0); f.writelines(lines)

    return mesh_copy

def image_to_temp_path(img_like, tag, out_dir=None):
    """
    Convert various image-like objects (str, PIL.Image, list, tuple) to a temp PNG path.
    Returns the path to the saved image file.
    """
    # Handle tuple or list input
    if isinstance(img_like, (list, tuple)):
        if len(img_like) == 0:
            raise ValueError("Empty image list/tuple.")
        img_like = img_like[0]

    # If it's already a file path
    if isinstance(img_like, str):
        return img_like

    # If it's a PIL Image
    if isinstance(img_like, Image.Image):
        temp_path = os.path.join(tempfile.mkdtemp() if out_dir is None else out_dir, f"{tag}.png")
        os.makedirs(os.path.dirname(temp_path), exist_ok=True)
        img_like.save(temp_path)
        return temp_path

    # if it's a numpy array
    if isinstance(img_like, np.ndarray):
        temp_path = os.path.join(tempfile.mkdtemp() if out_dir is None else out_dir, f"{tag}.png")
        os.makedirs(os.path.dirname(temp_path), exist_ok=True)
        img_like = Image.fromarray(img_like)
        img_like.save(temp_path)
        return temp_path

    raise ValueError(f"Expected PIL.Image, str, list, or tuple — got {type(img_like)}")

def show_mesh(which, mesh, inp, coarse, fine):
    """Switch the displayed texture based on dropdown change."""
    print()
    tex_map = {
        "Input": image_to_temp_path(inp, "input"),
        "Coarse": coarse[0] if isinstance(coarse, tuple) else coarse,
        "Fine": fine[0] if isinstance(fine, tuple) else fine,
    }
    texture_path = tex_map[which]
    return apply_texture(mesh, texture_path, which.lower())
# ----------------------------------------------------------------------


def create_demo(process):
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Select a preset from the example list, and modify the prompt accordingly")
                with gr.Row():
                    name = gr.Textbox(label="Name", interactive=False, visible=False)
                    representative = gr.Image(label="Geometry", interactive=False)
                    image = gr.Image(label="UV Normal", interactive=False)
                prompt = gr.Textbox(label="Prompt", submit_btn=True)
                with gr.Accordion("Advanced options", open=False):
                    num_samples = gr.Slider(
                        label="Number of images", minimum=1, maximum=MAX_NUM_IMAGES, value=DEFAULT_NUM_IMAGES, step=1
                    )
                    image_resolution = gr.Slider(
                        label="Image resolution",
                        minimum=256,
                        maximum=MAX_IMAGE_RESOLUTION,
                        value=DEFAULT_IMAGE_RESOLUTION,
                        step=256,
                    )
                    num_steps = gr.Slider(label="Number of steps", minimum=1, maximum=100, value=10, step=1)
                    guidance_scale = gr.Slider(label="Guidance scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
                    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    a_prompt = gr.Textbox(label="Additional prompt", value="best quality, extremely detailed")
                    n_prompt = gr.Textbox(
                        label="Negative prompt",
                        value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
                    )
            with gr.Column():
                # 2x2 grid of images for the output textures
                gr.Markdown("### Output BRDF")
                with gr.Row():
                    base_color = gr.Gallery(label="Base Color", show_label=True, columns=1, object_fit="scale-down")
                    normal = gr.Gallery(label="Displacement Map", show_label=True, columns=1, object_fit="scale-down")
                with gr.Row():
                    roughness = gr.Gallery(label="Roughness Map", show_label=True, columns=1, object_fit="scale-down")
                    metallic = gr.Gallery(label="Metallic Map", show_label=True, columns=1, object_fit="scale-down")

                gr.Markdown("### Download Packed Blender Files for 3D Visualization")
                out_blender_path = gr.File(label="Generated Blender File", file_types=[".blend"])

        inputs = [
            name,  # Name of the object
            representative,  # Geometry mesh
            image,
            prompt,
            a_prompt,
            n_prompt,
            num_samples,
            image_resolution,
            num_steps,
            guidance_scale,
            seed,
        ]

        # first call → run diffusion / texture network
        prompt.submit(
            fn=randomize_seed_fn,
            inputs=[seed, randomize_seed],
            outputs=seed,
            queue=False,
            api_name=False,
        ).then(
            fn=process,
            inputs=inputs,
            outputs=[base_color, normal, roughness, metallic, out_blender_path],
            api_name="canny",
            concurrency_id="main",
        )

        gr.Examples(
            fn=process,
            inputs=inputs,
            outputs=[base_color, normal, roughness, metallic],
            examples=[
                [
                    "bunny",
                    "examples/bunny/frame_0001.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/bunny/uv_normal/fused.png
                    "examples/bunny/uv_normal.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/bunny/uv_normal/fused.png
                    "feather",
                    a_prompt.value,
                    n_prompt.value,
                    num_samples.value,
                    image_resolution.value,
                    num_steps.value,
                    guidance_scale.value,
                    seed.value,
                ],
                [
                    "monkey",
                    "examples/monkey/frame_0001.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "examples/monkey/uv_normal.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "wood",
                    a_prompt.value,
                    n_prompt.value,
                    num_samples.value,
                    image_resolution.value,
                    num_steps.value,
                    guidance_scale.value,
                    seed.value,
                ],
                [
                    "tshirt",
                    "examples/tshirt/frame_0001.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "examples/tshirt/uv_normal.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "wood",
                    a_prompt.value,
                    n_prompt.value,
                    num_samples.value,
                    image_resolution.value,
                    num_steps.value,
                    guidance_scale.value,
                    seed.value,
                ],
                # [
                #     "highheel",
                #     "examples/highheel/frame_0001.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                #     "examples/highheel/uv_normal.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                #     "wood",
                #     a_prompt.value,
                #     n_prompt.value,
                #     num_samples.value,
                #     image_resolution.value,
                #     num_steps.value,
                #     guidance_scale.value,
                #     seed.value,
                # ],
                [
                    "tank",
                    "examples/tank/frame_0001.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "examples/tank/uv_normal.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "wood",
                    a_prompt.value,
                    n_prompt.value,
                    num_samples.value,
                    image_resolution.value,
                    num_steps.value,
                    guidance_scale.value,
                    seed.value,
                ],
                [
                    "fighter",
                    "examples/fighter/frame_0001.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "examples/fighter/uv_normal.png",  # /dgxusers/Users/jyang/project/ObjectReal/data/control/preprocess/monkey/uv_normal/fused.png
                    "wood",
                    a_prompt.value,
                    n_prompt.value,
                    num_samples.value,
                    image_resolution.value,
                    num_steps.value,
                    guidance_scale.value,
                    seed.value,
                ],
            ],
        )

    return demo


if __name__ == "__main__":
    from model import Model

    model = Model(task_name="Texnet")
    demo = create_demo(model.process_texnet)
    demo.queue().launch()
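A usage illustration for the apply_texture helper above (the texture path is hypothetical; only the example meshes ship with this commit):

from app_texnet import apply_texture

# Copies the mesh into a temp dir, writes <tag>.mtl with a single map_Kd entry,
# copies the texture next to it, and prepends "mtllib <tag>.mtl" to the OBJ.
textured_obj = apply_texture("examples/bunny/mesh.obj", "/tmp/fine_texture.png", tag="fine")
print(textured_obj)  # e.g. /tmp/<random>/fine.obj, ready to hand to gr.Model3D
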
cv_utils.py
ADDED
@@ -0,0 +1,17 @@
import cv2
import numpy as np


def resize_image(input_image, resolution, interpolation=None):
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / max(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    if interpolation is None:
        interpolation = cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA
    img = cv2.resize(input_image, (W, H), interpolation=interpolation)
    return img
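A quick check of resize_image's behaviour (synthetic input, illustration only): both sides are scaled by k = resolution / max(H, W) and snapped to multiples of 64.

import numpy as np
from cv_utils import resize_image

img = np.zeros((768, 1024, 3), dtype=np.uint8)  # H x W x C
out = resize_image(img, resolution=512)
print(out.shape)  # (384, 512, 3): k = 0.5, and 384 and 512 are already multiples of 64
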
depth_estimator.py
ADDED
@@ -0,0 +1,25 @@
import numpy as np
import PIL.Image
from controlnet_aux.util import HWC3
from transformers import pipeline

from cv_utils import resize_image


class DepthEstimator:
    def __init__(self):
        self.model = pipeline("depth-estimation")

    def __call__(self, image: np.ndarray, **kwargs) -> PIL.Image.Image:
        detect_resolution = kwargs.pop("detect_resolution", 512)
        image_resolution = kwargs.pop("image_resolution", 512)
        image = np.array(image)
        image = HWC3(image)
        image = resize_image(image, resolution=detect_resolution)
        image = PIL.Image.fromarray(image)
        image = self.model(image)
        image = image["depth"]
        image = np.array(image)
        image = HWC3(image)
        image = resize_image(image, resolution=image_resolution)
        return PIL.Image.fromarray(image)
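A minimal usage sketch for DepthEstimator (it downloads the default transformers depth-estimation checkpoint on first run; the input here is a placeholder array):

import numpy as np
from depth_estimator import DepthEstimator

estimator = DepthEstimator()
rgb = np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8)  # stand-in for a real photo
depth = estimator(rgb, detect_resolution=512, image_resolution=512)
depth.save("depth_preview.png")  # PIL image, resized back to a 64-multiple resolution
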
environment.yml
ADDED
@@ -0,0 +1,7 @@
name: gradio
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.11
  - gradio
examples/bunny/frame_0001.png
ADDED
Git LFS Details

examples/bunny/mesh.obj
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b6262e2b5563901d38599a08926ac57449b7b6c0c42a0c9b724154cde282799
size 6044863

examples/bunny/uv_normal.png
ADDED
Git LFS Details

examples/fighter/frame_0001.png
ADDED
Git LFS Details

examples/fighter/mesh.obj
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04c809af1ea9dadbea30261e0a8eef6b13735969e6b9e7d4e7423950072bc095
size 1576167

examples/fighter/uv_normal.png
ADDED
Git LFS Details

examples/highheel/frame_0001.png
ADDED
Git LFS Details

examples/highheel/mesh.obj
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1cb43ff640727c280221148c4047b9c3df2da6033b421a3cfa6d729848a128d7
size 8394487

examples/highheel/uv_normal.png
ADDED
Git LFS Details

examples/monkey/frame_0001.png
ADDED
Git LFS Details

examples/monkey/mesh.obj
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a49e7eef70eb55f7de5eab9615e981194db4cc0b1195bc8270d833aaa6047ac
size 6601492

examples/monkey/uv_normal.png
ADDED
Git LFS Details

examples/tank/frame_0001.png
ADDED
Git LFS Details

examples/tank/mesh.obj
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:301633de1a7757f78a6f67abb6e61bcc8e6a01f5a54a8582d1943ad0ad943211
size 6942253

examples/tank/uv_normal.png
ADDED
Git LFS Details

examples/tshirt/frame_0001.png
ADDED
Git LFS Details

examples/tshirt/mesh.obj
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b7c6c9bdec8d646a1980e5b987a1182c92af84cc945ef49c1735d4337185d3e5
size 39275876

examples/tshirt/uv_normal.png
ADDED
Git LFS Details
image_segmentor.py
ADDED
@@ -0,0 +1,33 @@
import cv2
import numpy as np
import PIL.Image
import torch
from controlnet_aux.util import HWC3, ade_palette
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation

from cv_utils import resize_image


class ImageSegmentor:
    def __init__(self):
        self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
        self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")

    @torch.inference_mode()
    def __call__(self, image: np.ndarray, **kwargs) -> PIL.Image.Image:
        detect_resolution = kwargs.pop("detect_resolution", 512)
        image_resolution = kwargs.pop("image_resolution", 512)
        image = HWC3(image)
        image = resize_image(image, resolution=detect_resolution)
        image = PIL.Image.fromarray(image)

        pixel_values = self.image_processor(image, return_tensors="pt").pixel_values
        outputs = self.image_segmentor(pixel_values)
        seg = self.image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
        color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
        for label, color in enumerate(ade_palette()):
            color_seg[seg == label, :] = color
        color_seg = color_seg.astype(np.uint8)

        color_seg = resize_image(color_seg, resolution=image_resolution, interpolation=cv2.INTER_NEAREST)
        return PIL.Image.fromarray(color_seg)
install.sh
ADDED
@@ -0,0 +1,18 @@
#!/bin/bash
eval "$(conda shell.bash hook)"
# conda activate base
# conda remove -n matgen-plus --all

conda create -n matgen-plus python=3.11
conda activate matgen-plus

pip install diffusers["torch"] transformers accelerate xformers
pip install gradio
pip install controlnet-aux

# text2tex
conda install pytorch3d -c pytorch -c conda-forge
conda install -c conda-forge open-clip-torch pytorch-lightning
pip install trimesh xatlas scikit-learn opencv-python omegaconf

python app.py
model.py
ADDED
@@ -0,0 +1,959 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
|
3 |
+
# get socket and check if the name is vgldgx01
|
4 |
+
import socket
|
5 |
+
if socket.gethostname() != "vgldgx01":
|
6 |
+
import spaces #[uncomment to use ZeroGPU]
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import PIL.Image
|
10 |
+
import torch
|
11 |
+
from controlnet_aux.util import HWC3
|
12 |
+
from diffusers import (
|
13 |
+
ControlNetModel,
|
14 |
+
DiffusionPipeline,
|
15 |
+
StableDiffusionControlNetPipeline,
|
16 |
+
StableDiffusionImg2ImgPipeline,
|
17 |
+
UniPCMultistepScheduler,
|
18 |
+
DDIMScheduler, #rgb2x
|
19 |
+
)
|
20 |
+
import torchvision
|
21 |
+
from torchvision import transforms
|
22 |
+
from cv_utils import resize_image
|
23 |
+
from preprocessor import Preprocessor
|
24 |
+
from settings import MAX_IMAGE_RESOLUTION, MAX_NUM_IMAGES
|
25 |
+
from tqdm.auto import tqdm
|
26 |
+
import subprocess
|
27 |
+
|
28 |
+
from rgb2x.pipeline_rgb2x import StableDiffusionAOVMatEstPipeline
|
29 |
+
from app_texnet import image_to_temp_path
|
30 |
+
import os
|
31 |
+
import time
|
32 |
+
import tempfile
|
33 |
+
from text2tex.scripts.generate_texture import text2tex_call, init_args
|
34 |
+
from glob import glob
|
35 |
+
|
36 |
+
CONTROLNET_MODEL_IDS = {
|
37 |
+
# "Openpose": "lllyasviel/control_v11p_sd15_openpose",
|
38 |
+
# "Canny": "lllyasviel/control_v11p_sd15_canny",
|
39 |
+
# "MLSD": "lllyasviel/control_v11p_sd15_mlsd",
|
40 |
+
# "scribble": "lllyasviel/control_v11p_sd15_scribble",
|
41 |
+
# "softedge": "lllyasviel/control_v11p_sd15_softedge",
|
42 |
+
# "segmentation": "lllyasviel/control_v11p_sd15_seg",
|
43 |
+
# "depth": "lllyasviel/control_v11f1p_sd15_depth",
|
44 |
+
# "NormalBae": "lllyasviel/control_v11p_sd15_normalbae",
|
45 |
+
# "lineart": "lllyasviel/control_v11p_sd15_lineart",
|
46 |
+
# "lineart_anime": "lllyasviel/control_v11p_sd15s2_lineart_anime",
|
47 |
+
# "shuffle": "lllyasviel/control_v11e_sd15_shuffle",
|
48 |
+
# "ip2p": "lllyasviel/control_v11e_sd15_ip2p",
|
49 |
+
# "inpaint": "lllyasviel/control_v11e_sd15_inpaint",
|
50 |
+
# "texnet": "/home/jyang/projects/ObjectReal/logs/train_texnet_deploy/checkpoint-55000/controlnet" # load and call
|
51 |
+
"texnet": "jingyangcarl/texnet",
|
52 |
+
}
|
53 |
+
|
54 |
+
|
55 |
+
def download_all_controlnet_weights() -> None:
|
56 |
+
for model_id in CONTROLNET_MODEL_IDS.values():
|
57 |
+
ControlNetModel.from_pretrained(model_id)
|
58 |
+
|
59 |
+
|
60 |
+
class Model:
|
61 |
+
def __init__(
|
62 |
+
self, base_model_id: str = "stable-diffusion-v1-5/stable-diffusion-v1-5", task_name: str = "Canny"
|
63 |
+
) -> None:
|
64 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
65 |
+
self.base_model_id = ""
|
66 |
+
self.task_name = ""
|
67 |
+
self.pipe = self.load_pipe(base_model_id, task_name)
|
68 |
+
self.pipe_base = StableDiffusionImg2ImgPipeline.from_pretrained(
|
69 |
+
'runwayml/stable-diffusion-v1-5', safety_checker=None, torch_dtype=torch.float16
|
70 |
+
).to(self.device)
|
71 |
+
self.preprocessor = Preprocessor()
|
72 |
+
|
73 |
+
# set up pipe_rgb2x
|
74 |
+
self.pipe_rgb2x = StableDiffusionAOVMatEstPipeline.from_pretrained(
|
75 |
+
"zheng95z/rgb-to-x",
|
76 |
+
torch_dtype=torch.float16,
|
77 |
+
).to(self.device)
|
78 |
+
self.pipe_rgb2x.scheduler = DDIMScheduler.from_config(
|
79 |
+
self.pipe_rgb2x.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
|
80 |
+
)
|
81 |
+
self.pipe_rgb2x.set_progress_bar_config(disable=True)
|
82 |
+
|
83 |
+
# setup blender
|
84 |
+
self.blender_path = '/tmp/blender-3.2.2-linux-x64/blender'
|
85 |
+
if not os.path.exists(self.blender_path):
|
86 |
+
print("Downloading Blender...")
|
87 |
+
subprocess.run(["wget", "https://download.blender.org/release/Blender3.2/blender-3.2.2-linux-x64.tar.xz", "-O", "/tmp/blender-3.2.2-linux-x64.tar.xz"], check=True)
|
88 |
+
subprocess.run(["tar", "-xf", "/tmp/blender-3.2.2-linux-x64.tar.xz", "-C", "/tmp"], check=True)
|
89 |
+
print("Blender downloaded and extracted.")
|
90 |
+
|
91 |
+
def load_pipe(self, base_model_id: str, task_name: str) -> DiffusionPipeline:
|
92 |
+
if (
|
93 |
+
base_model_id == self.base_model_id
|
94 |
+
and task_name == self.task_name
|
95 |
+
and hasattr(self, "pipe")
|
96 |
+
and self.pipe is not None
|
97 |
+
):
|
98 |
+
return self.pipe
|
99 |
+
model_id = CONTROLNET_MODEL_IDS[task_name]
|
100 |
+
controlnet = ControlNetModel.from_pretrained(model_id, torch_dtype=torch.float16)
|
101 |
+
to_upload = False
|
102 |
+
if to_upload:
|
103 |
+
# confirm before uploading
|
104 |
+
confirm = input(f"Do you want to upload {model_id} to the hub? (y/n): ")
|
105 |
+
if confirm.lower() == "y":
|
106 |
+
controlnet.push_to_hub("jingyangcarl/texnet")
|
107 |
+
else:
|
108 |
+
print("Upload cancelled.")
|
109 |
+
pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
110 |
+
base_model_id, safety_checker=None, controlnet=controlnet, torch_dtype=torch.float16
|
111 |
+
)
|
112 |
+
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
113 |
+
pipe.to(self.device)
|
114 |
+
if self.device.type == "cuda":
|
115 |
+
import os
|
116 |
+
if os.environ.get("SPACES_ZERO_GPU", "0") == "1":
|
117 |
+
# when running on ZeroGPU, enable CPU offload
|
118 |
+
# pipe.enable_xformers_memory_efficient_attention() doens't work
|
119 |
+
# pipe.enable_model_cpu_offload()
|
120 |
+
pass
|
121 |
+
else:
|
122 |
+
pipe.enable_xformers_memory_efficient_attention()
|
123 |
+
torch.cuda.empty_cache()
|
124 |
+
gc.collect()
|
125 |
+
self.base_model_id = base_model_id
|
126 |
+
self.task_name = task_name
|
127 |
+
return pipe
|
128 |
+
|
129 |
+
def set_base_model(self, base_model_id: str) -> str:
|
130 |
+
if not base_model_id or base_model_id == self.base_model_id:
|
131 |
+
return self.base_model_id
|
132 |
+
del self.pipe
|
133 |
+
torch.cuda.empty_cache()
|
134 |
+
gc.collect()
|
135 |
+
try:
|
136 |
+
self.pipe = self.load_pipe(base_model_id, self.task_name)
|
137 |
+
except Exception: # noqa: BLE001
|
138 |
+
self.pipe = self.load_pipe(self.base_model_id, self.task_name)
|
139 |
+
return self.base_model_id
|
140 |
+
|
141 |
+
def load_controlnet_weight(self, task_name: str) -> None:
|
142 |
+
if task_name == self.task_name:
|
143 |
+
return
|
144 |
+
if self.pipe is not None and hasattr(self.pipe, "controlnet"):
|
145 |
+
del self.pipe.controlnet
|
146 |
+
torch.cuda.empty_cache()
|
147 |
+
gc.collect()
|
148 |
+
model_id = CONTROLNET_MODEL_IDS[task_name]
|
149 |
+
controlnet = ControlNetModel.from_pretrained(model_id, torch_dtype=torch.float16)
|
150 |
+
controlnet.to(self.device)
|
151 |
+
torch.cuda.empty_cache()
|
152 |
+
gc.collect()
|
153 |
+
self.pipe.controlnet = controlnet
|
154 |
+
self.task_name = task_name
|
155 |
+
|
156 |
+
def get_prompt(self, prompt: str, additional_prompt: str) -> str:
|
157 |
+
return additional_prompt if not prompt else f"{prompt}, {additional_prompt}"
|
158 |
+
|
159 |
+
# @spaces.GPU #[uncomment to use ZeroGPU]
|
160 |
+
@torch.autocast("cuda")
|
161 |
+
def run_pipe(
|
162 |
+
self,
|
163 |
+
prompt: str,
|
164 |
+
negative_prompt: str,
|
165 |
+
control_image: PIL.Image.Image,
|
166 |
+
num_images: int,
|
167 |
+
num_steps: int,
|
168 |
+
guidance_scale: float,
|
169 |
+
seed: int,
|
170 |
+
) -> list[PIL.Image.Image]:
|
171 |
+
generator = torch.Generator().manual_seed(seed)
|
172 |
+
# self.pipe.to(self.device)
|
173 |
+
return self.pipe(
|
174 |
+
prompt=prompt,
|
175 |
+
negative_prompt=negative_prompt,
|
176 |
+
guidance_scale=guidance_scale,
|
177 |
+
num_images_per_prompt=num_images,
|
178 |
+
num_inference_steps=num_steps,
|
179 |
+
generator=generator,
|
180 |
+
image=control_image,
|
181 |
+
).images
|
182 |
+
|
183 |
+
# @spaces.GPU #[uncomment to use ZeroGPU]
|
184 |
+
@torch.inference_mode()
|
185 |
+
def process_texnet(
|
186 |
+
self,
|
187 |
+
obj_name: str,
|
188 |
+
represented_image: np.ndarray | None, # not used
|
189 |
+
image: np.ndarray,
|
190 |
+
prompt: str,
|
191 |
+
additional_prompt: str,
|
192 |
+
negative_prompt: str,
|
193 |
+
num_images: int,
|
194 |
+
image_resolution: int,
|
195 |
+
num_steps: int,
|
196 |
+
guidance_scale: float,
|
197 |
+
seed: int,
|
198 |
+
low_threshold: int,
|
199 |
+
high_threshold: int,
|
200 |
+
) -> list[PIL.Image.Image]:
|
201 |
+
if image is None:
|
202 |
+
raise ValueError
|
203 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
204 |
+
raise ValueError
|
205 |
+
if num_images > MAX_NUM_IMAGES:
|
206 |
+
raise ValueError
|
207 |
+
|
208 |
+
prompt_nospace = prompt.replace(' ', '_')
|
209 |
+
|
210 |
+
# self.preprocessor.load("texnet")
|
211 |
+
# control_image = self.preprocessor(
|
212 |
+
# image=image, low_threshold=low_threshold, high_threshold=high_threshold, image_resolution=image_resolution, output_type="pil"
|
213 |
+
# )
|
214 |
+
|
215 |
+
# self.load_controlnet_weight("texnet")
|
216 |
+
# tex_coarse = self.run_pipe(
|
217 |
+
# prompt=self.get_prompt(prompt, additional_prompt),
|
218 |
+
# negative_prompt=negative_prompt,
|
219 |
+
# control_image=control_image,
|
220 |
+
# num_images=num_images,
|
221 |
+
# num_steps=num_steps,
|
222 |
+
# guidance_scale=guidance_scale,
|
223 |
+
# seed=seed,
|
224 |
+
# )
|
225 |
+
|
226 |
+
# # use img2img pipeline
|
227 |
+
# self.pipe_backup = self.pipe
|
228 |
+
# self.pipe = self.pipe_base
|
229 |
+
|
230 |
+
# # refine
|
231 |
+
tex_fine = []
|
232 |
+
mesh_fine = []
|
233 |
+
# for result_coarse in tex_coarse:
|
234 |
+
# # clean up GPU cache
|
235 |
+
# torch.cuda.empty_cache()
|
236 |
+
# gc.collect()
|
237 |
+
|
238 |
+
# # masking
|
239 |
+
# mask = (np.array(control_image).sum(axis=-1) == 0)[...,None]
|
240 |
+
# image_masked = PIL.Image.fromarray(np.where(mask, control_image, result_coarse))
|
241 |
+
# image_blurry = transforms.GaussianBlur(kernel_size=5, sigma=1)(image_masked)
|
242 |
+
# result_fine = self.run_pipe(
|
243 |
+
# # prompt=prompt,
|
244 |
+
# prompt=self.get_prompt(prompt, additional_prompt),
|
245 |
+
# negative_prompt=negative_prompt,
|
246 |
+
# control_image=image_blurry,
|
247 |
+
# num_images=1,
|
248 |
+
# num_steps=num_steps,
|
249 |
+
# guidance_scale=guidance_scale,
|
250 |
+
# seed=seed,
|
251 |
+
# )[0]
|
252 |
+
# result_fine = PIL.Image.fromarray(np.where(mask, control_image, result_fine))
|
253 |
+
# tex_fine.append(result_fine)
|
254 |
+
|
255 |
+
temp_out_path = tempfile.mkdtemp()
|
256 |
+
temp_out_path = 'output'
|
257 |
+
|
258 |
+
# put text2tex here,
|
259 |
+
args = init_args()
|
260 |
+
args.input_dir = f'examples/{obj_name}/'
|
261 |
+
args.output_dir = os.path.join(temp_out_path, f'{obj_name}/{prompt_nospace}')
|
262 |
+
args.obj_name = obj_name
|
263 |
+
args.obj_file = 'mesh.obj'
|
264 |
+
args.prompt = f'{prompt} {obj_name}'
|
265 |
+
args.add_view_to_prompt = True
|
266 |
+
args.ddim_steps = 5
|
267 |
+
# args.ddim_steps = 50
|
268 |
+
args.new_strength = 1.0
|
269 |
+
args.update_strength = 0.3
|
270 |
+
args.view_threshold = 0.1
|
271 |
+
args.blend = 0
|
272 |
+
args.dist = 1
|
273 |
+
args.num_viewpoints = 2
|
274 |
+
# args.num_viewpoints = 36
|
275 |
+
args.viewpoint_mode = 'predefined'
|
276 |
+
args.use_principle = True
|
277 |
+
args.update_steps = 2
|
278 |
+
# args.update_steps = 20
|
279 |
+
args.update_mode = 'heuristic'
|
280 |
+
args.seed = 42
|
281 |
+
args.post_process = True
|
282 |
+
args.device = '2080'
|
283 |
+
args.uv_size = 1000
|
284 |
+
args.image_size = 512
|
285 |
+
# args.image_size = 768
|
286 |
+
args.use_objaverse = True # assume the mesh is normalized with y-axis as up
|
287 |
+
output_dir = text2tex_call(args)
|
288 |
+
|
289 |
+
# get the texture and mesh with underscore '_post', which is the id of the last mesh, should be good for the visual
|
290 |
+
post_idx = glob(os.path.join(output_dir, 'update', 'mesh', "*_post.png"))[0].split('/')[-1].split('_')[0]
|
291 |
+
|
292 |
+
tex_fine.append(PIL.Image.open(os.path.join(output_dir, 'update', 'mesh', f"{post_idx}.png")).convert("RGB"))
|
293 |
+
mesh_fine.append(os.path.join(output_dir, 'update', 'mesh', f"{post_idx}.obj"))
|
294 |
+
torch.cuda.empty_cache()
|
295 |
+
|
296 |
+
# restore the original pipe
|
297 |
+
# self.pipe = self.pipe_backup
|
298 |
+
|
299 |
+
# for now, use rgb2x to derive the intrinsic maps (albedo, normal, roughness, metallic, irradiance) from the generated texture
|
300 |
+
def rgb2x(
|
301 |
+
pipeline,
|
302 |
+
photo,
|
303 |
+
inference_step = 50,
|
304 |
+
num_samples = 1,
|
305 |
+
):
|
306 |
+
generator = torch.Generator(device="cuda").manual_seed(seed)
|
307 |
+
|
308 |
+
# Resize so that the longer side is at most 1000 px and both dimensions are multiples of 8 (required by the diffusion VAE)
|
309 |
+
old_height = photo.shape[1]
|
310 |
+
old_width = photo.shape[2]
|
311 |
+
new_height = old_height
|
312 |
+
new_width = old_width
|
313 |
+
radio = old_height / old_width
|
314 |
+
max_side = 1000
|
315 |
+
if old_height > old_width:
|
316 |
+
new_height = max_side
|
317 |
+
new_width = int(new_height / radio)
|
318 |
+
else:
|
319 |
+
new_width = max_side
|
320 |
+
new_height = int(new_width * radio)
|
321 |
+
|
322 |
+
if new_width % 8 != 0 or new_height % 8 != 0:
|
323 |
+
new_width = new_width // 8 * 8
|
324 |
+
new_height = new_height // 8 * 8
|
325 |
+
|
326 |
+
photo = torchvision.transforms.Resize((new_height, new_width))(photo)
|
327 |
+
|
328 |
+
required_aovs = ["albedo", "normal", "roughness", "metallic", "irradiance"]
|
329 |
+
prompts = {
|
330 |
+
"albedo": "Albedo (diffuse basecolor)",
|
331 |
+
"normal": "Camera-space Normal",
|
332 |
+
"roughness": "Roughness",
|
333 |
+
"metallic": "Metallicness",
|
334 |
+
"irradiance": "Irradiance (diffuse lighting)",
|
335 |
+
}
|
336 |
+
|
337 |
+
return_list = []
|
338 |
+
for i in tqdm(range(num_samples), desc="Running Pipeline", leave=False):
|
339 |
+
for aov_name in required_aovs:
|
340 |
+
prompt = prompts[aov_name]
|
341 |
+
generated_image = pipeline(
|
342 |
+
prompt=prompt,
|
343 |
+
photo=photo,
|
344 |
+
num_inference_steps=inference_step,
|
345 |
+
height=new_height,
|
346 |
+
width=new_width,
|
347 |
+
generator=generator,
|
348 |
+
required_aovs=[aov_name],
|
349 |
+
).images[0][0]
|
350 |
+
|
351 |
+
generated_image = torchvision.transforms.Resize(
|
352 |
+
(old_height, old_width)
|
353 |
+
)(generated_image)
|
354 |
+
|
355 |
+
# generated_image = (generated_image, f"Generated {aov_name} {i}")
|
356 |
+
# generated_image = (generated_image, f"{aov_name}")
|
357 |
+
return_list.append(generated_image)
|
358 |
+
|
359 |
+
return photo, return_list, prompts
|
360 |
+
|
361 |
+
# run the rgb2x pipeline on the refined texture to estimate its intrinsic channels
|
362 |
+
_, preds, prompts = rgb2x(self.pipe_rgb2x, torchvision.transforms.PILToTensor()(tex_fine[0]).to(self.pipe.device), inference_step=num_steps, num_samples=num_images)
|
363 |
+
|
364 |
+
intrinsic_dir = os.path.join(output_dir, 'intrinsic')
|
365 |
+
use_text2tex = True
|
366 |
+
if use_text2tex:
|
367 |
+
base_color_path = image_to_temp_path(tex_fine[0], "base_color", out_dir=intrinsic_dir)
|
368 |
+
normal_map_path = image_to_temp_path(preds[0], "normal_map", out_dir=intrinsic_dir)
|
369 |
+
roughness_path = image_to_temp_path(preds[1], "roughness", out_dir=intrinsic_dir)
|
370 |
+
metallic_path = image_to_temp_path(preds[2], "metallic", out_dir=intrinsic_dir)
|
371 |
+
else:
|
372 |
+
base_color_path = image_to_temp_path(tex_fine[0].rotate(90), "base_color", out_dir=intrinsic_dir)
|
373 |
+
normal_map_path = image_to_temp_path(preds[0].rotate(90), "normal_map", out_dir=intrinsic_dir)
|
374 |
+
roughness_path = image_to_temp_path(preds[1].rotate(90), "roughness", out_dir=intrinsic_dir)
|
375 |
+
metallic_path = image_to_temp_path(preds[2].rotate(90), "metallic", out_dir=intrinsic_dir)
|
376 |
+
current_timecode = time.strftime("%Y%m%d_%H%M%S")
|
377 |
+
# output_blend_path = os.path.join(os.getcwd(), "output", f"{obj_name}_{prompt_nospace}_{current_timecode}.blend") # replace with desired output path
|
378 |
+
output_blend_path = os.path.join(tempfile.mkdtemp(), f"{obj_name}_{prompt_nospace}_{current_timecode}.blend") # replace with desired output path
|
379 |
+
os.makedirs(os.path.dirname(output_blend_path), exist_ok=True)
|
380 |
+
|
381 |
+
def run_blend_generation(
|
382 |
+
blender_path,
|
383 |
+
generate_script_path,
|
384 |
+
obj_path,
|
385 |
+
base_color_path,
|
386 |
+
normal_map_path,
|
387 |
+
roughness_path,
|
388 |
+
metallic_path,
|
389 |
+
output_blend
|
390 |
+
):
|
391 |
+
cmd = [
|
392 |
+
blender_path, "--background", "--python", generate_script_path, "--",
|
393 |
+
obj_path, base_color_path, normal_map_path, roughness_path, metallic_path, output_blend
|
394 |
+
]
|
395 |
+
subprocess.run(cmd, check=True)
|
396 |
+
|
397 |
+
# check whether self.blender_path exists; if it does not, download Blender first
|
398 |
+
run_blend_generation(
|
399 |
+
blender_path=self.blender_path,
|
400 |
+
generate_script_path="rgb2x/generate_blend.py",
|
401 |
+
# obj_path=f"examples/{obj_name}/mesh.obj", # replace with actual mesh path
|
402 |
+
obj_path=mesh_fine[0], # replace with actual mesh path
|
403 |
+
base_color_path=base_color_path,
|
404 |
+
normal_map_path=normal_map_path,
|
405 |
+
roughness_path=roughness_path,
|
406 |
+
metallic_path=metallic_path,
|
407 |
+
output_blend=output_blend_path # replace with desired output path
|
408 |
+
)
|
409 |
+
|
410 |
+
# gallery outputs
|
411 |
+
return [*tex_fine], [preds[1]], [preds[2]], [preds[3]], [output_blend_path]
|
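For clarity, the subprocess command assembled by run_blend_generation() above is equivalent to the following standalone invocation. This is only a sketch: every path is a placeholder, and the Blender binary location is an assumption (self.blender_path is set elsewhere in this class).

# Sketch of the Blender call built by run_blend_generation(); all paths below are
# placeholders, not values taken from the repo.
import subprocess

cmd = [
    "/opt/blender/blender",            # assumed location of the Blender binary (self.blender_path)
    "--background",
    "--python", "rgb2x/generate_blend.py",
    "--",
    "output/monkey/mesh_post.obj",     # hypothetical mesh exported by text2tex
    "intrinsic/base_color.png",        # albedo / base color texture
    "intrinsic/normal_map.png",        # normal map from rgb2x
    "intrinsic/roughness.png",         # roughness map from rgb2x
    "intrinsic/metallic.png",          # metallic map from rgb2x
    "output/monkey_textured.blend",    # packed .blend returned to the gallery
]
subprocess.run(cmd, check=True)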
412 |
+
|
413 |
+
# @spaces.GPU #[uncomment to use ZeroGPU]
|
414 |
+
@torch.inference_mode()
|
415 |
+
def process_canny(
|
416 |
+
self,
|
417 |
+
image: np.ndarray,
|
418 |
+
prompt: str,
|
419 |
+
additional_prompt: str,
|
420 |
+
negative_prompt: str,
|
421 |
+
num_images: int,
|
422 |
+
image_resolution: int,
|
423 |
+
num_steps: int,
|
424 |
+
guidance_scale: float,
|
425 |
+
seed: int,
|
426 |
+
low_threshold: int,
|
427 |
+
high_threshold: int,
|
428 |
+
) -> list[PIL.Image.Image]:
|
429 |
+
if image is None:
|
430 |
+
raise ValueError
|
431 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
432 |
+
raise ValueError
|
433 |
+
if num_images > MAX_NUM_IMAGES:
|
434 |
+
raise ValueError
|
435 |
+
|
436 |
+
self.preprocessor.load("Canny")
|
437 |
+
control_image = self.preprocessor(
|
438 |
+
image=image, low_threshold=low_threshold, high_threshold=high_threshold, detect_resolution=image_resolution
|
439 |
+
)
|
440 |
+
|
441 |
+
self.load_controlnet_weight("Canny")
|
442 |
+
results = self.run_pipe(
|
443 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
444 |
+
negative_prompt=negative_prompt,
|
445 |
+
control_image=control_image,
|
446 |
+
num_images=num_images,
|
447 |
+
num_steps=num_steps,
|
448 |
+
guidance_scale=guidance_scale,
|
449 |
+
seed=seed,
|
450 |
+
)
|
451 |
+
return [control_image, *results]
|
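As a usage note, process_canny (and the sibling process_* methods below) can also be driven directly from a script rather than through the Gradio UI. A minimal sketch follows; the class name Model and its no-argument constructor are assumptions, since the actual class definition appears earlier in model.py.

# Sketch: call process_canny outside the UI. The class name `Model` and the
# no-argument constructor are assumptions; adjust to the actual definition in model.py.
import numpy as np
import PIL.Image
from model import Model

model = Model()
image = np.array(PIL.Image.open("examples/monkey/frame_0001.png").convert("RGB"))
outputs = model.process_canny(
    image=image,
    prompt="a bronze sculpture",
    additional_prompt="best quality, extremely detailed",
    negative_prompt="lowres, bad quality",
    num_images=1,
    image_resolution=512,
    num_steps=20,
    guidance_scale=9.0,
    seed=0,
    low_threshold=100,
    high_threshold=200,
)
# outputs[0] is the Canny control image; the remaining entries are generated samples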
452 |
+
|
453 |
+
@torch.inference_mode()
|
454 |
+
def process_mlsd(
|
455 |
+
self,
|
456 |
+
image: np.ndarray,
|
457 |
+
prompt: str,
|
458 |
+
additional_prompt: str,
|
459 |
+
negative_prompt: str,
|
460 |
+
num_images: int,
|
461 |
+
image_resolution: int,
|
462 |
+
preprocess_resolution: int,
|
463 |
+
num_steps: int,
|
464 |
+
guidance_scale: float,
|
465 |
+
seed: int,
|
466 |
+
value_threshold: float,
|
467 |
+
distance_threshold: float,
|
468 |
+
) -> list[PIL.Image.Image]:
|
469 |
+
if image is None:
|
470 |
+
raise ValueError
|
471 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
472 |
+
raise ValueError
|
473 |
+
if num_images > MAX_NUM_IMAGES:
|
474 |
+
raise ValueError
|
475 |
+
|
476 |
+
self.preprocessor.load("MLSD")
|
477 |
+
control_image = self.preprocessor(
|
478 |
+
image=image,
|
479 |
+
image_resolution=image_resolution,
|
480 |
+
detect_resolution=preprocess_resolution,
|
481 |
+
thr_v=value_threshold,
|
482 |
+
thr_d=distance_threshold,
|
483 |
+
)
|
484 |
+
self.load_controlnet_weight("MLSD")
|
485 |
+
results = self.run_pipe(
|
486 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
487 |
+
negative_prompt=negative_prompt,
|
488 |
+
control_image=control_image,
|
489 |
+
num_images=num_images,
|
490 |
+
num_steps=num_steps,
|
491 |
+
guidance_scale=guidance_scale,
|
492 |
+
seed=seed,
|
493 |
+
)
|
494 |
+
return [control_image, *results]
|
495 |
+
|
496 |
+
@torch.inference_mode()
|
497 |
+
def process_scribble(
|
498 |
+
self,
|
499 |
+
image: np.ndarray,
|
500 |
+
prompt: str,
|
501 |
+
additional_prompt: str,
|
502 |
+
negative_prompt: str,
|
503 |
+
num_images: int,
|
504 |
+
image_resolution: int,
|
505 |
+
preprocess_resolution: int,
|
506 |
+
num_steps: int,
|
507 |
+
guidance_scale: float,
|
508 |
+
seed: int,
|
509 |
+
preprocessor_name: str,
|
510 |
+
) -> list[PIL.Image.Image]:
|
511 |
+
if image is None:
|
512 |
+
raise ValueError
|
513 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
514 |
+
raise ValueError
|
515 |
+
if num_images > MAX_NUM_IMAGES:
|
516 |
+
raise ValueError
|
517 |
+
|
518 |
+
if preprocessor_name == "None":
|
519 |
+
image = HWC3(image)
|
520 |
+
image = resize_image(image, resolution=image_resolution)
|
521 |
+
control_image = PIL.Image.fromarray(image)
|
522 |
+
elif preprocessor_name == "HED":
|
523 |
+
self.preprocessor.load(preprocessor_name)
|
524 |
+
control_image = self.preprocessor(
|
525 |
+
image=image,
|
526 |
+
image_resolution=image_resolution,
|
527 |
+
detect_resolution=preprocess_resolution,
|
528 |
+
scribble=False,
|
529 |
+
)
|
530 |
+
elif preprocessor_name == "PidiNet":
|
531 |
+
self.preprocessor.load(preprocessor_name)
|
532 |
+
control_image = self.preprocessor(
|
533 |
+
image=image,
|
534 |
+
image_resolution=image_resolution,
|
535 |
+
detect_resolution=preprocess_resolution,
|
536 |
+
safe=False,
|
537 |
+
)
|
538 |
+
self.load_controlnet_weight("scribble")
|
539 |
+
results = self.run_pipe(
|
540 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
541 |
+
negative_prompt=negative_prompt,
|
542 |
+
control_image=control_image,
|
543 |
+
num_images=num_images,
|
544 |
+
num_steps=num_steps,
|
545 |
+
guidance_scale=guidance_scale,
|
546 |
+
seed=seed,
|
547 |
+
)
|
548 |
+
return [control_image, *results]
|
549 |
+
|
550 |
+
@torch.inference_mode()
|
551 |
+
def process_scribble_interactive(
|
552 |
+
self,
|
553 |
+
image_and_mask: dict[str, np.ndarray | list[np.ndarray]] | None,
|
554 |
+
prompt: str,
|
555 |
+
additional_prompt: str,
|
556 |
+
negative_prompt: str,
|
557 |
+
num_images: int,
|
558 |
+
image_resolution: int,
|
559 |
+
num_steps: int,
|
560 |
+
guidance_scale: float,
|
561 |
+
seed: int,
|
562 |
+
) -> list[PIL.Image.Image]:
|
563 |
+
if image_and_mask is None:
|
564 |
+
raise ValueError
|
565 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
566 |
+
raise ValueError
|
567 |
+
if num_images > MAX_NUM_IMAGES:
|
568 |
+
raise ValueError
|
569 |
+
|
570 |
+
image = 255 - image_and_mask["composite"] # type: ignore
|
571 |
+
image = HWC3(image)
|
572 |
+
image = resize_image(image, resolution=image_resolution)
|
573 |
+
control_image = PIL.Image.fromarray(image)
|
574 |
+
|
575 |
+
self.load_controlnet_weight("scribble")
|
576 |
+
results = self.run_pipe(
|
577 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
578 |
+
negative_prompt=negative_prompt,
|
579 |
+
control_image=control_image,
|
580 |
+
num_images=num_images,
|
581 |
+
num_steps=num_steps,
|
582 |
+
guidance_scale=guidance_scale,
|
583 |
+
seed=seed,
|
584 |
+
)
|
585 |
+
return [control_image, *results]
|
586 |
+
|
587 |
+
@torch.inference_mode()
|
588 |
+
def process_softedge(
|
589 |
+
self,
|
590 |
+
image: np.ndarray,
|
591 |
+
prompt: str,
|
592 |
+
additional_prompt: str,
|
593 |
+
negative_prompt: str,
|
594 |
+
num_images: int,
|
595 |
+
image_resolution: int,
|
596 |
+
preprocess_resolution: int,
|
597 |
+
num_steps: int,
|
598 |
+
guidance_scale: float,
|
599 |
+
seed: int,
|
600 |
+
preprocessor_name: str,
|
601 |
+
) -> list[PIL.Image.Image]:
|
602 |
+
if image is None:
|
603 |
+
raise ValueError
|
604 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
605 |
+
raise ValueError
|
606 |
+
if num_images > MAX_NUM_IMAGES:
|
607 |
+
raise ValueError
|
608 |
+
|
609 |
+
if preprocessor_name == "None":
|
610 |
+
image = HWC3(image)
|
611 |
+
image = resize_image(image, resolution=image_resolution)
|
612 |
+
control_image = PIL.Image.fromarray(image)
|
613 |
+
elif preprocessor_name in ["HED", "HED safe"]:
|
614 |
+
safe = "safe" in preprocessor_name
|
615 |
+
self.preprocessor.load("HED")
|
616 |
+
control_image = self.preprocessor(
|
617 |
+
image=image,
|
618 |
+
image_resolution=image_resolution,
|
619 |
+
detect_resolution=preprocess_resolution,
|
620 |
+
scribble=safe,
|
621 |
+
)
|
622 |
+
elif preprocessor_name in ["PidiNet", "PidiNet safe"]:
|
623 |
+
safe = "safe" in preprocessor_name
|
624 |
+
self.preprocessor.load("PidiNet")
|
625 |
+
control_image = self.preprocessor(
|
626 |
+
image=image,
|
627 |
+
image_resolution=image_resolution,
|
628 |
+
detect_resolution=preprocess_resolution,
|
629 |
+
safe=safe,
|
630 |
+
)
|
631 |
+
else:
|
632 |
+
raise ValueError
|
633 |
+
self.load_controlnet_weight("softedge")
|
634 |
+
results = self.run_pipe(
|
635 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
636 |
+
negative_prompt=negative_prompt,
|
637 |
+
control_image=control_image,
|
638 |
+
num_images=num_images,
|
639 |
+
num_steps=num_steps,
|
640 |
+
guidance_scale=guidance_scale,
|
641 |
+
seed=seed,
|
642 |
+
)
|
643 |
+
return [control_image, *results]
|
644 |
+
|
645 |
+
@torch.inference_mode()
|
646 |
+
def process_openpose(
|
647 |
+
self,
|
648 |
+
image: np.ndarray,
|
649 |
+
prompt: str,
|
650 |
+
additional_prompt: str,
|
651 |
+
negative_prompt: str,
|
652 |
+
num_images: int,
|
653 |
+
image_resolution: int,
|
654 |
+
preprocess_resolution: int,
|
655 |
+
num_steps: int,
|
656 |
+
guidance_scale: float,
|
657 |
+
seed: int,
|
658 |
+
preprocessor_name: str,
|
659 |
+
) -> list[PIL.Image.Image]:
|
660 |
+
if image is None:
|
661 |
+
raise ValueError
|
662 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
663 |
+
raise ValueError
|
664 |
+
if num_images > MAX_NUM_IMAGES:
|
665 |
+
raise ValueError
|
666 |
+
|
667 |
+
if preprocessor_name == "None":
|
668 |
+
image = HWC3(image)
|
669 |
+
image = resize_image(image, resolution=image_resolution)
|
670 |
+
control_image = PIL.Image.fromarray(image)
|
671 |
+
else:
|
672 |
+
self.preprocessor.load("Openpose")
|
673 |
+
control_image = self.preprocessor(
|
674 |
+
image=image,
|
675 |
+
image_resolution=image_resolution,
|
676 |
+
detect_resolution=preprocess_resolution,
|
677 |
+
hand_and_face=True,
|
678 |
+
)
|
679 |
+
self.load_controlnet_weight("Openpose")
|
680 |
+
results = self.run_pipe(
|
681 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
682 |
+
negative_prompt=negative_prompt,
|
683 |
+
control_image=control_image,
|
684 |
+
num_images=num_images,
|
685 |
+
num_steps=num_steps,
|
686 |
+
guidance_scale=guidance_scale,
|
687 |
+
seed=seed,
|
688 |
+
)
|
689 |
+
return [control_image, *results]
|
690 |
+
|
691 |
+
@torch.inference_mode()
|
692 |
+
def process_segmentation(
|
693 |
+
self,
|
694 |
+
image: np.ndarray,
|
695 |
+
prompt: str,
|
696 |
+
additional_prompt: str,
|
697 |
+
negative_prompt: str,
|
698 |
+
num_images: int,
|
699 |
+
image_resolution: int,
|
700 |
+
preprocess_resolution: int,
|
701 |
+
num_steps: int,
|
702 |
+
guidance_scale: float,
|
703 |
+
seed: int,
|
704 |
+
preprocessor_name: str,
|
705 |
+
) -> list[PIL.Image.Image]:
|
706 |
+
if image is None:
|
707 |
+
raise ValueError
|
708 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
709 |
+
raise ValueError
|
710 |
+
if num_images > MAX_NUM_IMAGES:
|
711 |
+
raise ValueError
|
712 |
+
|
713 |
+
if preprocessor_name == "None":
|
714 |
+
image = HWC3(image)
|
715 |
+
image = resize_image(image, resolution=image_resolution)
|
716 |
+
control_image = PIL.Image.fromarray(image)
|
717 |
+
else:
|
718 |
+
self.preprocessor.load(preprocessor_name)
|
719 |
+
control_image = self.preprocessor(
|
720 |
+
image=image,
|
721 |
+
image_resolution=image_resolution,
|
722 |
+
detect_resolution=preprocess_resolution,
|
723 |
+
)
|
724 |
+
self.load_controlnet_weight("segmentation")
|
725 |
+
results = self.run_pipe(
|
726 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
727 |
+
negative_prompt=negative_prompt,
|
728 |
+
control_image=control_image,
|
729 |
+
num_images=num_images,
|
730 |
+
num_steps=num_steps,
|
731 |
+
guidance_scale=guidance_scale,
|
732 |
+
seed=seed,
|
733 |
+
)
|
734 |
+
return [control_image, *results]
|
735 |
+
|
736 |
+
@torch.inference_mode()
|
737 |
+
def process_depth(
|
738 |
+
self,
|
739 |
+
image: np.ndarray,
|
740 |
+
prompt: str,
|
741 |
+
additional_prompt: str,
|
742 |
+
negative_prompt: str,
|
743 |
+
num_images: int,
|
744 |
+
image_resolution: int,
|
745 |
+
preprocess_resolution: int,
|
746 |
+
num_steps: int,
|
747 |
+
guidance_scale: float,
|
748 |
+
seed: int,
|
749 |
+
preprocessor_name: str,
|
750 |
+
) -> list[PIL.Image.Image]:
|
751 |
+
if image is None:
|
752 |
+
raise ValueError
|
753 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
754 |
+
raise ValueError
|
755 |
+
if num_images > MAX_NUM_IMAGES:
|
756 |
+
raise ValueError
|
757 |
+
|
758 |
+
if preprocessor_name == "None":
|
759 |
+
image = HWC3(image)
|
760 |
+
image = resize_image(image, resolution=image_resolution)
|
761 |
+
control_image = PIL.Image.fromarray(image)
|
762 |
+
else:
|
763 |
+
self.preprocessor.load(preprocessor_name)
|
764 |
+
control_image = self.preprocessor(
|
765 |
+
image=image,
|
766 |
+
image_resolution=image_resolution,
|
767 |
+
detect_resolution=preprocess_resolution,
|
768 |
+
)
|
769 |
+
self.load_controlnet_weight("depth")
|
770 |
+
results = self.run_pipe(
|
771 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
772 |
+
negative_prompt=negative_prompt,
|
773 |
+
control_image=control_image,
|
774 |
+
num_images=num_images,
|
775 |
+
num_steps=num_steps,
|
776 |
+
guidance_scale=guidance_scale,
|
777 |
+
seed=seed,
|
778 |
+
)
|
779 |
+
return [control_image, *results]
|
780 |
+
|
781 |
+
@torch.inference_mode()
|
782 |
+
def process_normal(
|
783 |
+
self,
|
784 |
+
image: np.ndarray,
|
785 |
+
prompt: str,
|
786 |
+
additional_prompt: str,
|
787 |
+
negative_prompt: str,
|
788 |
+
num_images: int,
|
789 |
+
image_resolution: int,
|
790 |
+
preprocess_resolution: int,
|
791 |
+
num_steps: int,
|
792 |
+
guidance_scale: float,
|
793 |
+
seed: int,
|
794 |
+
preprocessor_name: str,
|
795 |
+
) -> list[PIL.Image.Image]:
|
796 |
+
if image is None:
|
797 |
+
raise ValueError
|
798 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
799 |
+
raise ValueError
|
800 |
+
if num_images > MAX_NUM_IMAGES:
|
801 |
+
raise ValueError
|
802 |
+
|
803 |
+
if preprocessor_name == "None":
|
804 |
+
image = HWC3(image)
|
805 |
+
image = resize_image(image, resolution=image_resolution)
|
806 |
+
control_image = PIL.Image.fromarray(image)
|
807 |
+
else:
|
808 |
+
self.preprocessor.load("NormalBae")
|
809 |
+
control_image = self.preprocessor(
|
810 |
+
image=image,
|
811 |
+
image_resolution=image_resolution,
|
812 |
+
detect_resolution=preprocess_resolution,
|
813 |
+
)
|
814 |
+
self.load_controlnet_weight("NormalBae")
|
815 |
+
results = self.run_pipe(
|
816 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
817 |
+
negative_prompt=negative_prompt,
|
818 |
+
control_image=control_image,
|
819 |
+
num_images=num_images,
|
820 |
+
num_steps=num_steps,
|
821 |
+
guidance_scale=guidance_scale,
|
822 |
+
seed=seed,
|
823 |
+
)
|
824 |
+
return [control_image, *results]
|
825 |
+
|
826 |
+
@torch.inference_mode()
|
827 |
+
def process_lineart(
|
828 |
+
self,
|
829 |
+
image: np.ndarray,
|
830 |
+
prompt: str,
|
831 |
+
additional_prompt: str,
|
832 |
+
negative_prompt: str,
|
833 |
+
num_images: int,
|
834 |
+
image_resolution: int,
|
835 |
+
preprocess_resolution: int,
|
836 |
+
num_steps: int,
|
837 |
+
guidance_scale: float,
|
838 |
+
seed: int,
|
839 |
+
preprocessor_name: str,
|
840 |
+
) -> list[PIL.Image.Image]:
|
841 |
+
if image is None:
|
842 |
+
raise ValueError
|
843 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
844 |
+
raise ValueError
|
845 |
+
if num_images > MAX_NUM_IMAGES:
|
846 |
+
raise ValueError
|
847 |
+
|
848 |
+
if preprocessor_name in ["None", "None (anime)"]:
|
849 |
+
image = HWC3(image)
|
850 |
+
image = resize_image(image, resolution=image_resolution)
|
851 |
+
control_image = PIL.Image.fromarray(image)
|
852 |
+
elif preprocessor_name in ["Lineart", "Lineart coarse"]:
|
853 |
+
coarse = "coarse" in preprocessor_name
|
854 |
+
self.preprocessor.load("Lineart")
|
855 |
+
control_image = self.preprocessor(
|
856 |
+
image=image,
|
857 |
+
image_resolution=image_resolution,
|
858 |
+
detect_resolution=preprocess_resolution,
|
859 |
+
coarse=coarse,
|
860 |
+
)
|
861 |
+
elif preprocessor_name == "Lineart (anime)":
|
862 |
+
self.preprocessor.load("LineartAnime")
|
863 |
+
control_image = self.preprocessor(
|
864 |
+
image=image,
|
865 |
+
image_resolution=image_resolution,
|
866 |
+
detect_resolution=preprocess_resolution,
|
867 |
+
)
|
868 |
+
if "anime" in preprocessor_name:
|
869 |
+
self.load_controlnet_weight("lineart_anime")
|
870 |
+
else:
|
871 |
+
self.load_controlnet_weight("lineart")
|
872 |
+
results = self.run_pipe(
|
873 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
874 |
+
negative_prompt=negative_prompt,
|
875 |
+
control_image=control_image,
|
876 |
+
num_images=num_images,
|
877 |
+
num_steps=num_steps,
|
878 |
+
guidance_scale=guidance_scale,
|
879 |
+
seed=seed,
|
880 |
+
)
|
881 |
+
return [control_image, *results]
|
882 |
+
|
883 |
+
@torch.inference_mode()
|
884 |
+
def process_shuffle(
|
885 |
+
self,
|
886 |
+
image: np.ndarray,
|
887 |
+
prompt: str,
|
888 |
+
additional_prompt: str,
|
889 |
+
negative_prompt: str,
|
890 |
+
num_images: int,
|
891 |
+
image_resolution: int,
|
892 |
+
num_steps: int,
|
893 |
+
guidance_scale: float,
|
894 |
+
seed: int,
|
895 |
+
preprocessor_name: str,
|
896 |
+
) -> list[PIL.Image.Image]:
|
897 |
+
if image is None:
|
898 |
+
raise ValueError
|
899 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
900 |
+
raise ValueError
|
901 |
+
if num_images > MAX_NUM_IMAGES:
|
902 |
+
raise ValueError
|
903 |
+
|
904 |
+
if preprocessor_name == "None":
|
905 |
+
image = HWC3(image)
|
906 |
+
image = resize_image(image, resolution=image_resolution)
|
907 |
+
control_image = PIL.Image.fromarray(image)
|
908 |
+
else:
|
909 |
+
self.preprocessor.load(preprocessor_name)
|
910 |
+
control_image = self.preprocessor(
|
911 |
+
image=image,
|
912 |
+
image_resolution=image_resolution,
|
913 |
+
)
|
914 |
+
self.load_controlnet_weight("shuffle")
|
915 |
+
results = self.run_pipe(
|
916 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
917 |
+
negative_prompt=negative_prompt,
|
918 |
+
control_image=control_image,
|
919 |
+
num_images=num_images,
|
920 |
+
num_steps=num_steps,
|
921 |
+
guidance_scale=guidance_scale,
|
922 |
+
seed=seed,
|
923 |
+
)
|
924 |
+
return [control_image, *results]
|
925 |
+
|
926 |
+
@torch.inference_mode()
|
927 |
+
def process_ip2p(
|
928 |
+
self,
|
929 |
+
image: np.ndarray,
|
930 |
+
prompt: str,
|
931 |
+
additional_prompt: str,
|
932 |
+
negative_prompt: str,
|
933 |
+
num_images: int,
|
934 |
+
image_resolution: int,
|
935 |
+
num_steps: int,
|
936 |
+
guidance_scale: float,
|
937 |
+
seed: int,
|
938 |
+
) -> list[PIL.Image.Image]:
|
939 |
+
if image is None:
|
940 |
+
raise ValueError
|
941 |
+
if image_resolution > MAX_IMAGE_RESOLUTION:
|
942 |
+
raise ValueError
|
943 |
+
if num_images > MAX_NUM_IMAGES:
|
944 |
+
raise ValueError
|
945 |
+
|
946 |
+
image = HWC3(image)
|
947 |
+
image = resize_image(image, resolution=image_resolution)
|
948 |
+
control_image = PIL.Image.fromarray(image)
|
949 |
+
self.load_controlnet_weight("ip2p")
|
950 |
+
results = self.run_pipe(
|
951 |
+
prompt=self.get_prompt(prompt, additional_prompt),
|
952 |
+
negative_prompt=negative_prompt,
|
953 |
+
control_image=control_image,
|
954 |
+
num_images=num_images,
|
955 |
+
num_steps=num_steps,
|
956 |
+
guidance_scale=guidance_scale,
|
957 |
+
seed=seed,
|
958 |
+
)
|
959 |
+
return [control_image, *results]
|
preprocessor.py
ADDED
@@ -0,0 +1,120 @@
1 |
+
import gc
import warnings  # needed by TexnetPreprocessor.__call__ for its deprecation warning
|
2 |
+
from typing import TYPE_CHECKING
|
3 |
+
|
4 |
+
if TYPE_CHECKING:
|
5 |
+
from collections.abc import Callable
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import PIL.Image
|
9 |
+
import torch
|
10 |
+
from controlnet_aux import (
|
11 |
+
CannyDetector,
|
12 |
+
ContentShuffleDetector,
|
13 |
+
HEDdetector,
|
14 |
+
LineartAnimeDetector,
|
15 |
+
LineartDetector,
|
16 |
+
MidasDetector,
|
17 |
+
MLSDdetector,
|
18 |
+
NormalBaeDetector,
|
19 |
+
OpenposeDetector,
|
20 |
+
PidiNetDetector,
|
21 |
+
)
|
22 |
+
from controlnet_aux.util import HWC3
|
23 |
+
|
24 |
+
from cv_utils import resize_image
|
25 |
+
from depth_estimator import DepthEstimator
|
26 |
+
from image_segmentor import ImageSegmentor
|
27 |
+
|
28 |
+
|
29 |
+
class Preprocessor:
|
30 |
+
MODEL_ID = "lllyasviel/Annotators"
|
31 |
+
|
32 |
+
def __init__(self) -> None:
|
33 |
+
self.model: Callable = None # type: ignore
|
34 |
+
self.name = ""
|
35 |
+
|
36 |
+
def load(self, name: str) -> None: # noqa: C901, PLR0912
|
37 |
+
if name == self.name:
|
38 |
+
return
|
39 |
+
if name == "HED":
|
40 |
+
self.model = HEDdetector.from_pretrained(self.MODEL_ID)
|
41 |
+
elif name == "Midas":
|
42 |
+
self.model = MidasDetector.from_pretrained(self.MODEL_ID)
|
43 |
+
elif name == "MLSD":
|
44 |
+
self.model = MLSDdetector.from_pretrained(self.MODEL_ID)
|
45 |
+
elif name == "Openpose":
|
46 |
+
self.model = OpenposeDetector.from_pretrained(self.MODEL_ID)
|
47 |
+
elif name == "PidiNet":
|
48 |
+
self.model = PidiNetDetector.from_pretrained(self.MODEL_ID)
|
49 |
+
elif name == "NormalBae":
|
50 |
+
self.model = NormalBaeDetector.from_pretrained(self.MODEL_ID)
|
51 |
+
elif name == "Lineart":
|
52 |
+
self.model = LineartDetector.from_pretrained(self.MODEL_ID)
|
53 |
+
elif name == "LineartAnime":
|
54 |
+
self.model = LineartAnimeDetector.from_pretrained(self.MODEL_ID)
|
55 |
+
elif name == "Canny":
|
56 |
+
self.model = CannyDetector()
|
57 |
+
elif name == "ContentShuffle":
|
58 |
+
self.model = ContentShuffleDetector()
|
59 |
+
elif name == "DPT":
|
60 |
+
self.model = DepthEstimator()
|
61 |
+
elif name == "UPerNet":
|
62 |
+
self.model = ImageSegmentor()
|
63 |
+
elif name == 'texnet':
|
64 |
+
self.model = TexnetPreprocessor()
|
65 |
+
else:
|
66 |
+
raise ValueError
|
67 |
+
torch.cuda.empty_cache()
|
68 |
+
gc.collect()
|
69 |
+
self.name = name
|
70 |
+
|
71 |
+
def __call__(self, image: PIL.Image.Image, **kwargs) -> PIL.Image.Image: # noqa: ANN003
|
72 |
+
if self.name == "Canny":
|
73 |
+
if "detect_resolution" in kwargs:
|
74 |
+
detect_resolution = kwargs.pop("detect_resolution")
|
75 |
+
image = np.array(image)
|
76 |
+
image = HWC3(image)
|
77 |
+
image = resize_image(image, resolution=detect_resolution)
|
78 |
+
image = self.model(image, **kwargs)
|
79 |
+
return PIL.Image.fromarray(image)
|
80 |
+
if self.name == "Midas":
|
81 |
+
detect_resolution = kwargs.pop("detect_resolution", 512)
|
82 |
+
image_resolution = kwargs.pop("image_resolution", 512)
|
83 |
+
image = np.array(image)
|
84 |
+
image = HWC3(image)
|
85 |
+
image = resize_image(image, resolution=detect_resolution)
|
86 |
+
image = self.model(image, **kwargs)
|
87 |
+
image = HWC3(image)
|
88 |
+
image = resize_image(image, resolution=image_resolution)
|
89 |
+
return PIL.Image.fromarray(image)
|
90 |
+
return self.model(image, **kwargs)
|
91 |
+
|
92 |
+
|
93 |
+
# https://github.com/huggingface/controlnet_aux/blob/master/src/controlnet_aux/canny/__init__.py
|
94 |
+
class TexnetPreprocessor:
|
95 |
+
def __call__(self, input_image=None, low_threshold=100, high_threshold=200, image_resolution=512, output_type=None, **kwargs):
|
96 |
+
if "img" in kwargs:
|
97 |
+
warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
|
98 |
+
input_image = kwargs.pop("img")
|
99 |
+
|
100 |
+
if input_image is None:
|
101 |
+
raise ValueError("input_image must be defined.")
|
102 |
+
|
103 |
+
if not isinstance(input_image, np.ndarray):
|
104 |
+
input_image = np.array(input_image, dtype=np.uint8)
|
105 |
+
output_type = output_type or "pil"
|
106 |
+
else:
|
107 |
+
output_type = output_type or "np"
|
108 |
+
|
109 |
+
input_image = HWC3(input_image)
|
110 |
+
input_image = resize_image(input_image, image_resolution)
|
111 |
+
H, W, C = input_image.shape
|
112 |
+
|
113 |
+
# detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
|
114 |
+
output_image = input_image.copy()
|
115 |
+
|
116 |
+
if output_type == "pil":
|
117 |
+
# detected_map = Image.fromarray(detected_map)
|
118 |
+
output_image = PIL.Image.fromarray(output_image)
|
119 |
+
|
120 |
+
return output_image
|
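For reference, a minimal sketch of driving the Preprocessor wrapper above; the example image is one of the bundled example frames, and the output filename is a placeholder.

# Sketch: produce a Canny control image with the Preprocessor wrapper above.
import PIL.Image
from preprocessor import Preprocessor

preprocessor = Preprocessor()
preprocessor.load("Canny")                 # instantiates and caches CannyDetector
control = preprocessor(
    image=PIL.Image.open("examples/monkey/frame_0001.png").convert("RGB"),
    low_threshold=100,
    high_threshold=200,
    detect_resolution=512,
)
control.save("canny_control.png")          # PIL.Image with the detected edges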
push_dataset.py
ADDED
@@ -0,0 +1,9 @@
1 |
+
from huggingface_hub import HfApi
|
2 |
+
api = HfApi()
|
3 |
+
|
4 |
+
api.upload_folder(
|
5 |
+
folder_path="./examples",
|
6 |
+
repo_id="jingyangcarl/matgen",
|
7 |
+
repo_type="space",
|
8 |
+
path_in_repo="examples", # Upload to a specific folder
|
9 |
+
)
|
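push_dataset.py assumes the environment is already authenticated against the Hub; a minimal sketch of the login step is shown below (the token value is a placeholder).

# Authenticate once before running push_dataset.py; the token string is a placeholder.
from huggingface_hub import login

login(token="hf_xxxxxxxxxxxxxxxx")  # or run `huggingface-cli login` interactively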
rgb2x/generate_blend.py
ADDED
@@ -0,0 +1,142 @@
1 |
+
import bpy
|
2 |
+
import sys
|
3 |
+
import os
|
4 |
+
|
5 |
+
def create_tex_node(nodes, img_path, label, color_space, location):
|
6 |
+
img = bpy.data.images.load(img_path)
|
7 |
+
tex = nodes.new(type='ShaderNodeTexImage')
|
8 |
+
tex.image = img
|
9 |
+
tex.label = label
|
10 |
+
tex.location = location
|
11 |
+
tex.image.colorspace_settings.name = color_space
|
12 |
+
return tex
|
13 |
+
|
14 |
+
def setup_environment_lighting(hdri_path):
|
15 |
+
if not bpy.data.worlds:
|
16 |
+
bpy.data.worlds.new(name="World")
|
17 |
+
if bpy.context.scene.world is None:
|
18 |
+
bpy.context.scene.world = bpy.data.worlds[0]
|
19 |
+
world = bpy.context.scene.world
|
20 |
+
|
21 |
+
world.use_nodes = True
|
22 |
+
nodes = world.node_tree.nodes
|
23 |
+
links = world.node_tree.links
|
24 |
+
nodes.clear()
|
25 |
+
|
26 |
+
env_tex = nodes.new(type="ShaderNodeTexEnvironment")
|
27 |
+
env_tex.image = bpy.data.images.load(hdri_path)
|
28 |
+
env_tex.location = (-300, 0)
|
29 |
+
|
30 |
+
bg = nodes.new(type="ShaderNodeBackground")
|
31 |
+
bg.location = (0, 0)
|
32 |
+
|
33 |
+
output = nodes.new(type="ShaderNodeOutputWorld")
|
34 |
+
output.location = (300, 0)
|
35 |
+
|
36 |
+
links.new(env_tex.outputs["Color"], bg.inputs["Color"])
|
37 |
+
links.new(bg.outputs["Background"], output.inputs["Surface"])
|
38 |
+
|
39 |
+
def setup_gpu_rendering():
|
40 |
+
bpy.context.scene.render.engine = 'CYCLES'
|
41 |
+
prefs = bpy.context.preferences
|
42 |
+
cprefs = prefs.addons['cycles'].preferences
|
43 |
+
|
44 |
+
# Choose backend depending on GPU type: 'CUDA', 'OPTIX', 'HIP', 'METAL'
|
45 |
+
cprefs.compute_device_type = 'CUDA'
|
46 |
+
bpy.context.scene.cycles.device = 'GPU'
|
47 |
+
|
48 |
+
def generate_blend(obj_path, base_color_path, normal_map_path, roughness_path, metallic_path, output_blend):
|
49 |
+
# Reset scene
|
50 |
+
bpy.ops.wm.read_factory_settings(use_empty=True)
|
51 |
+
|
52 |
+
# Import OBJ
|
53 |
+
bpy.ops.import_scene.obj(filepath=obj_path)
|
54 |
+
obj = bpy.context.selected_objects[0]
|
55 |
+
|
56 |
+
# Create material
|
57 |
+
mat = bpy.data.materials.new(name="BRDF_Material")
|
58 |
+
mat.use_nodes = True
|
59 |
+
nodes = mat.node_tree.nodes
|
60 |
+
links = mat.node_tree.links
|
61 |
+
nodes.clear()
|
62 |
+
|
63 |
+
output = nodes.new(type='ShaderNodeOutputMaterial')
|
64 |
+
output.location = (400, 0)
|
65 |
+
|
66 |
+
principled = nodes.new(type='ShaderNodeBsdfPrincipled')
|
67 |
+
principled.location = (100, 0)
|
68 |
+
links.new(principled.outputs['BSDF'], output.inputs['Surface'])
|
69 |
+
|
70 |
+
# Base Color
|
71 |
+
base_color = create_tex_node(nodes, base_color_path, "Base Color", 'sRGB', (-600, 200))
|
72 |
+
links.new(base_color.outputs['Color'], principled.inputs['Base Color'])
|
73 |
+
|
74 |
+
# Roughness
|
75 |
+
rough = create_tex_node(nodes, roughness_path, "Roughness", 'Non-Color', (-600, 0))
|
76 |
+
links.new(rough.outputs['Color'], principled.inputs['Roughness'])
|
77 |
+
|
78 |
+
# Metallic
|
79 |
+
metal = create_tex_node(nodes, metallic_path, "Metallic", 'Non-Color', (-600, -200))
|
80 |
+
links.new(metal.outputs['Color'], principled.inputs['Metallic'])
|
81 |
+
|
82 |
+
# Normal Map
|
83 |
+
normal_tex = create_tex_node(nodes, normal_map_path, "Normal Map", 'Non-Color', (-800, -400))
|
84 |
+
normal_map = nodes.new(type='ShaderNodeNormalMap')
|
85 |
+
normal_map.location = (-400, -400)
|
86 |
+
links.new(normal_tex.outputs['Color'], normal_map.inputs['Color'])
|
87 |
+
links.new(normal_map.outputs['Normal'], principled.inputs['Normal'])
|
88 |
+
|
89 |
+
# Assign material
|
90 |
+
if obj.data.materials:
|
91 |
+
obj.data.materials[0] = mat
|
92 |
+
else:
|
93 |
+
obj.data.materials.append(mat)
|
94 |
+
|
95 |
+
# Global Illumination using Blender's default forest HDRI
|
96 |
+
blender_data_path = bpy.utils.resource_path('LOCAL')
|
97 |
+
forest_hdri_path = os.path.join(blender_data_path, "datafiles", "studiolights", "world", "forest.exr")
|
98 |
+
print(f"Using HDRI: {forest_hdri_path}")
|
99 |
+
setup_environment_lighting(forest_hdri_path)
|
100 |
+
|
101 |
+
# GPU rendering setup
|
102 |
+
setup_gpu_rendering()
|
103 |
+
|
104 |
+
# Pack textures into .blend
|
105 |
+
bpy.ops.file.pack_all()
|
106 |
+
|
107 |
+
# Set the 3D View to Rendered mode and focus on object
|
108 |
+
for area in bpy.context.screen.areas:
|
109 |
+
if area.type == 'VIEW_3D':
|
110 |
+
for space in area.spaces:
|
111 |
+
if space.type == 'VIEW_3D':
|
112 |
+
space.shading.type = 'RENDERED' # Set viewport shading to Rendered
|
113 |
+
for region in area.regions:
|
114 |
+
if region.type == 'WINDOW':
|
115 |
+
override = {'area': area, 'region': region, 'scene': bpy.context.scene}
|
116 |
+
bpy.ops.view3d.view_all(override, center=True)
|
117 |
+
|
118 |
+
elif area.type == 'NODE_EDITOR':
|
119 |
+
for space in area.spaces:
|
120 |
+
if space.type == 'NODE_EDITOR':
|
121 |
+
space.tree_type = 'ShaderNodeTree' # Switch to Shader Editor
|
122 |
+
space.shader_type = 'OBJECT'
|
123 |
+
|
124 |
+
# Optional: Switch active workspace to Shading (if it exists)
|
125 |
+
for workspace in bpy.data.workspaces:
|
126 |
+
if workspace.name == 'Shading':
|
127 |
+
bpy.context.window.workspace = workspace
|
128 |
+
break
|
129 |
+
|
130 |
+
# Save the .blend file
|
131 |
+
bpy.ops.wm.save_as_mainfile(filepath=output_blend)
|
132 |
+
print(f"✅ Saved .blend file with BRDF, HDRI, GPU: {output_blend}")
|
133 |
+
|
134 |
+
if __name__ == "__main__":
|
135 |
+
argv = sys.argv
|
136 |
+
argv = argv[argv.index("--") + 1:] # Only use args after "--"
|
137 |
+
|
138 |
+
if len(argv) != 6:
|
139 |
+
print("Usage:\n blender --background --python generate_blend.py -- obj base_color normal roughness metallic output.blend")
|
140 |
+
sys.exit(1)
|
141 |
+
|
142 |
+
generate_blend(*argv)
|
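Not part of the repo, but as a sanity check the packed .blend written by generate_blend() can be rendered headlessly with the same Blender Python API. A minimal sketch with placeholder paths; note that generate_blend() does not add a camera, so one is created here.

# Sketch: render a preview still from the packed .blend (run inside Blender's Python).
import bpy

bpy.ops.wm.open_mainfile(filepath="output/monkey_textured.blend")  # placeholder path
scene = bpy.context.scene

# The .blend written by generate_blend() contains no camera, so add one for the preview.
cam_data = bpy.data.cameras.new("PreviewCam")
cam_obj = bpy.data.objects.new("PreviewCam", cam_data)
scene.collection.objects.link(cam_obj)
cam_obj.location = (0.0, -3.0, 1.5)
cam_obj.rotation_euler = (1.2, 0.0, 0.0)
scene.camera = cam_obj

scene.render.resolution_x = 512
scene.render.resolution_y = 512
scene.render.filepath = "/tmp/preview.png"                          # placeholder path
bpy.ops.render.render(write_still=True)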
rgb2x/gradio_demo_rgb2x.py
ADDED
@@ -0,0 +1,157 @@
1 |
+
import os
|
2 |
+
|
3 |
+
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import torch
|
7 |
+
import torchvision
|
8 |
+
from diffusers import DDIMScheduler
|
9 |
+
from load_image import load_exr_image, load_ldr_image
|
10 |
+
from pipeline_rgb2x import StableDiffusionAOVMatEstPipeline
|
11 |
+
|
12 |
+
current_directory = os.path.dirname(os.path.abspath(__file__))
|
13 |
+
|
14 |
+
|
15 |
+
def get_rgb2x_demo():
|
16 |
+
# Load pipeline
|
17 |
+
pipe = StableDiffusionAOVMatEstPipeline.from_pretrained(
|
18 |
+
"zheng95z/rgb-to-x",
|
19 |
+
torch_dtype=torch.float16,
|
20 |
+
cache_dir=os.path.join(current_directory, "model_cache"),
|
21 |
+
).to("cuda")
|
22 |
+
pipe.scheduler = DDIMScheduler.from_config(
|
23 |
+
pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
|
24 |
+
)
|
25 |
+
pipe.set_progress_bar_config(disable=True)
|
26 |
+
pipe.to("cuda")
|
27 |
+
|
28 |
+
# Augmentation
|
29 |
+
def callback(
|
30 |
+
photo,
|
31 |
+
seed,
|
32 |
+
inference_step,
|
33 |
+
num_samples,
|
34 |
+
):
|
35 |
+
generator = torch.Generator(device="cuda").manual_seed(seed)
|
36 |
+
|
37 |
+
if photo.name.endswith(".exr"):
|
38 |
+
photo = load_exr_image(photo.name, tonemaping=True, clamp=True).to("cuda")
|
39 |
+
elif (
|
40 |
+
photo.name.endswith(".png")
|
41 |
+
or photo.name.endswith(".jpg")
|
42 |
+
or photo.name.endswith(".jpeg")
|
43 |
+
):
|
44 |
+
photo = load_ldr_image(photo.name, from_srgb=True).to("cuda")
|
45 |
+
|
46 |
+
# Resize so that the longer side is at most 1000 px and both dimensions are multiples of 8 (required by the diffusion VAE)
|
47 |
+
old_height = photo.shape[1]
|
48 |
+
old_width = photo.shape[2]
|
49 |
+
new_height = old_height
|
50 |
+
new_width = old_width
|
51 |
+
radio = old_height / old_width
|
52 |
+
max_side = 1000
|
53 |
+
if old_height > old_width:
|
54 |
+
new_height = max_side
|
55 |
+
new_width = int(new_height / radio)
|
56 |
+
else:
|
57 |
+
new_width = max_side
|
58 |
+
new_height = int(new_width * radio)
|
59 |
+
|
60 |
+
if new_width % 8 != 0 or new_height % 8 != 0:
|
61 |
+
new_width = new_width // 8 * 8
|
62 |
+
new_height = new_height // 8 * 8
|
63 |
+
|
64 |
+
photo = torchvision.transforms.Resize((new_height, new_width))(photo)
|
65 |
+
|
66 |
+
required_aovs = ["albedo", "normal", "roughness", "metallic", "irradiance"]
|
67 |
+
prompts = {
|
68 |
+
"albedo": "Albedo (diffuse basecolor)",
|
69 |
+
"normal": "Camera-space Normal",
|
70 |
+
"roughness": "Roughness",
|
71 |
+
"metallic": "Metallicness",
|
72 |
+
"irradiance": "Irradiance (diffuse lighting)",
|
73 |
+
}
|
74 |
+
|
75 |
+
return_list = []
|
76 |
+
for i in range(num_samples):
|
77 |
+
for aov_name in required_aovs:
|
78 |
+
prompt = prompts[aov_name]
|
79 |
+
generated_image = pipe(
|
80 |
+
prompt=prompt,
|
81 |
+
photo=photo,
|
82 |
+
num_inference_steps=inference_step,
|
83 |
+
height=new_height,
|
84 |
+
width=new_width,
|
85 |
+
generator=generator,
|
86 |
+
required_aovs=[aov_name],
|
87 |
+
).images[0][0]
|
88 |
+
|
89 |
+
generated_image = torchvision.transforms.Resize(
|
90 |
+
(old_height, old_width)
|
91 |
+
)(generated_image)
|
92 |
+
|
93 |
+
generated_image = (generated_image, f"Generated {aov_name} {i}")
|
94 |
+
return_list.append(generated_image)
|
95 |
+
|
96 |
+
return return_list
|
97 |
+
|
98 |
+
block = gr.Blocks()
|
99 |
+
with block:
|
100 |
+
with gr.Row():
|
101 |
+
gr.Markdown("## Model RGB -> X (Realistic image -> Intrinsic channels)")
|
102 |
+
with gr.Row():
|
103 |
+
# Input side
|
104 |
+
with gr.Column():
|
105 |
+
gr.Markdown("### Given Image")
|
106 |
+
photo = gr.File(label="Photo", file_types=[".exr", ".png", ".jpg"])
|
107 |
+
|
108 |
+
gr.Markdown("### Parameters")
|
109 |
+
run_button = gr.Button(value="Run")
|
110 |
+
with gr.Accordion("Advanced options", open=False):
|
111 |
+
seed = gr.Slider(
|
112 |
+
label="Seed",
|
113 |
+
minimum=-1,
|
114 |
+
maximum=2147483647,
|
115 |
+
step=1,
|
116 |
+
randomize=True,
|
117 |
+
)
|
118 |
+
inference_step = gr.Slider(
|
119 |
+
label="Inference Step",
|
120 |
+
minimum=1,
|
121 |
+
maximum=100,
|
122 |
+
step=1,
|
123 |
+
value=50,
|
124 |
+
)
|
125 |
+
num_samples = gr.Slider(
|
126 |
+
label="Samples",
|
127 |
+
minimum=1,
|
128 |
+
maximum=100,
|
129 |
+
step=1,
|
130 |
+
value=1,
|
131 |
+
)
|
132 |
+
|
133 |
+
# Output side
|
134 |
+
with gr.Column():
|
135 |
+
gr.Markdown("### Output Gallery")
|
136 |
+
result_gallery = gr.Gallery(
|
137 |
+
label="Output",
|
138 |
+
show_label=False,
|
139 |
+
elem_id="gallery",
|
140 |
+
columns=2,
|
141 |
+
)
|
142 |
+
|
143 |
+
inputs = [
|
144 |
+
photo,
|
145 |
+
seed,
|
146 |
+
inference_step,
|
147 |
+
num_samples,
|
148 |
+
]
|
149 |
+
run_button.click(fn=callback, inputs=inputs, outputs=result_gallery, queue=True)
|
150 |
+
|
151 |
+
return block
|
152 |
+
|
153 |
+
|
154 |
+
if __name__ == "__main__":
|
155 |
+
demo = get_rgb2x_demo()
|
156 |
+
demo.queue(max_size=1)
|
157 |
+
demo.launch()
|
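The Blocks returned by get_rgb2x_demo() can also be mounted inside a larger Gradio app instead of being launched standalone; a minimal sketch follows (the tab title and import path are illustrative).

# Sketch: mount the rgb2x demo as one tab of a combined Gradio app.
import gradio as gr
from gradio_demo_rgb2x import get_rgb2x_demo  # run from inside the rgb2x/ directory

app = gr.TabbedInterface([get_rgb2x_demo()], ["RGB -> X"])
app.queue(max_size=1)
app.launch()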
rgb2x/load_image.py
ADDED
@@ -0,0 +1,119 @@
1 |
+
import os
|
2 |
+
|
3 |
+
import cv2
|
4 |
+
import torch
|
5 |
+
|
6 |
+
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
|
10 |
+
def convert_rgb_2_XYZ(rgb):
|
11 |
+
# Reference: https://web.archive.org/web/20191027010220/http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
|
12 |
+
# rgb: (h, w, 3)
|
13 |
+
# XYZ: (h, w, 3)
|
14 |
+
XYZ = torch.ones_like(rgb)
|
15 |
+
XYZ[:, :, 0] = (
|
16 |
+
0.4124564 * rgb[:, :, 0] + 0.3575761 * rgb[:, :, 1] + 0.1804375 * rgb[:, :, 2]
|
17 |
+
)
|
18 |
+
XYZ[:, :, 1] = (
|
19 |
+
0.2126729 * rgb[:, :, 0] + 0.7151522 * rgb[:, :, 1] + 0.0721750 * rgb[:, :, 2]
|
20 |
+
)
|
21 |
+
XYZ[:, :, 2] = (
|
22 |
+
0.0193339 * rgb[:, :, 0] + 0.1191920 * rgb[:, :, 1] + 0.9503041 * rgb[:, :, 2]
|
23 |
+
)
|
24 |
+
return XYZ
|
25 |
+
|
26 |
+
|
27 |
+
def convert_XYZ_2_Yxy(XYZ):
|
28 |
+
# XYZ: (h, w, 3)
|
29 |
+
# Yxy: (h, w, 3)
|
30 |
+
Yxy = torch.ones_like(XYZ)
|
31 |
+
Yxy[:, :, 0] = XYZ[:, :, 1]
|
32 |
+
sum = torch.sum(XYZ, dim=2)
|
33 |
+
inv_sum = 1.0 / torch.clamp(sum, min=1e-4)
|
34 |
+
Yxy[:, :, 1] = XYZ[:, :, 0] * inv_sum
|
35 |
+
Yxy[:, :, 2] = XYZ[:, :, 1] * inv_sum
|
36 |
+
return Yxy
|
37 |
+
|
38 |
+
|
39 |
+
def convert_rgb_2_Yxy(rgb):
|
40 |
+
# rgb: (h, w, 3)
|
41 |
+
# Yxy: (h, w, 3)
|
42 |
+
return convert_XYZ_2_Yxy(convert_rgb_2_XYZ(rgb))
|
43 |
+
|
44 |
+
|
45 |
+
def convert_XYZ_2_rgb(XYZ):
|
46 |
+
# XYZ: (h, w, 3)
|
47 |
+
# rgb: (h, w, 3)
|
48 |
+
rgb = torch.ones_like(XYZ)
|
49 |
+
rgb[:, :, 0] = (
|
50 |
+
3.2404542 * XYZ[:, :, 0] - 1.5371385 * XYZ[:, :, 1] - 0.4985314 * XYZ[:, :, 2]
|
51 |
+
)
|
52 |
+
rgb[:, :, 1] = (
|
53 |
+
-0.9692660 * XYZ[:, :, 0] + 1.8760108 * XYZ[:, :, 1] + 0.0415560 * XYZ[:, :, 2]
|
54 |
+
)
|
55 |
+
rgb[:, :, 2] = (
|
56 |
+
0.0556434 * XYZ[:, :, 0] - 0.2040259 * XYZ[:, :, 1] + 1.0572252 * XYZ[:, :, 2]
|
57 |
+
)
|
58 |
+
return rgb
|
59 |
+
|
60 |
+
|
61 |
+
def convert_Yxy_2_XYZ(Yxy):
|
62 |
+
# Yxy: (h, w, 3)
|
63 |
+
# XYZ: (h, w, 3)
|
64 |
+
XYZ = torch.ones_like(Yxy)
|
65 |
+
XYZ[:, :, 0] = Yxy[:, :, 1] / torch.clamp(Yxy[:, :, 2], min=1e-6) * Yxy[:, :, 0]
|
66 |
+
XYZ[:, :, 1] = Yxy[:, :, 0]
|
67 |
+
XYZ[:, :, 2] = (
|
68 |
+
(1.0 - Yxy[:, :, 1] - Yxy[:, :, 2])
|
69 |
+
/ torch.clamp(Yxy[:, :, 2], min=1e-4)
|
70 |
+
* Yxy[:, :, 0]
|
71 |
+
)
|
72 |
+
return XYZ
|
73 |
+
|
74 |
+
|
75 |
+
def convert_Yxy_2_rgb(Yxy):
|
76 |
+
# Yxy: (h, w, 3)
|
77 |
+
# rgb: (h, w, 3)
|
78 |
+
return convert_XYZ_2_rgb(convert_Yxy_2_XYZ(Yxy))
|
79 |
+
|
80 |
+
|
81 |
+
def load_ldr_image(image_path, from_srgb=False, clamp=False, normalize=False):
|
82 |
+
# Load png or jpg image
|
83 |
+
image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
|
84 |
+
image = torch.from_numpy(image.astype(np.float32) / 255.0) # (h, w, c)
|
85 |
+
image[~torch.isfinite(image)] = 0
|
86 |
+
if from_srgb:
|
87 |
+
# Convert from sRGB to linear RGB
|
88 |
+
image = image**2.2
|
89 |
+
if clamp:
|
90 |
+
image = torch.clamp(image, min=0.0, max=1.0)
|
91 |
+
if normalize:
|
92 |
+
# Normalize to [-1, 1]
|
93 |
+
image = image * 2.0 - 1.0
|
94 |
+
image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
|
95 |
+
return image.permute(2, 0, 1) # returns (c, h, w)
|
96 |
+
|
97 |
+
|
98 |
+
def load_exr_image(image_path, tonemaping=False, clamp=False, normalize=False):
|
99 |
+
image = cv2.cvtColor(cv2.imread(image_path, -1), cv2.COLOR_BGR2RGB)
|
100 |
+
image = torch.from_numpy(image.astype("float32")) # (h, w, c)
|
101 |
+
image[~torch.isfinite(image)] = 0
|
102 |
+
if tonemaping:
|
103 |
+
# Exposure adjustment
|
104 |
+
image_Yxy = convert_rgb_2_Yxy(image)
|
105 |
+
lum = (
|
106 |
+
image[:, :, 0:1] * 0.2125
|
107 |
+
+ image[:, :, 1:2] * 0.7154
|
108 |
+
+ image[:, :, 2:3] * 0.0721
|
109 |
+
)
|
110 |
+
lum = torch.log(torch.clamp(lum, min=1e-6))
|
111 |
+
lum_mean = torch.exp(torch.mean(lum))
|
112 |
+
lp = image_Yxy[:, :, 0:1] * 0.18 / torch.clamp(lum_mean, min=1e-6)
|
113 |
+
image_Yxy[:, :, 0:1] = lp
|
114 |
+
image = convert_Yxy_2_rgb(image_Yxy)
|
115 |
+
if clamp:
|
116 |
+
image = torch.clamp(image, min=0.0, max=1.0)
|
117 |
+
if normalize:
|
118 |
+
image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
|
119 |
+
return image.permute(2, 0, 1) # returns (c, h, w)
|
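For reference, a minimal sketch of using the loaders above to prepare the (c, h, w) tensor that the rgb2x pipeline expects; the image path points at one of the bundled examples.

# Sketch: load an sRGB texture as a linear-RGB tensor in (c, h, w) layout.
from load_image import load_ldr_image

photo = load_ldr_image("examples/monkey/frame_0001.png", from_srgb=True)
print(photo.shape)        # torch.Size([3, H, W]), values approximately in [0, 1]
photo = photo.to("cuda")  # the rgb2x pipeline runs on the GPU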
rgb2x/pipeline_rgb2x.py
ADDED
@@ -0,0 +1,821 @@
1 |
+
import inspect
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from typing import Callable, List, Optional, Union
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import PIL
|
7 |
+
import torch
|
8 |
+
from diffusers.configuration_utils import register_to_config
|
9 |
+
from diffusers.image_processor import VaeImageProcessor
|
10 |
+
from diffusers.loaders import (
|
11 |
+
LoraLoaderMixin,
|
12 |
+
TextualInversionLoaderMixin,
|
13 |
+
)
|
14 |
+
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
15 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
16 |
+
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
|
17 |
+
rescale_noise_cfg,
|
18 |
+
)
|
19 |
+
from diffusers.schedulers import KarrasDiffusionSchedulers
|
20 |
+
from diffusers.utils import (
|
21 |
+
CONFIG_NAME,
|
22 |
+
BaseOutput,
|
23 |
+
deprecate,
|
24 |
+
logging,
|
25 |
+
)
|
26 |
+
from diffusers.utils.torch_utils import randn_tensor
|
27 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
28 |
+
|
29 |
+
logger = logging.get_logger(__name__)
|
30 |
+
|
31 |
+
|
32 |
+
class VaeImageProcrssorAOV(VaeImageProcessor):
|
33 |
+
"""
|
34 |
+
Image processor for VAE AOV.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
do_resize (`bool`, *optional*, defaults to `True`):
|
38 |
+
Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
|
39 |
+
vae_scale_factor (`int`, *optional*, defaults to `8`):
|
40 |
+
VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
|
41 |
+
resample (`str`, *optional*, defaults to `lanczos`):
|
42 |
+
Resampling filter to use when resizing the image.
|
43 |
+
do_normalize (`bool`, *optional*, defaults to `True`):
|
44 |
+
Whether to normalize the image to [-1,1].
|
45 |
+
"""
|
46 |
+
|
47 |
+
config_name = CONFIG_NAME
|
48 |
+
|
49 |
+
@register_to_config
|
50 |
+
def __init__(
|
51 |
+
self,
|
52 |
+
do_resize: bool = True,
|
53 |
+
vae_scale_factor: int = 8,
|
54 |
+
resample: str = "lanczos",
|
55 |
+
do_normalize: bool = True,
|
56 |
+
):
|
57 |
+
super().__init__()
|
58 |
+
|
59 |
+
def postprocess(
|
60 |
+
self,
|
61 |
+
image: torch.FloatTensor,
|
62 |
+
output_type: str = "pil",
|
63 |
+
do_denormalize: Optional[List[bool]] = None,
|
64 |
+
do_gamma_correction: bool = True,
|
65 |
+
):
|
66 |
+
if not isinstance(image, torch.Tensor):
|
67 |
+
raise ValueError(
|
68 |
+
f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
|
69 |
+
)
|
70 |
+
if output_type not in ["latent", "pt", "np", "pil"]:
|
71 |
+
deprecation_message = (
|
72 |
+
f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
|
73 |
+
"`pil`, `np`, `pt`, `latent`"
|
74 |
+
)
|
75 |
+
deprecate(
|
76 |
+
"Unsupported output_type",
|
77 |
+
"1.0.0",
|
78 |
+
deprecation_message,
|
79 |
+
standard_warn=False,
|
80 |
+
)
|
81 |
+
output_type = "np"
|
82 |
+
|
83 |
+
if output_type == "latent":
|
84 |
+
return image
|
85 |
+
|
86 |
+
if do_denormalize is None:
|
87 |
+
do_denormalize = [self.config.do_normalize] * image.shape[0]
|
88 |
+
|
89 |
+
image = torch.stack(
|
90 |
+
[
|
91 |
+
self.denormalize(image[i]) if do_denormalize[i] else image[i]
|
92 |
+
for i in range(image.shape[0])
|
93 |
+
]
|
94 |
+
)
|
95 |
+
|
96 |
+
# Gamma correction
|
97 |
+
if do_gamma_correction:
|
98 |
+
image = torch.pow(image, 1.0 / 2.2)
|
99 |
+
|
100 |
+
if output_type == "pt":
|
101 |
+
return image
|
102 |
+
|
103 |
+
image = self.pt_to_numpy(image)
|
104 |
+
|
105 |
+
if output_type == "np":
|
106 |
+
return image
|
107 |
+
|
108 |
+
if output_type == "pil":
|
109 |
+
return self.numpy_to_pil(image)
|
110 |
+
|
111 |
+
def preprocess_normal(
|
112 |
+
self,
|
113 |
+
image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
|
114 |
+
height: Optional[int] = None,
|
115 |
+
width: Optional[int] = None,
|
116 |
+
) -> torch.Tensor:
|
117 |
+
image = torch.stack([image], axis=0)
|
118 |
+
return image
|
119 |
+
|
120 |
+
|
121 |
+
@dataclass
|
122 |
+
class StableDiffusionAOVPipelineOutput(BaseOutput):
|
123 |
+
"""
|
124 |
+
Output class for Stable Diffusion AOV pipelines.
|
125 |
+
|
126 |
+
Args:
|
127 |
+
images (`List[PIL.Image.Image]` or `np.ndarray`)
|
128 |
+
List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
|
129 |
+
num_channels)`.
|
130 |
+
nsfw_content_detected (`List[bool]`)
|
131 |
+
List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
|
132 |
+
`None` if safety checking could not be performed.
|
133 |
+
"""
|
134 |
+
|
135 |
+
images: Union[List[PIL.Image.Image], np.ndarray]
|
136 |
+
|
137 |
+
|
138 |
+
class StableDiffusionAOVMatEstPipeline(
|
139 |
+
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin
|
140 |
+
):
|
141 |
+
r"""
|
142 |
+
Pipeline for AOVs.
|
143 |
+
|
144 |
+
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
145 |
+
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
146 |
+
|
147 |
+
The pipeline also inherits the following loading methods:
|
148 |
+
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
|
149 |
+
- [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
|
150 |
+
- [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
|
151 |
+
|
152 |
+
Args:
|
153 |
+
vae ([`AutoencoderKL`]):
|
154 |
+
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
|
155 |
+
text_encoder ([`~transformers.CLIPTextModel`]):
|
156 |
+
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
|
157 |
+
tokenizer ([`~transformers.CLIPTokenizer`]):
|
158 |
+
A `CLIPTokenizer` to tokenize text.
|
159 |
+
unet ([`UNet2DConditionModel`]):
|
160 |
+
A `UNet2DConditionModel` to denoise the encoded image latents.
|
161 |
+
scheduler ([`SchedulerMixin`]):
|
162 |
+
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
163 |
+
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
164 |
+
"""
|
165 |
+
|
166 |
+
def __init__(
|
167 |
+
self,
|
168 |
+
vae: AutoencoderKL,
|
169 |
+
text_encoder: CLIPTextModel,
|
170 |
+
tokenizer: CLIPTokenizer,
|
171 |
+
unet: UNet2DConditionModel,
|
172 |
+
scheduler: KarrasDiffusionSchedulers,
|
173 |
+
):
|
174 |
+
super().__init__()
|
175 |
+
|
176 |
+
self.register_modules(
|
177 |
+
vae=vae,
|
178 |
+
text_encoder=text_encoder,
|
179 |
+
tokenizer=tokenizer,
|
180 |
+
unet=unet,
|
181 |
+
scheduler=scheduler,
|
182 |
+
)
|
183 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
184 |
+
self.image_processor = VaeImageProcrssorAOV(
|
185 |
+
vae_scale_factor=self.vae_scale_factor
|
186 |
+
)
|
187 |
+
self.register_to_config()
|
188 |
+
|
189 |
+
def _encode_prompt(
|
190 |
+
self,
|
191 |
+
prompt,
|
192 |
+
device,
|
193 |
+
num_images_per_prompt,
|
194 |
+
do_classifier_free_guidance,
|
195 |
+
negative_prompt=None,
|
196 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
197 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
198 |
+
):
|
199 |
+
r"""
|
200 |
+
Encodes the prompt into text encoder hidden states.
|
201 |
+
|
202 |
+
Args:
|
203 |
+
prompt (`str` or `List[str]`, *optional*):
|
204 |
+
prompt to be encoded
|
205 |
+
device (`torch.device`):
|
206 |
+
torch device
|
207 |
+
num_images_per_prompt (`int`):
|
208 |
+
number of images that should be generated per prompt
|
209 |
+
do_classifier_free_guidance (`bool`):
|
210 |
+
whether to use classifier free guidance or not
|
211 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
212 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
213 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
214 |
+
less than `1`).
|
215 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
216 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
217 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
218 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
219 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
220 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
221 |
+
argument.
|
222 |
+
"""
|
223 |
+
if prompt is not None and isinstance(prompt, str):
|
224 |
+
batch_size = 1
|
225 |
+
elif prompt is not None and isinstance(prompt, list):
|
226 |
+
batch_size = len(prompt)
|
227 |
+
else:
|
228 |
+
batch_size = prompt_embeds.shape[0]
|
229 |
+
|
230 |
+
if prompt_embeds is None:
|
231 |
+
# textual inversion: process multi-vector tokens if necessary
|
232 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
233 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
234 |
+
|
235 |
+
text_inputs = self.tokenizer(
|
236 |
+
prompt,
|
237 |
+
padding="max_length",
|
238 |
+
max_length=self.tokenizer.model_max_length,
|
239 |
+
truncation=True,
|
240 |
+
return_tensors="pt",
|
241 |
+
)
|
242 |
+
text_input_ids = text_inputs.input_ids
|
243 |
+
untruncated_ids = self.tokenizer(
|
244 |
+
prompt, padding="longest", return_tensors="pt"
|
245 |
+
).input_ids
|
246 |
+
|
247 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[
|
248 |
+
-1
|
249 |
+
] and not torch.equal(text_input_ids, untruncated_ids):
|
250 |
+
removed_text = self.tokenizer.batch_decode(
|
251 |
+
untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
|
252 |
+
)
|
253 |
+
logger.warning(
|
254 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
255 |
+
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
256 |
+
)
|
257 |
+
|
258 |
+
if (
|
259 |
+
hasattr(self.text_encoder.config, "use_attention_mask")
|
260 |
+
and self.text_encoder.config.use_attention_mask
|
261 |
+
):
|
262 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
263 |
+
else:
|
264 |
+
attention_mask = None
|
265 |
+
|
266 |
+
prompt_embeds = self.text_encoder(
|
267 |
+
text_input_ids.to(device),
|
268 |
+
attention_mask=attention_mask,
|
269 |
+
)
|
270 |
+
prompt_embeds = prompt_embeds[0]
|
271 |
+
|
272 |
+
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
273 |
+
|
274 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
275 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
276 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
277 |
+
prompt_embeds = prompt_embeds.view(
|
278 |
+
bs_embed * num_images_per_prompt, seq_len, -1
|
279 |
+
)
|
280 |
+
|
281 |
+
# get unconditional embeddings for classifier free guidance
|
282 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
283 |
+
uncond_tokens: List[str]
|
284 |
+
if negative_prompt is None:
|
285 |
+
uncond_tokens = [""] * batch_size
|
286 |
+
elif type(prompt) is not type(negative_prompt):
|
287 |
+
raise TypeError(
|
288 |
+
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
289 |
+
f" {type(prompt)}."
|
290 |
+
)
|
291 |
+
elif isinstance(negative_prompt, str):
|
292 |
+
uncond_tokens = [negative_prompt]
|
293 |
+
elif batch_size != len(negative_prompt):
|
294 |
+
raise ValueError(
|
295 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
296 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
297 |
+
" the batch size of `prompt`."
|
298 |
+
)
|
299 |
+
else:
|
300 |
+
uncond_tokens = negative_prompt
|
301 |
+
|
302 |
+
# textual inversion: process multi-vector tokens if necessary
|
303 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
304 |
+
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
305 |
+
|
306 |
+
max_length = prompt_embeds.shape[1]
|
307 |
+
uncond_input = self.tokenizer(
|
308 |
+
uncond_tokens,
|
309 |
+
padding="max_length",
|
310 |
+
max_length=max_length,
|
311 |
+
truncation=True,
|
312 |
+
return_tensors="pt",
|
313 |
+
)
|
314 |
+
|
315 |
+
if (
|
316 |
+
hasattr(self.text_encoder.config, "use_attention_mask")
|
317 |
+
and self.text_encoder.config.use_attention_mask
|
318 |
+
):
|
319 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
320 |
+
else:
|
321 |
+
attention_mask = None
|
322 |
+
|
323 |
+
negative_prompt_embeds = self.text_encoder(
|
324 |
+
uncond_input.input_ids.to(device),
|
325 |
+
attention_mask=attention_mask,
|
326 |
+
)
|
327 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
328 |
+
|
329 |
+
if do_classifier_free_guidance:
|
330 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
331 |
+
seq_len = negative_prompt_embeds.shape[1]
|
332 |
+
|
333 |
+
negative_prompt_embeds = negative_prompt_embeds.to(
|
334 |
+
dtype=self.text_encoder.dtype, device=device
|
335 |
+
)
|
336 |
+
|
337 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(
|
338 |
+
1, num_images_per_prompt, 1
|
339 |
+
)
|
340 |
+
negative_prompt_embeds = negative_prompt_embeds.view(
|
341 |
+
batch_size * num_images_per_prompt, seq_len, -1
|
342 |
+
)
|
343 |
+
|
344 |
+
# For classifier free guidance, we need to do two forward passes.
|
345 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
346 |
+
# to avoid doing two forward passes
|
347 |
+
# pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
|
348 |
+
prompt_embeds = torch.cat(
|
349 |
+
[prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
|
350 |
+
)
|
351 |
+
|
352 |
+
return prompt_embeds
|
353 |
+
|
354 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
355 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
356 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
357 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
358 |
+
# and should be between [0, 1]
|
359 |
+
|
360 |
+
accepts_eta = "eta" in set(
|
361 |
+
inspect.signature(self.scheduler.step).parameters.keys()
|
362 |
+
)
|
363 |
+
extra_step_kwargs = {}
|
364 |
+
if accepts_eta:
|
365 |
+
extra_step_kwargs["eta"] = eta
|
366 |
+
|
367 |
+
# check if the scheduler accepts generator
|
368 |
+
accepts_generator = "generator" in set(
|
369 |
+
inspect.signature(self.scheduler.step).parameters.keys()
|
370 |
+
)
|
371 |
+
if accepts_generator:
|
372 |
+
extra_step_kwargs["generator"] = generator
|
373 |
+
return extra_step_kwargs
|
374 |
+
|
375 |
+
def check_inputs(
|
376 |
+
self,
|
377 |
+
prompt,
|
378 |
+
callback_steps,
|
379 |
+
negative_prompt=None,
|
380 |
+
prompt_embeds=None,
|
381 |
+
negative_prompt_embeds=None,
|
382 |
+
):
|
383 |
+
if (callback_steps is None) or (
|
384 |
+
callback_steps is not None
|
385 |
+
and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
386 |
+
):
|
387 |
+
raise ValueError(
|
388 |
+
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
389 |
+
f" {type(callback_steps)}."
|
390 |
+
)
|
391 |
+
|
392 |
+
if prompt is not None and prompt_embeds is not None:
|
393 |
+
raise ValueError(
|
394 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
395 |
+
" only forward one of the two."
|
396 |
+
)
|
397 |
+
elif prompt is None and prompt_embeds is None:
|
398 |
+
raise ValueError(
|
399 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
400 |
+
)
|
401 |
+
elif prompt is not None and (
|
402 |
+
not isinstance(prompt, str) and not isinstance(prompt, list)
|
403 |
+
):
|
404 |
+
raise ValueError(
|
405 |
+
f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
|
406 |
+
)
|
407 |
+
|
408 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
409 |
+
raise ValueError(
|
410 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
411 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
412 |
+
)
|
413 |
+
|
414 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
415 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
416 |
+
raise ValueError(
|
417 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
418 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
419 |
+
f" {negative_prompt_embeds.shape}."
|
420 |
+
)
|
421 |
+
|
422 |
+
def prepare_latents(
|
423 |
+
self,
|
424 |
+
batch_size,
|
425 |
+
num_channels_latents,
|
426 |
+
height,
|
427 |
+
width,
|
428 |
+
dtype,
|
429 |
+
device,
|
430 |
+
generator,
|
431 |
+
latents=None,
|
432 |
+
):
|
433 |
+
shape = (
|
434 |
+
batch_size,
|
435 |
+
num_channels_latents,
|
436 |
+
height // self.vae_scale_factor,
|
437 |
+
width // self.vae_scale_factor,
|
438 |
+
)
|
439 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
440 |
+
raise ValueError(
|
441 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
442 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
443 |
+
)
|
444 |
+
|
445 |
+
if latents is None:
|
446 |
+
latents = randn_tensor(
|
447 |
+
shape, generator=generator, device=device, dtype=dtype
|
448 |
+
)
|
449 |
+
else:
|
450 |
+
latents = latents.to(device)
|
451 |
+
|
452 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
453 |
+
latents = latents * self.scheduler.init_noise_sigma
|
454 |
+
return latents
|
455 |
+
|
456 |
+
def prepare_image_latents(
|
457 |
+
self,
|
458 |
+
image,
|
459 |
+
batch_size,
|
460 |
+
num_images_per_prompt,
|
461 |
+
dtype,
|
462 |
+
device,
|
463 |
+
do_classifier_free_guidance,
|
464 |
+
generator=None,
|
465 |
+
):
|
466 |
+
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
|
467 |
+
raise ValueError(
|
468 |
+
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
|
469 |
+
)
|
470 |
+
|
471 |
+
image = image.to(device=device, dtype=dtype)
|
472 |
+
|
473 |
+
batch_size = batch_size * num_images_per_prompt
|
474 |
+
|
475 |
+
if image.shape[1] == 4:
|
476 |
+
image_latents = image
|
477 |
+
else:
|
478 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
479 |
+
raise ValueError(
|
480 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
481 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
482 |
+
)
|
483 |
+
|
484 |
+
if isinstance(generator, list):
|
485 |
+
image_latents = [
|
486 |
+
self.vae.encode(image[i : i + 1]).latent_dist.mode()
|
487 |
+
for i in range(batch_size)
|
488 |
+
]
|
489 |
+
image_latents = torch.cat(image_latents, dim=0)
|
490 |
+
else:
|
491 |
+
image_latents = self.vae.encode(image).latent_dist.mode()
|
492 |
+
|
493 |
+
if (
|
494 |
+
batch_size > image_latents.shape[0]
|
495 |
+
and batch_size % image_latents.shape[0] == 0
|
496 |
+
):
|
497 |
+
# expand image_latents for batch_size
|
498 |
+
deprecation_message = (
|
499 |
+
f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
|
500 |
+
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
501 |
+
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
502 |
+
" your script to pass as many initial images as text prompts to suppress this warning."
|
503 |
+
)
|
504 |
+
deprecate(
|
505 |
+
"len(prompt) != len(image)",
|
506 |
+
"1.0.0",
|
507 |
+
deprecation_message,
|
508 |
+
standard_warn=False,
|
509 |
+
)
|
510 |
+
additional_image_per_prompt = batch_size // image_latents.shape[0]
|
511 |
+
image_latents = torch.cat(
|
512 |
+
[image_latents] * additional_image_per_prompt, dim=0
|
513 |
+
)
|
514 |
+
elif (
|
515 |
+
batch_size > image_latents.shape[0]
|
516 |
+
and batch_size % image_latents.shape[0] != 0
|
517 |
+
):
|
518 |
+
raise ValueError(
|
519 |
+
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
|
520 |
+
)
|
521 |
+
else:
|
522 |
+
image_latents = torch.cat([image_latents], dim=0)
|
523 |
+
|
524 |
+
if do_classifier_free_guidance:
|
525 |
+
uncond_image_latents = torch.zeros_like(image_latents)
|
526 |
+
image_latents = torch.cat(
|
527 |
+
[image_latents, image_latents, uncond_image_latents], dim=0
|
528 |
+
)
|
529 |
+
|
530 |
+
return image_latents
|
531 |
+
|
532 |
+
@torch.no_grad()
|
533 |
+
def __call__(
|
534 |
+
self,
|
535 |
+
prompt: Union[str, List[str]] = None,
|
536 |
+
photo: Union[
|
537 |
+
torch.FloatTensor,
|
538 |
+
PIL.Image.Image,
|
539 |
+
np.ndarray,
|
540 |
+
List[torch.FloatTensor],
|
541 |
+
List[PIL.Image.Image],
|
542 |
+
List[np.ndarray],
|
543 |
+
] = None,
|
544 |
+
height: Optional[int] = None,
|
545 |
+
width: Optional[int] = None,
|
546 |
+
num_inference_steps: int = 100,
|
547 |
+
required_aovs: List[str] = ["albedo"],
|
548 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
549 |
+
num_images_per_prompt: Optional[int] = 1,
|
550 |
+
use_default_scaling_factor: Optional[bool] = False,
|
551 |
+
guidance_scale: float = 0.0,
|
552 |
+
image_guidance_scale: float = 0.0,
|
553 |
+
guidance_rescale: float = 0.0,
|
554 |
+
eta: float = 0.0,
|
555 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
556 |
+
latents: Optional[torch.FloatTensor] = None,
|
557 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
558 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
559 |
+
output_type: Optional[str] = "pil",
|
560 |
+
return_dict: bool = True,
|
561 |
+
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
562 |
+
callback_steps: int = 1,
|
563 |
+
):
|
564 |
+
r"""
|
565 |
+
The call function to the pipeline for generation.
|
566 |
+
|
567 |
+
Args:
|
568 |
+
prompt (`str` or `List[str]`, *optional*):
|
569 |
+
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
|
570 |
+
photo (`torch.FloatTensor`, `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
|
571 |
+
`Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
|
572 |
+
image latents as `image`, but if passing latents directly it is not encoded again.
|
573 |
+
num_inference_steps (`int`, *optional*, defaults to 100):
|
574 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
575 |
+
expense of slower inference.
|
576 |
+
guidance_scale (`float`, *optional*, defaults to 0.0):
|
577 |
+
A higher guidance scale value encourages the model to generate images closely linked to the text
|
578 |
+
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
579 |
+
image_guidance_scale (`float`, *optional*, defaults to 0.0):
|
580 |
+
Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
|
581 |
+
`image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
|
582 |
+
linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
|
583 |
+
value of at least `1`.
|
584 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
585 |
+
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
|
586 |
+
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
|
587 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
588 |
+
The number of images to generate per prompt.
|
589 |
+
eta (`float`, *optional*, defaults to 0.0):
|
590 |
+
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
591 |
+
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
592 |
+
generator (`torch.Generator`, *optional*):
|
593 |
+
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
594 |
+
generation deterministic.
|
595 |
+
latents (`torch.FloatTensor`, *optional*):
|
596 |
+
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
|
597 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
598 |
+
tensor is generated by sampling using the supplied random `generator`.
|
599 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
600 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
|
601 |
+
provided, text embeddings are generated from the `prompt` input argument.
|
602 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
603 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
|
604 |
+
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
605 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
606 |
+
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
607 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
608 |
+
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
609 |
+
plain tuple.
|
610 |
+
callback (`Callable`, *optional*):
|
611 |
+
A function that calls every `callback_steps` steps during inference. The function is called with the
|
612 |
+
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
613 |
+
callback_steps (`int`, *optional*, defaults to 1):
|
614 |
+
The frequency at which the `callback` function is called. If not specified, the callback is called at
|
615 |
+
every step.
|
616 |
+
|
617 |
+
Examples:
|
618 |
+
|
619 |
+
```py
|
620 |
+
>>> import PIL
|
621 |
+
>>> import requests
|
622 |
+
>>> import torch
|
623 |
+
>>> from io import BytesIO
|
624 |
+
|
625 |
+
>>> from diffusers import StableDiffusionInstructPix2PixPipeline
|
626 |
+
|
627 |
+
|
628 |
+
>>> def download_image(url):
|
629 |
+
... response = requests.get(url)
|
630 |
+
... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
|
631 |
+
|
632 |
+
|
633 |
+
>>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
|
634 |
+
|
635 |
+
>>> image = download_image(img_url).resize((512, 512))
|
636 |
+
|
637 |
+
>>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
|
638 |
+
... "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
|
639 |
+
... )
|
640 |
+
>>> pipe = pipe.to("cuda")
|
641 |
+
|
642 |
+
>>> prompt = "make the mountains snowy"
|
643 |
+
>>> image = pipe(prompt=prompt, image=image).images[0]
|
644 |
+
```
|
645 |
+
|
646 |
+
Returns:
|
647 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
648 |
+
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
|
649 |
+
otherwise a `tuple` is returned where the first element is a list with the generated images and the
|
650 |
+
second element is a list of `bool`s indicating whether the corresponding generated image contains
|
651 |
+
"not-safe-for-work" (nsfw) content.
|
652 |
+
"""
|
653 |
+
# 0. Check inputs
|
654 |
+
self.check_inputs(
|
655 |
+
prompt,
|
656 |
+
callback_steps,
|
657 |
+
negative_prompt,
|
658 |
+
prompt_embeds,
|
659 |
+
negative_prompt_embeds,
|
660 |
+
)
|
661 |
+
|
662 |
+
# 1. Define call parameters
|
663 |
+
if prompt is not None and isinstance(prompt, str):
|
664 |
+
batch_size = 1
|
665 |
+
elif prompt is not None and isinstance(prompt, list):
|
666 |
+
batch_size = len(prompt)
|
667 |
+
else:
|
668 |
+
batch_size = prompt_embeds.shape[0]
|
669 |
+
|
670 |
+
device = self._execution_device
|
671 |
+
do_classifier_free_guidance = (
|
672 |
+
guidance_scale > 1.0 and image_guidance_scale >= 1.0
|
673 |
+
)
|
674 |
+
# check if scheduler is in sigmas space
|
675 |
+
scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
|
676 |
+
|
677 |
+
# 2. Encode input prompt
|
678 |
+
prompt_embeds = self._encode_prompt(
|
679 |
+
prompt,
|
680 |
+
device,
|
681 |
+
num_images_per_prompt,
|
682 |
+
do_classifier_free_guidance,
|
683 |
+
negative_prompt,
|
684 |
+
prompt_embeds=prompt_embeds,
|
685 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
686 |
+
)
|
687 |
+
|
688 |
+
# 3. Preprocess image
|
689 |
+
# Normalize image to [-1,1]
|
690 |
+
preprocessed_photo = self.image_processor.preprocess(photo)
|
691 |
+
|
692 |
+
# 4. set timesteps
|
693 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
694 |
+
timesteps = self.scheduler.timesteps
|
695 |
+
|
696 |
+
# 5. Prepare Image latents
|
697 |
+
image_latents = self.prepare_image_latents(
|
698 |
+
preprocessed_photo,
|
699 |
+
batch_size,
|
700 |
+
num_images_per_prompt,
|
701 |
+
prompt_embeds.dtype,
|
702 |
+
device,
|
703 |
+
do_classifier_free_guidance,
|
704 |
+
generator,
|
705 |
+
)
|
706 |
+
image_latents = image_latents * self.vae.config.scaling_factor
|
707 |
+
|
708 |
+
height, width = image_latents.shape[-2:]
|
709 |
+
height = height * self.vae_scale_factor
|
710 |
+
width = width * self.vae_scale_factor
|
711 |
+
|
712 |
+
# 6. Prepare latent variables
|
713 |
+
num_channels_latents = self.unet.config.out_channels
|
714 |
+
latents = self.prepare_latents(
|
715 |
+
batch_size * num_images_per_prompt,
|
716 |
+
num_channels_latents,
|
717 |
+
height,
|
718 |
+
width,
|
719 |
+
prompt_embeds.dtype,
|
720 |
+
device,
|
721 |
+
generator,
|
722 |
+
latents,
|
723 |
+
)
|
724 |
+
|
725 |
+
# 7. Check that shapes of latents and image match the UNet channels
|
726 |
+
num_channels_image = image_latents.shape[1]
|
727 |
+
if num_channels_latents + num_channels_image != self.unet.config.in_channels:
|
728 |
+
raise ValueError(
|
729 |
+
f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
|
730 |
+
f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
|
731 |
+
f" `num_channels_image`: {num_channels_image} "
|
732 |
+
f" = {num_channels_latents+num_channels_image}. Please verify the config of"
|
733 |
+
" `pipeline.unet` or your `image` input."
|
734 |
+
)
|
735 |
+
|
736 |
+
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
737 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
738 |
+
|
739 |
+
# 9. Denoising loop
|
740 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
741 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
742 |
+
for i, t in enumerate(timesteps):
|
743 |
+
# Expand the latents if we are doing classifier free guidance.
|
744 |
+
# The latents are expanded 3 times because for pix2pix the guidance\
|
745 |
+
# is applied for both the text and the input image.
|
746 |
+
latent_model_input = (
|
747 |
+
torch.cat([latents] * 3) if do_classifier_free_guidance else latents
|
748 |
+
)
|
749 |
+
|
750 |
+
# concat latents, image_latents in the channel dimension
|
751 |
+
scaled_latent_model_input = self.scheduler.scale_model_input(
|
752 |
+
latent_model_input, t
|
753 |
+
)
|
754 |
+
scaled_latent_model_input = torch.cat(
|
755 |
+
[scaled_latent_model_input, image_latents], dim=1
|
756 |
+
)
|
757 |
+
|
758 |
+
# predict the noise residual
|
759 |
+
noise_pred = self.unet(
|
760 |
+
scaled_latent_model_input,
|
761 |
+
t,
|
762 |
+
encoder_hidden_states=prompt_embeds,
|
763 |
+
return_dict=False,
|
764 |
+
)[0]
|
765 |
+
|
766 |
+
# perform guidance
|
767 |
+
if do_classifier_free_guidance:
|
768 |
+
(
|
769 |
+
noise_pred_text,
|
770 |
+
noise_pred_image,
|
771 |
+
noise_pred_uncond,
|
772 |
+
) = noise_pred.chunk(3)
|
773 |
+
noise_pred = (
|
774 |
+
noise_pred_uncond
|
775 |
+
+ guidance_scale * (noise_pred_text - noise_pred_image)
|
776 |
+
+ image_guidance_scale * (noise_pred_image - noise_pred_uncond)
|
777 |
+
)
|
778 |
+
|
779 |
+
if do_classifier_free_guidance and guidance_rescale > 0.0:
|
780 |
+
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
781 |
+
noise_pred = rescale_noise_cfg(
|
782 |
+
noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
|
783 |
+
)
|
784 |
+
|
785 |
+
# compute the previous noisy sample x_t -> x_t-1
|
786 |
+
latents = self.scheduler.step(
|
787 |
+
noise_pred, t, latents, **extra_step_kwargs, return_dict=False
|
788 |
+
)[0]
|
789 |
+
|
790 |
+
# call the callback, if provided
|
791 |
+
if i == len(timesteps) - 1 or (
|
792 |
+
(i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
|
793 |
+
):
|
794 |
+
progress_bar.update()
|
795 |
+
if callback is not None and i % callback_steps == 0:
|
796 |
+
callback(i, t, latents)
|
797 |
+
|
798 |
+
aov_latents = latents / self.vae.config.scaling_factor
|
799 |
+
aov = self.vae.decode(aov_latents, return_dict=False)[0]
|
800 |
+
do_denormalize = [True] * aov.shape[0]
|
801 |
+
aov_name = required_aovs[0]
|
802 |
+
if aov_name == "albedo" or aov_name == "irradiance":
|
803 |
+
do_gamma_correction = True
|
804 |
+
else:
|
805 |
+
do_gamma_correction = False
|
806 |
+
|
807 |
+
if aov_name == "roughness" or aov_name == "metallic":
|
808 |
+
aov = aov[:, 0:1].repeat(1, 3, 1, 1)
|
809 |
+
|
810 |
+
aov = self.image_processor.postprocess(
|
811 |
+
aov,
|
812 |
+
output_type=output_type,
|
813 |
+
do_denormalize=do_denormalize,
|
814 |
+
do_gamma_correction=do_gamma_correction,
|
815 |
+
)
|
816 |
+
aovs = [aov]
|
817 |
+
|
818 |
+
# Offload last model to CPU
|
819 |
+
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
820 |
+
self.final_offload_hook.offload()
|
821 |
+
return StableDiffusionAOVPipelineOutput(images=aovs)
|
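For orientation, here is a minimal, untested sketch of how this pipeline could be driven once a compatible checkpoint is available. The checkpoint path, input image path, and prompt below are placeholders rather than values taken from this commit:

```py
# Hypothetical usage of StableDiffusionAOVMatEstPipeline; paths and prompt are placeholders.
import torch
from PIL import Image
from rgb2x.pipeline_rgb2x import StableDiffusionAOVMatEstPipeline

pipe = StableDiffusionAOVMatEstPipeline.from_pretrained(
    "path/to/aov-checkpoint",            # placeholder: a checkpoint saved with these components
    torch_dtype=torch.float16,
).to("cuda")

photo = Image.open("photo.png").convert("RGB")   # placeholder input photo
result = pipe(
    prompt="Albedo (diffuse basecolor)",         # assumed prompt wording
    photo=photo,
    required_aovs=["albedo"],
    num_inference_steps=50,
    guidance_scale=0.0,                          # matches the __call__ defaults above
    image_guidance_scale=0.0,
)
albedo = result.images[0][0]   # postprocess() returns a list of PIL images per requested AOV
albedo.save("albedo.png")
```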
run.sh
ADDED
@@ -0,0 +1,14 @@
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
CONDA_ENV=$(head -1 /code/environment.yml | cut -d" " -f2)
|
4 |
+
eval "$(conda shell.bash hook)"
|
5 |
+
conda activate gradio-user
|
6 |
+
export OMP_NUM_THREADS=4 # default is a wrong value: 7500m
|
7 |
+
|
8 |
+
conda install -n gradio-user pytorch3d=0.7.7 -c pytorch3d -c conda-forge
|
9 |
+
conda install -n gradio-user -c conda-forge open-clip-torch pytorch-lightning
|
10 |
+
|
11 |
+
# Start app.py
|
12 |
+
echo "Starting app.py..."
|
13 |
+
python -c "import torch; x=torch.rand(1, device='cuda'); print(x, x.device.type)"
|
14 |
+
python app.py
|
settings.py
ADDED
@@ -0,0 +1,23 @@
1 |
+
import os
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
DEFAULT_MODEL_ID = os.getenv("DEFAULT_MODEL_ID", "stable-diffusion-v1-5/stable-diffusion-v1-5")
|
6 |
+
|
7 |
+
MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "3"))
|
8 |
+
DEFAULT_NUM_IMAGES = min(MAX_NUM_IMAGES, int(os.getenv("DEFAULT_NUM_IMAGES", "1")))
|
9 |
+
MAX_IMAGE_RESOLUTION = int(os.getenv("MAX_IMAGE_RESOLUTION", "2048"))
|
10 |
+
DEFAULT_IMAGE_RESOLUTION = min(MAX_IMAGE_RESOLUTION, int(os.getenv("DEFAULT_IMAGE_RESOLUTION", "1024")))
|
11 |
+
|
12 |
+
ALLOW_CHANGING_BASE_MODEL = os.getenv("SPACE_ID") != "hysts/ControlNet-v1-1"
|
13 |
+
SHOW_DUPLICATE_BUTTON = os.getenv("SHOW_DUPLICATE_BUTTON") == "1"
|
14 |
+
|
15 |
+
MAX_SEED = np.iinfo(np.int32).max
|
16 |
+
|
17 |
+
# Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
|
18 |
+
|
19 |
+
# setup CUDA
|
20 |
+
# disable the following when deploying to Hugging Face
|
21 |
+
# if os.getenv("CUDA_VISIBLE_DEVICES") is None:
|
22 |
+
# os.environ["CUDA_VISIBLE_DEVICES"] = "7"
|
23 |
+
# os.environ["GRADIO_SERVER_PORT"] = "7864"
|
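A small illustrative example (not part of this commit) of how these environment-driven settings resolve; the values are arbitrary:

```py
# Hypothetical: override the resolution caps before importing settings.
import os
os.environ["MAX_IMAGE_RESOLUTION"] = "1536"
os.environ["DEFAULT_IMAGE_RESOLUTION"] = "2048"   # will be clamped by the min() below

import settings
print(settings.MAX_IMAGE_RESOLUTION)      # 1536
print(settings.DEFAULT_IMAGE_RESOLUTION)  # 1536, i.e. min(MAX_IMAGE_RESOLUTION, requested)
```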
text2tex/lib/__init__.py
ADDED
File without changes
|
text2tex/lib/camera_helper.py
ADDED
@@ -0,0 +1,231 @@
1 |
+
import torch
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
|
7 |
+
from pytorch3d.renderer import (
|
8 |
+
PerspectiveCameras,
|
9 |
+
look_at_view_transform
|
10 |
+
)
|
11 |
+
|
12 |
+
# customized
|
13 |
+
import sys
|
14 |
+
sys.path.append(".")
|
15 |
+
|
16 |
+
from lib.constants import VIEWPOINTS
|
17 |
+
|
18 |
+
# ---------------- UTILS ----------------------
|
19 |
+
|
20 |
+
def degree_to_radian(d):
|
21 |
+
return d * np.pi / 180
|
22 |
+
|
23 |
+
def radian_to_degree(r):
|
24 |
+
return 180 * r / np.pi
|
25 |
+
|
26 |
+
def xyz_to_polar(xyz):
|
27 |
+
""" assume y-axis is the up axis """
|
28 |
+
|
29 |
+
x, y, z = xyz
|
30 |
+
|
31 |
+
theta = 180 * np.arccos(z) / np.pi
|
32 |
+
phi = 180 * np.arccos(y) / np.pi
|
33 |
+
|
34 |
+
return theta, phi
|
35 |
+
|
36 |
+
def polar_to_xyz(theta, phi, dist):
|
37 |
+
""" assume y-axis is the up axis """
|
38 |
+
|
39 |
+
theta = degree_to_radian(theta)
|
40 |
+
phi = degree_to_radian(phi)
|
41 |
+
|
42 |
+
x = np.sin(phi) * np.sin(theta) * dist
|
43 |
+
y = np.cos(phi) * dist
|
44 |
+
z = np.sin(phi) * np.cos(theta) * dist
|
45 |
+
|
46 |
+
return [x, y, z]
|
47 |
+
|
48 |
+
|
49 |
+
# ---------------- VIEWPOINTS ----------------------
|
50 |
+
|
51 |
+
|
52 |
+
def filter_viewpoints(pre_viewpoints: dict, viewpoints: dict):
|
53 |
+
""" return the binary mask of viewpoints to be filtered """
|
54 |
+
|
55 |
+
filter_mask = [0 for _ in viewpoints.keys()]
|
56 |
+
for i, v in viewpoints.items():
|
57 |
+
x_v, y_v, z_v = polar_to_xyz(v["azim"], 90 - v["elev"], v["dist"])
|
58 |
+
|
59 |
+
for _, pv in pre_viewpoints.items():
|
60 |
+
x_pv, y_pv, z_pv = polar_to_xyz(pv["azim"], 90 - pv["elev"], pv["dist"])
|
61 |
+
sim = cosine_similarity(
|
62 |
+
np.array([[x_v, y_v, z_v]]),
|
63 |
+
np.array([[x_pv, y_pv, z_pv]])
|
64 |
+
)[0, 0]
|
65 |
+
|
66 |
+
if sim > 0.9:
|
67 |
+
filter_mask[i] = 1
|
68 |
+
|
69 |
+
return filter_mask
|
70 |
+
|
71 |
+
|
72 |
+
def init_viewpoints(mode, sample_space, init_dist, init_elev, principle_directions,
|
73 |
+
use_principle=True, use_shapenet=False, use_objaverse=False):
|
74 |
+
|
75 |
+
if mode == "predefined":
|
76 |
+
|
77 |
+
(
|
78 |
+
dist_list,
|
79 |
+
elev_list,
|
80 |
+
azim_list,
|
81 |
+
sector_list
|
82 |
+
) = init_predefined_viewpoints(sample_space, init_dist, init_elev)
|
83 |
+
|
84 |
+
elif mode == "hemisphere":
|
85 |
+
|
86 |
+
(
|
87 |
+
dist_list,
|
88 |
+
elev_list,
|
89 |
+
azim_list,
|
90 |
+
sector_list
|
91 |
+
) = init_hemisphere_viewpoints(sample_space, init_dist)
|
92 |
+
|
93 |
+
else:
|
94 |
+
raise NotImplementedError()
|
95 |
+
|
96 |
+
# punishments for views -> in case always selecting the same view
|
97 |
+
view_punishments = [1 for _ in range(len(dist_list))]
|
98 |
+
|
99 |
+
if use_principle:
|
100 |
+
|
101 |
+
(
|
102 |
+
dist_list,
|
103 |
+
elev_list,
|
104 |
+
azim_list,
|
105 |
+
sector_list,
|
106 |
+
view_punishments
|
107 |
+
) = init_principle_viewpoints(
|
108 |
+
principle_directions,
|
109 |
+
dist_list,
|
110 |
+
elev_list,
|
111 |
+
azim_list,
|
112 |
+
sector_list,
|
113 |
+
view_punishments,
|
114 |
+
use_shapenet,
|
115 |
+
use_objaverse
|
116 |
+
)
|
117 |
+
|
118 |
+
return dist_list, elev_list, azim_list, sector_list, view_punishments
|
119 |
+
|
120 |
+
|
121 |
+
def init_principle_viewpoints(
|
122 |
+
principle_directions,
|
123 |
+
dist_list,
|
124 |
+
elev_list,
|
125 |
+
azim_list,
|
126 |
+
sector_list,
|
127 |
+
view_punishments,
|
128 |
+
use_shapenet=False,
|
129 |
+
use_objaverse=False
|
130 |
+
):
|
131 |
+
|
132 |
+
if use_shapenet:
|
133 |
+
key = "shapenet"
|
134 |
+
|
135 |
+
pre_elev_list = [v for v in VIEWPOINTS[key]["elev"]]
|
136 |
+
pre_azim_list = [v for v in VIEWPOINTS[key]["azim"]]
|
137 |
+
pre_sector_list = [v for v in VIEWPOINTS[key]["sector"]]
|
138 |
+
|
139 |
+
num_principle = 10
|
140 |
+
pre_dist_list = [dist_list[0] for _ in range(num_principle)]
|
141 |
+
pre_view_punishments = [0 for _ in range(num_principle)]
|
142 |
+
|
143 |
+
elif use_objaverse:
|
144 |
+
key = "objaverse"
|
145 |
+
|
146 |
+
pre_elev_list = [v for v in VIEWPOINTS[key]["elev"]]
|
147 |
+
pre_azim_list = [v for v in VIEWPOINTS[key]["azim"]]
|
148 |
+
pre_sector_list = [v for v in VIEWPOINTS[key]["sector"]]
|
149 |
+
|
150 |
+
num_principle = 10
|
151 |
+
pre_dist_list = [dist_list[0] for _ in range(num_principle)]
|
152 |
+
pre_view_punishments = [0 for _ in range(num_principle)]
|
153 |
+
else:
|
154 |
+
num_principle = 6
|
155 |
+
pre_elev_list = [v for v in VIEWPOINTS[num_principle]["elev"]]
|
156 |
+
pre_azim_list = [v for v in VIEWPOINTS[num_principle]["azim"]]
|
157 |
+
pre_sector_list = [v for v in VIEWPOINTS[num_principle]["sector"]]
|
158 |
+
pre_dist_list = [dist_list[0] for _ in range(num_principle)]
|
159 |
+
pre_view_punishments = [0 for _ in range(num_principle)]
|
160 |
+
|
161 |
+
dist_list = pre_dist_list + dist_list
|
162 |
+
elev_list = pre_elev_list + elev_list
|
163 |
+
azim_list = pre_azim_list + azim_list
|
164 |
+
sector_list = pre_sector_list + sector_list
|
165 |
+
view_punishments = pre_view_punishments + view_punishments
|
166 |
+
|
167 |
+
return dist_list, elev_list, azim_list, sector_list, view_punishments
|
168 |
+
|
169 |
+
|
170 |
+
def init_predefined_viewpoints(sample_space, init_dist, init_elev):
|
171 |
+
|
172 |
+
viewpoints = VIEWPOINTS[sample_space]
|
173 |
+
|
174 |
+
assert sample_space == len(viewpoints["sector"])
|
175 |
+
|
176 |
+
dist_list = [init_dist for _ in range(sample_space)] # always the same dist
|
177 |
+
elev_list = [viewpoints["elev"][i] for i in range(sample_space)]
|
178 |
+
azim_list = [viewpoints["azim"][i] for i in range(sample_space)]
|
179 |
+
sector_list = [viewpoints["sector"][i] for i in range(sample_space)]
|
180 |
+
|
181 |
+
return dist_list, elev_list, azim_list, sector_list
|
182 |
+
|
183 |
+
|
184 |
+
def init_hemisphere_viewpoints(sample_space, init_dist):
|
185 |
+
"""
|
186 |
+
y is up-axis
|
187 |
+
"""
|
188 |
+
|
189 |
+
num_points = 2 * sample_space
|
190 |
+
ga = np.pi * (3. - np.sqrt(5.)) # golden angle in radians
|
191 |
+
|
192 |
+
flags = []
|
193 |
+
elev_list = [] # degree
|
194 |
+
azim_list = [] # degree
|
195 |
+
|
196 |
+
for i in range(num_points):
|
197 |
+
y = 1 - (i / float(num_points - 1)) * 2 # y goes from 1 to -1
|
198 |
+
|
199 |
+
# only take the north hemisphere
|
200 |
+
if y >= 0:
|
201 |
+
flags.append(True)
|
202 |
+
else:
|
203 |
+
flags.append(False)
|
204 |
+
|
205 |
+
theta = ga * i # golden angle increment
|
206 |
+
|
207 |
+
elev_list.append(radian_to_degree(np.arcsin(y)))
|
208 |
+
azim_list.append(radian_to_degree(theta))
|
209 |
+
|
210 |
+
radius = np.sqrt(1 - y * y) # radius at y
|
211 |
+
x = np.cos(theta) * radius
|
212 |
+
z = np.sin(theta) * radius
|
213 |
+
|
214 |
+
elev_list = [elev_list[i] for i in range(len(elev_list)) if flags[i]]
|
215 |
+
azim_list = [azim_list[i] for i in range(len(azim_list)) if flags[i]]
|
216 |
+
|
217 |
+
dist_list = [init_dist for _ in elev_list]
|
218 |
+
sector_list = ["good" for _ in elev_list] # HACK don't define sector names for now
|
219 |
+
|
220 |
+
return dist_list, elev_list, azim_list, sector_list
|
221 |
+
|
222 |
+
|
223 |
+
# ---------------- CAMERAS ----------------------
|
224 |
+
|
225 |
+
|
226 |
+
def init_camera(dist, elev, azim, image_size, device):
|
227 |
+
R, T = look_at_view_transform(dist, elev, azim)
|
228 |
+
image_size = torch.tensor([image_size, image_size]).unsqueeze(0)
|
229 |
+
cameras = PerspectiveCameras(R=R, T=T, device=device, image_size=image_size)
|
230 |
+
|
231 |
+
return cameras
|
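An illustrative sketch of how the helpers above fit together, assuming it is run from the text2tex directory (so the `lib.*` imports resolve) and using the 36-view predefined sampling; the numeric values are arbitrary:

```py
# Hypothetical usage of the viewpoint and camera helpers; values are illustrative.
import torch
from lib.camera_helper import init_viewpoints, init_camera

dist_list, elev_list, azim_list, sector_list, view_punishments = init_viewpoints(
    mode="predefined",
    sample_space=36,             # must be a key of VIEWPOINTS (see constants.py below)
    init_dist=1.0,
    init_elev=0,
    principle_directions=None,   # unused on the default (non-ShapeNet/Objaverse) path
    use_principle=True,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cameras = init_camera(dist_list[0], elev_list[0], azim_list[0],
                      image_size=768, device=device)
```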
text2tex/lib/constants.py
ADDED
@@ -0,0 +1,648 @@
1 |
+
PALETTE = {
|
2 |
+
0: [255, 255, 255], # white - background
|
3 |
+
1: [204, 50, 50], # red - old
|
4 |
+
2: [231, 180, 22], # yellow - update
|
5 |
+
3: [45, 201, 55] # green - new
|
6 |
+
}
|
7 |
+
|
8 |
+
QUAD_WEIGHTS = {
|
9 |
+
0: 0, # background
|
10 |
+
1: 0.1, # old
|
11 |
+
2: 0.5, # update
|
12 |
+
3: 1 # new
|
13 |
+
}
|
14 |
+
|
15 |
+
VIEWPOINTS = {
|
16 |
+
1: {
|
17 |
+
"azim": [
|
18 |
+
0
|
19 |
+
],
|
20 |
+
"elev": [
|
21 |
+
0
|
22 |
+
],
|
23 |
+
"sector": [
|
24 |
+
"front"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
2: {
|
28 |
+
"azim": [
|
29 |
+
0,
|
30 |
+
30
|
31 |
+
],
|
32 |
+
"elev": [
|
33 |
+
0,
|
34 |
+
0
|
35 |
+
],
|
36 |
+
"sector": [
|
37 |
+
"front",
|
38 |
+
"front"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
4: {
|
42 |
+
"azim": [
|
43 |
+
45,
|
44 |
+
315,
|
45 |
+
135,
|
46 |
+
225,
|
47 |
+
],
|
48 |
+
"elev": [
|
49 |
+
0,
|
50 |
+
0,
|
51 |
+
0,
|
52 |
+
0,
|
53 |
+
],
|
54 |
+
"sector": [
|
55 |
+
"front right",
|
56 |
+
"front left",
|
57 |
+
"back right",
|
58 |
+
"back left",
|
59 |
+
]
|
60 |
+
},
|
61 |
+
6: {
|
62 |
+
"azim": [
|
63 |
+
0,
|
64 |
+
90,
|
65 |
+
270,
|
66 |
+
0,
|
67 |
+
180,
|
68 |
+
0
|
69 |
+
],
|
70 |
+
"elev": [
|
71 |
+
0,
|
72 |
+
0,
|
73 |
+
0,
|
74 |
+
90,
|
75 |
+
0,
|
76 |
+
-90
|
77 |
+
],
|
78 |
+
"sector": [
|
79 |
+
"front",
|
80 |
+
"right",
|
81 |
+
"left",
|
82 |
+
"top",
|
83 |
+
"back",
|
84 |
+
"bottom",
|
85 |
+
]
|
86 |
+
},
|
87 |
+
"shapenet": {
|
88 |
+
"azim": [
|
89 |
+
270,
|
90 |
+
315,
|
91 |
+
225,
|
92 |
+
0,
|
93 |
+
180,
|
94 |
+
45,
|
95 |
+
135,
|
96 |
+
90,
|
97 |
+
270,
|
98 |
+
270
|
99 |
+
],
|
100 |
+
"elev": [
|
101 |
+
15,
|
102 |
+
15,
|
103 |
+
15,
|
104 |
+
15,
|
105 |
+
15,
|
106 |
+
15,
|
107 |
+
15,
|
108 |
+
15,
|
109 |
+
90,
|
110 |
+
-90
|
111 |
+
],
|
112 |
+
"sector": [
|
113 |
+
"front",
|
114 |
+
"front right",
|
115 |
+
"front left",
|
116 |
+
"right",
|
117 |
+
"left",
|
118 |
+
"back right",
|
119 |
+
"back left",
|
120 |
+
"back",
|
121 |
+
"top",
|
122 |
+
"bottom",
|
123 |
+
]
|
124 |
+
},
|
125 |
+
"objaverse": {
|
126 |
+
"azim": [
|
127 |
+
0,
|
128 |
+
45,
|
129 |
+
315,
|
130 |
+
90,
|
131 |
+
270,
|
132 |
+
135,
|
133 |
+
225,
|
134 |
+
180,
|
135 |
+
0,
|
136 |
+
0
|
137 |
+
],
|
138 |
+
"elev": [
|
139 |
+
15,
|
140 |
+
15,
|
141 |
+
15,
|
142 |
+
15,
|
143 |
+
15,
|
144 |
+
15,
|
145 |
+
15,
|
146 |
+
15,
|
147 |
+
90,
|
148 |
+
-90
|
149 |
+
],
|
150 |
+
"sector": [
|
151 |
+
"front",
|
152 |
+
"front right",
|
153 |
+
"front left",
|
154 |
+
"right",
|
155 |
+
"left",
|
156 |
+
"back right",
|
157 |
+
"back left",
|
158 |
+
"back",
|
159 |
+
"top",
|
160 |
+
"bottom",
|
161 |
+
]
|
162 |
+
},
|
163 |
+
12: {
|
164 |
+
"azim": [
|
165 |
+
45,
|
166 |
+
315,
|
167 |
+
135,
|
168 |
+
225,
|
169 |
+
|
170 |
+
0,
|
171 |
+
45,
|
172 |
+
315,
|
173 |
+
90,
|
174 |
+
270,
|
175 |
+
135,
|
176 |
+
225,
|
177 |
+
180,
|
178 |
+
],
|
179 |
+
"elev": [
|
180 |
+
0,
|
181 |
+
0,
|
182 |
+
0,
|
183 |
+
0,
|
184 |
+
|
185 |
+
45,
|
186 |
+
45,
|
187 |
+
45,
|
188 |
+
45,
|
189 |
+
45,
|
190 |
+
45,
|
191 |
+
45,
|
192 |
+
45,
|
193 |
+
],
|
194 |
+
"sector": [
|
195 |
+
"front right",
|
196 |
+
"front left",
|
197 |
+
"back right",
|
198 |
+
"back left",
|
199 |
+
|
200 |
+
"front",
|
201 |
+
"front right",
|
202 |
+
"front left",
|
203 |
+
"right",
|
204 |
+
"left",
|
205 |
+
"back right",
|
206 |
+
"back left",
|
207 |
+
"back",
|
208 |
+
]
|
209 |
+
},
|
210 |
+
20: {
|
211 |
+
"azim": [
|
212 |
+
45,
|
213 |
+
315,
|
214 |
+
135,
|
215 |
+
225,
|
216 |
+
|
217 |
+
0,
|
218 |
+
45,
|
219 |
+
315,
|
220 |
+
90,
|
221 |
+
270,
|
222 |
+
135,
|
223 |
+
225,
|
224 |
+
180,
|
225 |
+
|
226 |
+
0,
|
227 |
+
45,
|
228 |
+
315,
|
229 |
+
90,
|
230 |
+
270,
|
231 |
+
135,
|
232 |
+
225,
|
233 |
+
180,
|
234 |
+
],
|
235 |
+
"elev": [
|
236 |
+
0,
|
237 |
+
0,
|
238 |
+
0,
|
239 |
+
0,
|
240 |
+
|
241 |
+
30,
|
242 |
+
30,
|
243 |
+
30,
|
244 |
+
30,
|
245 |
+
30,
|
246 |
+
30,
|
247 |
+
30,
|
248 |
+
30,
|
249 |
+
|
250 |
+
60,
|
251 |
+
60,
|
252 |
+
60,
|
253 |
+
60,
|
254 |
+
60,
|
255 |
+
60,
|
256 |
+
60,
|
257 |
+
60,
|
258 |
+
],
|
259 |
+
"sector": [
|
260 |
+
"front right",
|
261 |
+
"front left",
|
262 |
+
"back right",
|
263 |
+
"back left",
|
264 |
+
|
265 |
+
"front",
|
266 |
+
"front right",
|
267 |
+
"front left",
|
268 |
+
"right",
|
269 |
+
"left",
|
270 |
+
"back right",
|
271 |
+
"back left",
|
272 |
+
"back",
|
273 |
+
|
274 |
+
"front",
|
275 |
+
"front right",
|
276 |
+
"front left",
|
277 |
+
"right",
|
278 |
+
"left",
|
279 |
+
"back right",
|
280 |
+
"back left",
|
281 |
+
"back",
|
282 |
+
]
|
283 |
+
},
|
284 |
+
36: {
|
285 |
+
"azim": [
|
286 |
+
45,
|
287 |
+
315,
|
288 |
+
135,
|
289 |
+
225,
|
290 |
+
|
291 |
+
0,
|
292 |
+
45,
|
293 |
+
315,
|
294 |
+
90,
|
295 |
+
270,
|
296 |
+
135,
|
297 |
+
225,
|
298 |
+
180,
|
299 |
+
|
300 |
+
0,
|
301 |
+
45,
|
302 |
+
315,
|
303 |
+
90,
|
304 |
+
270,
|
305 |
+
135,
|
306 |
+
225,
|
307 |
+
180,
|
308 |
+
|
309 |
+
22.5,
|
310 |
+
337.5,
|
311 |
+
67.5,
|
312 |
+
292.5,
|
313 |
+
112.5,
|
314 |
+
247.5,
|
315 |
+
157.5,
|
316 |
+
202.5,
|
317 |
+
|
318 |
+
22.5,
|
319 |
+
337.5,
|
320 |
+
67.5,
|
321 |
+
292.5,
|
322 |
+
112.5,
|
323 |
+
247.5,
|
324 |
+
157.5,
|
325 |
+
202.5,
|
326 |
+
],
|
327 |
+
"elev": [
|
328 |
+
0,
|
329 |
+
0,
|
330 |
+
0,
|
331 |
+
0,
|
332 |
+
|
333 |
+
30,
|
334 |
+
30,
|
335 |
+
30,
|
336 |
+
30,
|
337 |
+
30,
|
338 |
+
30,
|
339 |
+
30,
|
340 |
+
30,
|
341 |
+
|
342 |
+
60,
|
343 |
+
60,
|
344 |
+
60,
|
345 |
+
60,
|
346 |
+
60,
|
347 |
+
60,
|
348 |
+
60,
|
349 |
+
60,
|
350 |
+
|
351 |
+
15,
|
352 |
+
15,
|
353 |
+
15,
|
354 |
+
15,
|
355 |
+
15,
|
356 |
+
15,
|
357 |
+
15,
|
358 |
+
15,
|
359 |
+
|
360 |
+
45,
|
361 |
+
45,
|
362 |
+
45,
|
363 |
+
45,
|
364 |
+
45,
|
365 |
+
45,
|
366 |
+
45,
|
367 |
+
45,
|
368 |
+
],
|
369 |
+
"sector": [
|
370 |
+
"front right",
|
371 |
+
"front left",
|
372 |
+
"back right",
|
373 |
+
"back left",
|
374 |
+
|
375 |
+
"front",
|
376 |
+
"front right",
|
377 |
+
"front left",
|
378 |
+
"right",
|
379 |
+
"left",
|
380 |
+
"back right",
|
381 |
+
"back left",
|
382 |
+
"back",
|
383 |
+
|
384 |
+
"top front",
|
385 |
+
"top right",
|
386 |
+
"top left",
|
387 |
+
"top right",
|
388 |
+
"top left",
|
389 |
+
"top right",
|
390 |
+
"top left",
|
391 |
+
"top back",
|
392 |
+
|
393 |
+
"front right",
|
394 |
+
"front left",
|
395 |
+
"front right",
|
396 |
+
"front left",
|
397 |
+
"back right",
|
398 |
+
"back left",
|
399 |
+
"back right",
|
400 |
+
"back left",
|
401 |
+
|
402 |
+
"front right",
|
403 |
+
"front left",
|
404 |
+
"front right",
|
405 |
+
"front left",
|
406 |
+
"back right",
|
407 |
+
"back left",
|
408 |
+
"back right",
|
409 |
+
"back left",
|
410 |
+
]
|
411 |
+
},
|
412 |
+
68: {
|
413 |
+
"azim": [
|
414 |
+
45,
|
415 |
+
315,
|
416 |
+
135,
|
417 |
+
225,
|
418 |
+
|
419 |
+
0,
|
420 |
+
45,
|
421 |
+
315,
|
422 |
+
90,
|
423 |
+
270,
|
424 |
+
135,
|
425 |
+
225,
|
426 |
+
180,
|
427 |
+
|
428 |
+
0,
|
429 |
+
45,
|
430 |
+
315,
|
431 |
+
90,
|
432 |
+
270,
|
433 |
+
135,
|
434 |
+
225,
|
435 |
+
180,
|
436 |
+
|
437 |
+
22.5,
|
438 |
+
337.5,
|
439 |
+
67.5,
|
440 |
+
292.5,
|
441 |
+
112.5,
|
442 |
+
247.5,
|
443 |
+
157.5,
|
444 |
+
202.5,
|
445 |
+
|
446 |
+
22.5,
|
447 |
+
337.5,
|
448 |
+
67.5,
|
449 |
+
292.5,
|
450 |
+
112.5,
|
451 |
+
247.5,
|
452 |
+
157.5,
|
453 |
+
202.5,
|
454 |
+
|
455 |
+
0,
|
456 |
+
45,
|
457 |
+
315,
|
458 |
+
90,
|
459 |
+
270,
|
460 |
+
135,
|
461 |
+
225,
|
462 |
+
180,
|
463 |
+
|
464 |
+
0,
|
465 |
+
45,
|
466 |
+
315,
|
467 |
+
90,
|
468 |
+
270,
|
469 |
+
135,
|
470 |
+
225,
|
471 |
+
180,
|
472 |
+
|
473 |
+
22.5,
|
474 |
+
337.5,
|
475 |
+
67.5,
|
476 |
+
292.5,
|
477 |
+
112.5,
|
478 |
+
247.5,
|
479 |
+
157.5,
|
480 |
+
202.5,
|
481 |
+
|
482 |
+
22.5,
|
483 |
+
337.5,
|
484 |
+
67.5,
|
485 |
+
292.5,
|
486 |
+
112.5,
|
487 |
+
247.5,
|
488 |
+
157.5,
|
489 |
+
202.5
|
490 |
+
],
|
491 |
+
"elev": [
|
492 |
+
0,
|
493 |
+
0,
|
494 |
+
0,
|
495 |
+
0,
|
496 |
+
|
497 |
+
30,
|
498 |
+
30,
|
499 |
+
30,
|
500 |
+
30,
|
501 |
+
30,
|
502 |
+
30,
|
503 |
+
30,
|
504 |
+
30,
|
505 |
+
|
506 |
+
60,
|
507 |
+
60,
|
508 |
+
60,
|
509 |
+
60,
|
510 |
+
60,
|
511 |
+
60,
|
512 |
+
60,
|
513 |
+
60,
|
514 |
+
|
515 |
+
15,
|
516 |
+
15,
|
517 |
+
15,
|
518 |
+
15,
|
519 |
+
15,
|
520 |
+
15,
|
521 |
+
15,
|
522 |
+
15,
|
523 |
+
|
524 |
+
45,
|
525 |
+
45,
|
526 |
+
45,
|
527 |
+
45,
|
528 |
+
45,
|
529 |
+
45,
|
530 |
+
45,
|
531 |
+
45,
|
532 |
+
|
533 |
+
-30,
|
534 |
+
-30,
|
535 |
+
-30,
|
536 |
+
-30,
|
537 |
+
-30,
|
538 |
+
-30,
|
539 |
+
-30,
|
540 |
+
-30,
|
541 |
+
|
542 |
+
-60,
|
543 |
+
-60,
|
544 |
+
-60,
|
545 |
+
-60,
|
546 |
+
-60,
|
547 |
+
-60,
|
548 |
+
-60,
|
549 |
+
-60,
|
550 |
+
|
551 |
+
-15,
|
552 |
+
-15,
|
553 |
+
-15,
|
554 |
+
-15,
|
555 |
+
-15,
|
556 |
+
-15,
|
557 |
+
-15,
|
558 |
+
-15,
|
559 |
+
|
560 |
+
-45,
|
561 |
+
-45,
|
562 |
+
-45,
|
563 |
+
-45,
|
564 |
+
-45,
|
565 |
+
-45,
|
566 |
+
-45,
|
567 |
+
-45,
|
568 |
+
],
|
569 |
+
"sector": [
|
570 |
+
"front right",
|
571 |
+
"front left",
|
572 |
+
"back right",
|
573 |
+
"back left",
|
574 |
+
|
575 |
+
"front",
|
576 |
+
"front right",
|
577 |
+
"front left",
|
578 |
+
"right",
|
579 |
+
"left",
|
580 |
+
"back right",
|
581 |
+
"back left",
|
582 |
+
"back",
|
583 |
+
|
584 |
+
"top front",
|
585 |
+
"top right",
|
586 |
+
"top left",
|
587 |
+
"top right",
|
588 |
+
"top left",
|
589 |
+
"top right",
|
590 |
+
"top left",
|
591 |
+
"top back",
|
592 |
+
|
593 |
+
"front right",
|
594 |
+
"front left",
|
595 |
+
"front right",
|
596 |
+
"front left",
|
597 |
+
"back right",
|
598 |
+
"back left",
|
599 |
+
"back right",
|
600 |
+
"back left",
|
601 |
+
|
602 |
+
"front right",
|
603 |
+
"front left",
|
604 |
+
"front right",
|
605 |
+
"front left",
|
606 |
+
"back right",
|
607 |
+
"back left",
|
608 |
+
"back right",
|
609 |
+
"back left",
|
610 |
+
|
611 |
+
"front",
|
612 |
+
"front right",
|
613 |
+
"front left",
|
614 |
+
"right",
|
615 |
+
"left",
|
616 |
+
"back right",
|
617 |
+
"back left",
|
618 |
+
"back",
|
619 |
+
|
620 |
+
"bottom front",
|
621 |
+
"bottom right",
|
622 |
+
"bottom left",
|
623 |
+
"bottom right",
|
624 |
+
"bottom left",
|
625 |
+
"bottom right",
|
626 |
+
"bottom left",
|
627 |
+
"bottom back",
|
628 |
+
|
629 |
+
"bottom front right",
|
630 |
+
"bottom front left",
|
631 |
+
"bottom front right",
|
632 |
+
"bottom front left",
|
633 |
+
"bottom back right",
|
634 |
+
"bottom back left",
|
635 |
+
"bottom back right",
|
636 |
+
"bottom back left",
|
637 |
+
|
638 |
+
"bottom front right",
|
639 |
+
"bottom front left",
|
640 |
+
"bottom front right",
|
641 |
+
"bottom front left",
|
642 |
+
"bottom back right",
|
643 |
+
"bottom back left",
|
644 |
+
"bottom back right",
|
645 |
+
"bottom back left",
|
646 |
+
]
|
647 |
+
}
|
648 |
+
}
|
text2tex/lib/diffusion_helper.py
ADDED
@@ -0,0 +1,189 @@
1 |
+
import torch
|
2 |
+
|
3 |
+
import cv2
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from PIL import Image
|
7 |
+
from torchvision import transforms
|
8 |
+
|
9 |
+
# Stable Diffusion 2
|
10 |
+
from diffusers import (
|
11 |
+
StableDiffusionInpaintPipeline,
|
12 |
+
StableDiffusionPipeline,
|
13 |
+
EulerDiscreteScheduler
|
14 |
+
)
|
15 |
+
|
16 |
+
# customized
|
17 |
+
import sys
|
18 |
+
sys.path.append(".")
|
19 |
+
|
20 |
+
from models.ControlNet.gradio_depth2image import init_model, process
|
21 |
+
|
22 |
+
|
23 |
+
def get_controlnet_depth():
|
24 |
+
print("=> initializing ControlNet Depth...")
|
25 |
+
model, ddim_sampler = init_model()
|
26 |
+
|
27 |
+
return model, ddim_sampler
|
28 |
+
|
29 |
+
|
30 |
+
def get_inpainting(device):
|
31 |
+
print("=> initializing Inpainting...")
|
32 |
+
|
33 |
+
model = StableDiffusionInpaintPipeline.from_pretrained(
|
34 |
+
"stabilityai/stable-diffusion-2-inpainting",
|
35 |
+
torch_dtype=torch.float16,
|
36 |
+
).to(device)
|
37 |
+
|
38 |
+
return model
|
39 |
+
|
40 |
+
def get_text2image(device):
|
41 |
+
print("=> initializing Text2Image...")
|
42 |
+
|
43 |
+
model_id = "stabilityai/stable-diffusion-2"
|
44 |
+
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
|
45 |
+
model = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16).to(device)
|
46 |
+
|
47 |
+
return model
|
48 |
+
|
49 |
+
|
50 |
+
@torch.no_grad()
|
51 |
+
def apply_controlnet_depth(model, ddim_sampler,
|
52 |
+
init_image, prompt, strength, ddim_steps,
|
53 |
+
generate_mask_image, keep_mask_image, depth_map_np,
|
54 |
+
a_prompt, n_prompt, guidance_scale, seed, eta, num_samples,
|
55 |
+
device, blend=0, save_memory=False):
|
56 |
+
"""
|
57 |
+
Use ControlNet Depth (depth-conditioned Stable Diffusion) to repaint an image
|
58 |
+
|
59 |
+
Arguments:
|
60 |
+
args: input arguments
|
61 |
+
model: Stable Diffusion 2 model
|
62 |
+
init_image_tensor: input image, torch.FloatTensor of shape (1, H, W, 3)
|
63 |
+
mask_tensor: depth map of the input image, torch.FloatTensor of shape (1, H, W, 1)
|
64 |
+
depth_map_np: depth map of the input image, torch.FloatTensor of shape (1, H, W)
|
65 |
+
"""
|
66 |
+
|
67 |
+
print("=> generating ControlNet Depth RePaint image...")
|
68 |
+
|
69 |
+
|
70 |
+
# Stable Diffusion 2 receives PIL.Image
|
71 |
+
# NOTE Stable Diffusion 2 returns a PIL.Image object
|
72 |
+
# image and mask_image should be PIL images.
|
73 |
+
# The mask structure is white for inpainting and black for keeping as is
|
74 |
+
diffused_image_np = process(
|
75 |
+
model, ddim_sampler,
|
76 |
+
np.array(init_image), prompt, a_prompt, n_prompt, num_samples,
|
77 |
+
ddim_steps, guidance_scale, seed, eta,
|
78 |
+
strength=strength, detected_map=depth_map_np, unknown_mask=np.array(generate_mask_image), save_memory=save_memory
|
79 |
+
)[0]
|
80 |
+
|
81 |
+
init_image = init_image.convert("RGB")
|
82 |
+
diffused_image = Image.fromarray(diffused_image_np).convert("RGB")
|
83 |
+
|
84 |
+
if blend > 0 and transforms.ToTensor()(keep_mask_image).sum() > 0:
|
85 |
+
print("=> blending the generated region...")
|
86 |
+
kernel_size = 3
|
87 |
+
kernel = np.ones((kernel_size, kernel_size), np.uint8)
|
88 |
+
|
89 |
+
keep_image_np = np.array(init_image).astype(np.uint8)
|
90 |
+
keep_image_np_dilate = cv2.dilate(keep_image_np, kernel, iterations=1)
|
91 |
+
|
92 |
+
keep_mask_np = np.array(keep_mask_image).astype(np.uint8)
|
93 |
+
keep_mask_np_dilate = cv2.dilate(keep_mask_np, kernel, iterations=1)
|
94 |
+
|
95 |
+
generate_image_np = np.array(diffused_image).astype(np.uint8)
|
96 |
+
|
97 |
+
overlap_mask_np = np.array(generate_mask_image).astype(np.uint8)
|
98 |
+
overlap_mask_np *= keep_mask_np_dilate
|
99 |
+
print("=> blending {} pixels...".format(np.sum(overlap_mask_np)))
|
100 |
+
|
101 |
+
overlap_keep = keep_image_np_dilate[overlap_mask_np == 1]
|
102 |
+
overlap_generate = generate_image_np[overlap_mask_np == 1]
|
103 |
+
|
104 |
+
overlap_np = overlap_keep * blend + overlap_generate * (1 - blend)
|
105 |
+
|
106 |
+
generate_image_np[overlap_mask_np == 1] = overlap_np
|
107 |
+
|
108 |
+
diffused_image = Image.fromarray(generate_image_np.astype(np.uint8)).convert("RGB")
|
109 |
+
|
110 |
+
init_image_masked = init_image
|
111 |
+
diffused_image_masked = diffused_image
|
112 |
+
|
113 |
+
return diffused_image, init_image_masked, diffused_image_masked
|
114 |
+
|
115 |
+
|
116 |
+
@torch.no_grad()
|
117 |
+
def apply_inpainting(model,
|
118 |
+
init_image, mask_image_tensor, prompt, height, width, device):
|
119 |
+
"""
|
120 |
+
Use Stable Diffusion 2 to generate image
|
121 |
+
|
122 |
+
Arguments:
|
123 |
+
args: input arguments
|
124 |
+
model: Stable Diffusion 2 model
|
125 |
+
init_image_tensor: input image, torch.FloatTensor of shape (1, H, W, 3)
|
126 |
+
mask_tensor: depth map of the input image, torch.FloatTensor of shape (1, H, W, 1)
|
127 |
+
depth_map_tensor: depth map of the input image, torch.FloatTensor of shape (1, H, W)
|
128 |
+
"""
|
129 |
+
|
130 |
+
print("=> generating Inpainting image...")
|
131 |
+
|
132 |
+
mask_image = mask_image_tensor[0].cpu()
|
133 |
+
mask_image = mask_image.permute(2, 0, 1)
|
134 |
+
mask_image = transforms.ToPILImage()(mask_image).convert("L")
|
135 |
+
|
136 |
+
# NOTE Stable Diffusion 2 returns a PIL.Image object
|
137 |
+
# image and mask_image should be PIL images.
|
138 |
+
# The mask structure is white for inpainting and black for keeping as is
|
139 |
+
diffused_image = model(
|
140 |
+
prompt=prompt,
|
141 |
+
image=init_image.resize((512, 512)),
|
142 |
+
mask_image=mask_image.resize((512, 512)),
|
143 |
+
height=512,
|
144 |
+
width=512
|
145 |
+
).images[0].resize((height, width))
|
146 |
+
|
147 |
+
return diffused_image
|
148 |
+
|
149 |
+
|
150 |
+
@torch.no_grad()
|
151 |
+
def apply_inpainting_postprocess(model,
|
152 |
+
init_image, mask_image_tensor, prompt, height, width, device):
|
153 |
+
"""
|
154 |
+
Use Stable Diffusion 2 to generate image
|
155 |
+
|
156 |
+
Arguments:
|
157 |
+
args: input arguments
|
158 |
+
model: Stable Diffusion 2 model
|
159 |
+
init_image_tensor: input image, torch.FloatTensor of shape (1, H, W, 3)
|
160 |
+
mask_tensor: depth map of the input image, torch.FloatTensor of shape (1, H, W, 1)
|
161 |
+
depth_map_tensor: depth map of the input image, torch.FloatTensor of shape (1, H, W)
|
162 |
+
"""
|
163 |
+
|
164 |
+
print("=> generating Inpainting image...")
|
165 |
+
|
166 |
+
mask_image = mask_image_tensor[0].cpu()
|
167 |
+
mask_image = mask_image.permute(2, 0, 1)
|
168 |
+
mask_image = transforms.ToPILImage()(mask_image).convert("L")
|
169 |
+
|
170 |
+
# NOTE Stable Diffusion 2 returns a PIL.Image object
|
171 |
+
# image and mask_image should be PIL images.
|
172 |
+
# The mask structure is white for inpainting and black for keeping as is
|
173 |
+
diffused_image = model(
|
174 |
+
prompt=prompt,
|
175 |
+
image=init_image.resize((512, 512)),
|
176 |
+
mask_image=mask_image.resize((512, 512)),
|
177 |
+
height=512,
|
178 |
+
width=512
|
179 |
+
).images[0].resize((height, width))
|
180 |
+
|
181 |
+
diffused_image_tensor = torch.from_numpy(np.array(diffused_image)).to(device)
|
182 |
+
|
183 |
+
init_images_tensor = torch.from_numpy(np.array(init_image)).to(device)
|
184 |
+
|
185 |
+
init_images_tensor = diffused_image_tensor * mask_image_tensor[0] + init_images_tensor * (1 - mask_image_tensor[0])
|
186 |
+
init_image = Image.fromarray(init_images_tensor.cpu().numpy().astype(np.uint8)).convert("RGB")
|
187 |
+
|
188 |
+
return init_image
|
189 |
+
|
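Side note (not part of the committed file): the comments in the inpainting helpers above rely on the diffusers convention that white mask pixels are regenerated and black pixels are kept. A minimal standalone sketch of building such a mask, assuming only NumPy and PIL:

import numpy as np
from PIL import Image

# Boolean "regenerate here" mask -> PIL "L" mask expected by the inpainting pipeline:
# 255 (white) marks pixels to inpaint, 0 (black) marks pixels to keep.
generate_region = np.zeros((512, 512), dtype=bool)
generate_region[128:384, 128:384] = True          # e.g. repaint the centre patch

mask_image = Image.fromarray((generate_region * 255).astype(np.uint8), mode="L")
mask_image.save("inpaint_mask.png")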
text2tex/lib/io_helper.py
ADDED
@@ -0,0 +1,78 @@
# common utils
import os
import json

# numpy
import numpy as np

# visualization
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt

matplotlib.use("Agg")

from pytorch3d.io import save_obj

from torchvision import transforms


def save_depth(fragments, output_dir, init_image, view_idx):
    print("=> saving depth...")
    width, height = init_image.size
    dpi = 100
    figsize = width / float(dpi), height / float(dpi)

    depth_np = fragments.zbuf[0].cpu().numpy()

    fig = plt.figure(figsize=figsize)
    ax = fig.add_axes([0, 0, 1, 1])
    # Hide spines, ticks, etc.
    ax.axis('off')
    # Display the image.
    ax.imshow(depth_np, cmap='gray')

    plt.savefig(os.path.join(output_dir, "{}.png".format(view_idx)), bbox_inches='tight', pad_inches=0)
    np.save(os.path.join(output_dir, "{}.npy".format(view_idx)), depth_np[..., 0])


def save_backproject_obj(output_dir, obj_name,
    verts, faces, verts_uvs, faces_uvs, projected_texture,
    device):
    print("=> saving OBJ file...")
    texture_map = transforms.ToTensor()(projected_texture).to(device)
    texture_map = texture_map.permute(1, 2, 0)
    obj_path = os.path.join(output_dir, obj_name)

    save_obj(
        obj_path,
        verts=verts,
        faces=faces,
        decimal_places=5,
        verts_uvs=verts_uvs,
        faces_uvs=faces_uvs,
        texture_map=texture_map
    )


def save_args(args, output_dir):
    with open(os.path.join(output_dir, "args.json"), "w") as f:
        json.dump(
            {k: v for k, v in vars(args).items()},
            f,
            indent=4
        )


def save_viewpoints(args, output_dir, dist_list, elev_list, azim_list, view_list):
    with open(os.path.join(output_dir, "viewpoints.json"), "w") as f:
        json.dump(
            {
                "dist": dist_list,
                "elev": elev_list,
                "azim": azim_list,
                "view": view_list
            },
            f,
            indent=4
        )
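Side note (not part of the committed file): save_args simply serializes the argparse namespace as JSON. A minimal standalone sketch of the same idea, with illustrative arguments and an assumed output path:

import argparse
import json
import os

parser = argparse.ArgumentParser()
parser.add_argument("--prompt", default="a brick house")      # illustrative args
parser.add_argument("--uv_size", type=int, default=1024)
args = parser.parse_args([])

output_dir = "output/run_0"                                    # assumed path
os.makedirs(output_dir, exist_ok=True)

# Same pattern as save_args: dump the namespace dict with indentation.
with open(os.path.join(output_dir, "args.json"), "w") as f:
    json.dump(vars(args), f, indent=4)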
text2tex/lib/mesh_helper.py
ADDED
@@ -0,0 +1,148 @@
import os
import torch
import trimesh
import xatlas

import numpy as np

from sklearn.decomposition import PCA

from torchvision import transforms

from tqdm import tqdm

from pytorch3d.io import (
    load_obj,
    load_objs_as_meshes
)


def compute_principle_directions(model_path, num_points=20000):
    mesh = trimesh.load_mesh(model_path, force="mesh")
    pc, _ = trimesh.sample.sample_surface_even(mesh, num_points)

    pc -= np.mean(pc, axis=0, keepdims=True)

    principle_directions = PCA(n_components=3).fit(pc).components_

    return principle_directions


def init_mesh(input_path, cache_path, device):
    print("=> parameterizing target mesh...")

    mesh = trimesh.load_mesh(input_path, force='mesh')
    try:
        vertices, faces = mesh.vertices, mesh.faces
    except AttributeError:
        print("multiple materials in {} are not supported".format(input_path))
        exit()

    vmapping, indices, uvs = xatlas.parametrize(vertices, faces)
    xatlas.export(str(cache_path), vertices[vmapping], indices, uvs)

    print("=> loading target mesh...")

    # principle_directions = compute_principle_directions(cache_path)
    principle_directions = None

    _, faces, aux = load_obj(cache_path, device=device)
    mesh = load_objs_as_meshes([cache_path], device=device)

    num_verts = mesh.verts_packed().shape[0]

    # make sure mesh center is at origin
    bbox = mesh.get_bounding_boxes()
    mesh_center = bbox.mean(dim=2).repeat(num_verts, 1)
    mesh = apply_offsets_to_mesh(mesh, -mesh_center)

    # make sure mesh size is normalized
    box_size = bbox[..., 1] - bbox[..., 0]
    box_max = box_size.max(dim=1, keepdim=True)[0].repeat(num_verts, 3)
    mesh = apply_scale_to_mesh(mesh, 1 / box_max)

    return mesh, mesh.verts_packed(), faces, aux, principle_directions, mesh_center, box_max


def apply_offsets_to_mesh(mesh, offsets):
    new_mesh = mesh.offset_verts(offsets)

    return new_mesh

def apply_scale_to_mesh(mesh, scale):
    new_mesh = mesh.scale_verts(scale)

    return new_mesh


def adjust_uv_map(faces, aux, init_texture, uv_size):
    """
        adjust UV map to be compatiable with multiple textures.
        UVs for different materials will be decomposed and placed horizontally

            +-----+-----+-----+--
            |  1  |  2  |  3  |
            +-----+-----+-----+--

    """

    textures_ids = faces.textures_idx
    materials_idx = faces.materials_idx
    verts_uvs = aux.verts_uvs

    num_materials = torch.unique(materials_idx).shape[0]

    new_verts_uvs = verts_uvs.clone()
    for material_id in range(num_materials):
        # apply offsets to horizontal axis
        faces_ids = textures_ids[materials_idx == material_id].unique()
        new_verts_uvs[faces_ids, 0] += material_id

    new_verts_uvs[:, 0] /= num_materials

    init_texture_tensor = transforms.ToTensor()(init_texture)
    init_texture_tensor = torch.cat([init_texture_tensor for _ in range(num_materials)], dim=-1)
    init_texture = transforms.ToPILImage()(init_texture_tensor).resize((uv_size, uv_size))

    return new_verts_uvs, init_texture


@torch.no_grad()
def update_face_angles(mesh, cameras, fragments):
    def get_angle(x, y):
        x = torch.nn.functional.normalize(x)
        y = torch.nn.functional.normalize(y)
        inner_product = (x * y).sum(dim=1)
        x_norm = x.pow(2).sum(dim=1).pow(0.5)
        y_norm = y.pow(2).sum(dim=1).pow(0.5)
        cos = inner_product / (x_norm * y_norm)
        angle = torch.acos(cos)
        angle = angle * 180 / 3.14159

        return angle

    # face normals
    face_normals = mesh.faces_normals_padded()[0]

    # view vector (object center -> camera center)
    camera_center = cameras.get_camera_center()

    face_angles = get_angle(
        face_normals,
        camera_center.repeat(face_normals.shape[0], 1)
    ) # (F)

    face_angles_rev = get_angle(
        face_normals,
        -camera_center.repeat(face_normals.shape[0], 1)
    ) # (F)

    face_angles = torch.minimum(face_angles, face_angles_rev)

    # Indices of unique visible faces
    visible_map = fragments.pix_to_face.unique() # (num_visible_faces)
    invisible_mask = torch.ones_like(face_angles)
    invisible_mask[visible_map] = 0
    face_angles[invisible_mask == 1] = 10000. # angles of invisible faces are ignored

    return face_angles
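Side note (not part of the committed file): init_mesh recenters the mesh at the origin and rescales it by the longest bounding-box edge. A minimal NumPy sketch of that normalization, using toy vertices:

import numpy as np

# Shift the vertices so the bounding-box centre sits at the origin,
# then divide by the longest bounding-box edge (same idea as init_mesh).
verts = np.random.rand(100, 3) * 5.0 + 2.0         # toy vertex positions

bbox_min, bbox_max = verts.min(axis=0), verts.max(axis=0)
center = (bbox_min + bbox_max) / 2.0
scale = (bbox_max - bbox_min).max()

verts_normalized = (verts - center) / scale         # now fits inside a unit cube
assert (np.abs(verts_normalized) <= 0.5 + 1e-6).all()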
text2tex/lib/projection_helper.py
ADDED
@@ -0,0 +1,464 @@
import os
import torch

import cv2
import random

import numpy as np

from torchvision import transforms

from pytorch3d.renderer import TexturesUV
from pytorch3d.ops import interpolate_face_attributes

from PIL import Image

from tqdm import tqdm

# customized
import sys
sys.path.append(".")

from lib.camera_helper import init_camera
from lib.render_helper import init_renderer, render
from lib.shading_helper import (
    BlendParams,
    init_soft_phong_shader,
    init_flat_texel_shader,
)
from lib.vis_helper import visualize_outputs, visualize_quad_mask
from lib.constants import *


def get_all_4_locations(values_y, values_x):
    y_0 = torch.floor(values_y)
    y_1 = torch.ceil(values_y)
    x_0 = torch.floor(values_x)
    x_1 = torch.ceil(values_x)

    return torch.cat([y_0, y_0, y_1, y_1], 0).long(), torch.cat([x_0, x_1, x_0, x_1], 0).long()


def compose_quad_mask(new_mask_image, update_mask_image, old_mask_image, device):
    """
        compose quad mask:
            -> 0: background
            -> 1: old
            -> 2: update
            -> 3: new
    """

    new_mask_tensor = transforms.ToTensor()(new_mask_image).to(device)
    update_mask_tensor = transforms.ToTensor()(update_mask_image).to(device)
    old_mask_tensor = transforms.ToTensor()(old_mask_image).to(device)

    all_mask_tensor = new_mask_tensor + update_mask_tensor + old_mask_tensor

    quad_mask_tensor = torch.zeros_like(all_mask_tensor)
    quad_mask_tensor[old_mask_tensor == 1] = 1
    quad_mask_tensor[update_mask_tensor == 1] = 2
    quad_mask_tensor[new_mask_tensor == 1] = 3

    return old_mask_tensor, update_mask_tensor, new_mask_tensor, all_mask_tensor, quad_mask_tensor


def compute_view_heat(similarity_tensor, quad_mask_tensor):
    num_total_pixels = quad_mask_tensor.reshape(-1).shape[0]
    heat = 0
    for idx in QUAD_WEIGHTS:
        heat += (quad_mask_tensor == idx).sum() * QUAD_WEIGHTS[idx] / num_total_pixels

    return heat


def select_viewpoint(selected_view_ids, view_punishments,
    mode, dist_list, elev_list, azim_list, sector_list, view_idx,
    similarity_texture_cache, exist_texture,
    mesh, faces, verts_uvs,
    image_size, faces_per_pixel,
    init_image_dir, mask_image_dir, normal_map_dir, depth_map_dir, similarity_map_dir,
    device, use_principle=False
):
    if mode == "sequential":

        num_views = len(dist_list)

        dist = dist_list[view_idx % num_views]
        elev = elev_list[view_idx % num_views]
        azim = azim_list[view_idx % num_views]
        sector = sector_list[view_idx % num_views]

        selected_view_ids.append(view_idx % num_views)

    elif mode == "heuristic":

        if use_principle and view_idx < 6:

            selected_view_idx = view_idx

        else:

            selected_view_idx = None
            max_heat = 0

            print("=> selecting next view...")
            view_heat_list = []
            for sample_idx in tqdm(range(len(dist_list))):

                view_heat, *_ = render_one_view_and_build_masks(dist_list[sample_idx], elev_list[sample_idx], azim_list[sample_idx],
                    sample_idx, sample_idx, view_punishments,
                    similarity_texture_cache, exist_texture,
                    mesh, faces, verts_uvs,
                    image_size, faces_per_pixel,
                    init_image_dir, mask_image_dir, normal_map_dir, depth_map_dir, similarity_map_dir,
                    device)

                if view_heat > max_heat:
                    selected_view_idx = sample_idx
                    max_heat = view_heat

                view_heat_list.append(view_heat.item())

            print(view_heat_list)
            print("select view {} with heat {}".format(selected_view_idx, max_heat))


        dist = dist_list[selected_view_idx]
        elev = elev_list[selected_view_idx]
        azim = azim_list[selected_view_idx]
        sector = sector_list[selected_view_idx]

        selected_view_ids.append(selected_view_idx)

        view_punishments[selected_view_idx] *= 0.01

    elif mode == "random":

        selected_view_idx = random.choice(range(len(dist_list)))

        dist = dist_list[selected_view_idx]
        elev = elev_list[selected_view_idx]
        azim = azim_list[selected_view_idx]
        sector = sector_list[selected_view_idx]

        selected_view_ids.append(selected_view_idx)

    else:
        raise NotImplementedError()

    return dist, elev, azim, sector, selected_view_ids, view_punishments


@torch.no_grad()
def build_backproject_mask(mesh, faces, verts_uvs,
    cameras, reference_image, faces_per_pixel,
    image_size, uv_size, device):
    # construct pixel UVs
    renderer_scaled = init_renderer(cameras,
        shader=init_soft_phong_shader(
            camera=cameras,
            blend_params=BlendParams(),
            device=device),
        image_size=image_size,
        faces_per_pixel=faces_per_pixel
    )
    fragments_scaled = renderer_scaled.rasterizer(mesh)

    # get UV coordinates for each pixel
    faces_verts_uvs = verts_uvs[faces.textures_idx]

    pixel_uvs = interpolate_face_attributes(
        fragments_scaled.pix_to_face, fragments_scaled.bary_coords, faces_verts_uvs
    )  # NxHsxWsxKx2
    pixel_uvs = pixel_uvs.permute(0, 3, 1, 2, 4).reshape(-1, 2)

    texture_locations_y, texture_locations_x = get_all_4_locations(
        (1 - pixel_uvs[:, 1]).reshape(-1) * (uv_size - 1),
        pixel_uvs[:, 0].reshape(-1) * (uv_size - 1)
    )

    K = faces_per_pixel

    texture_values = torch.from_numpy(np.array(reference_image.resize((image_size, image_size)))).float() / 255.
    texture_values = texture_values.to(device).unsqueeze(0).expand([4, -1, -1, -1]).unsqueeze(0).expand([K, -1, -1, -1, -1])

    # texture
    texture_tensor = torch.zeros(uv_size, uv_size, 3).to(device)
    texture_tensor[texture_locations_y, texture_locations_x, :] = texture_values.reshape(-1, 3)

    return texture_tensor[:, :, 0]


@torch.no_grad()
def build_diffusion_mask(mesh_stuff,
    renderer, exist_texture, similarity_texture_cache, target_value, device, image_size,
    smooth_mask=False, view_threshold=0.01):

    mesh, faces, verts_uvs = mesh_stuff
    mask_mesh = mesh.clone()  # NOTE in-place operation - DANGER!!!

    # visible mask => the whole region
    exist_texture_expand = exist_texture.unsqueeze(0).unsqueeze(-1).expand(-1, -1, -1, 3).to(device)
    mask_mesh.textures = TexturesUV(
        maps=torch.ones_like(exist_texture_expand),
        faces_uvs=faces.textures_idx[None, ...],
        verts_uvs=verts_uvs[None, ...],
        sampling_mode="nearest"
    )
    # visible_mask_tensor, *_ = render(mask_mesh, renderer)
    visible_mask_tensor, _, similarity_map_tensor, *_ = render(mask_mesh, renderer)
    # faces that are too rotated away from the viewpoint will be treated as invisible
    valid_mask_tensor = (similarity_map_tensor >= view_threshold).float()
    visible_mask_tensor *= valid_mask_tensor

    # nonexist mask <=> new mask
    exist_texture_expand = exist_texture.unsqueeze(0).unsqueeze(-1).expand(-1, -1, -1, 3).to(device)
    mask_mesh.textures = TexturesUV(
        maps=1 - exist_texture_expand,
        faces_uvs=faces.textures_idx[None, ...],
        verts_uvs=verts_uvs[None, ...],
        sampling_mode="nearest"
    )
    new_mask_tensor, *_ = render(mask_mesh, renderer)
    new_mask_tensor *= valid_mask_tensor

    # exist mask => visible mask - new mask
    exist_mask_tensor = visible_mask_tensor - new_mask_tensor
    exist_mask_tensor[exist_mask_tensor < 0] = 0  # NOTE dilate can lead to overflow

    # all update mask
    mask_mesh.textures = TexturesUV(
        maps=(
            similarity_texture_cache.argmax(0) == target_value
            # # only consider the views that have already appeared before
            # similarity_texture_cache[0:target_value+1].argmax(0) == target_value
        ).float().unsqueeze(0).unsqueeze(-1).expand(-1, -1, -1, 3).to(device),
        faces_uvs=faces.textures_idx[None, ...],
        verts_uvs=verts_uvs[None, ...],
        sampling_mode="nearest"
    )
    all_update_mask_tensor, *_ = render(mask_mesh, renderer)

    # current update mask => intersection between all update mask and exist mask
    update_mask_tensor = exist_mask_tensor * all_update_mask_tensor

    # keep mask => exist mask - update mask
    old_mask_tensor = exist_mask_tensor - update_mask_tensor

    # convert
    new_mask = new_mask_tensor[0].cpu().float().permute(2, 0, 1)
    new_mask = transforms.ToPILImage()(new_mask).convert("L")

    update_mask = update_mask_tensor[0].cpu().float().permute(2, 0, 1)
    update_mask = transforms.ToPILImage()(update_mask).convert("L")

    old_mask = old_mask_tensor[0].cpu().float().permute(2, 0, 1)
    old_mask = transforms.ToPILImage()(old_mask).convert("L")

    exist_mask = exist_mask_tensor[0].cpu().float().permute(2, 0, 1)
    exist_mask = transforms.ToPILImage()(exist_mask).convert("L")

    return new_mask, update_mask, old_mask, exist_mask


@torch.no_grad()
def render_one_view(mesh,
    dist, elev, azim,
    image_size, faces_per_pixel,
    device):

    # render the view
    cameras = init_camera(
        dist, elev, azim,
        image_size, device
    )
    renderer = init_renderer(cameras,
        shader=init_soft_phong_shader(
            camera=cameras,
            blend_params=BlendParams(),
            device=device),
        image_size=image_size,
        faces_per_pixel=faces_per_pixel
    )

    init_images_tensor, normal_maps_tensor, similarity_tensor, depth_maps_tensor, fragments = render(mesh, renderer)

    return (
        cameras, renderer,
        init_images_tensor, normal_maps_tensor, similarity_tensor, depth_maps_tensor, fragments
    )


@torch.no_grad()
def build_similarity_texture_cache_for_all_views(mesh, faces, verts_uvs,
    dist_list, elev_list, azim_list,
    image_size, image_size_scaled, uv_size, faces_per_pixel,
    device):

    num_candidate_views = len(dist_list)
    similarity_texture_cache = torch.zeros(num_candidate_views, uv_size, uv_size).to(device)

    print("=> building similarity texture cache for all views...")
    for i in tqdm(range(num_candidate_views)):
        cameras, _, _, _, similarity_tensor, _, _ = render_one_view(mesh,
            dist_list[i], elev_list[i], azim_list[i],
            image_size, faces_per_pixel, device)

        similarity_texture_cache[i] = build_backproject_mask(mesh, faces, verts_uvs,
            cameras, transforms.ToPILImage()(similarity_tensor[0, :, :, 0]).convert("RGB"), faces_per_pixel,
            image_size_scaled, uv_size, device)

    return similarity_texture_cache


@torch.no_grad()
def render_one_view_and_build_masks(dist, elev, azim,
    selected_view_idx, view_idx, view_punishments,
    similarity_texture_cache, exist_texture,
    mesh, faces, verts_uvs,
    image_size, faces_per_pixel,
    init_image_dir, mask_image_dir, normal_map_dir, depth_map_dir, similarity_map_dir,
    device, save_intermediate=False, smooth_mask=False, view_threshold=0.01):

    # render the view
    (
        cameras, renderer,
        init_images_tensor, normal_maps_tensor, similarity_tensor, depth_maps_tensor, fragments
    ) = render_one_view(mesh,
        dist, elev, azim,
        image_size, faces_per_pixel,
        device
    )

    init_image = init_images_tensor[0].cpu()
    init_image = init_image.permute(2, 0, 1)
    init_image = transforms.ToPILImage()(init_image).convert("RGB")

    normal_map = normal_maps_tensor[0].cpu()
    normal_map = normal_map.permute(2, 0, 1)
    normal_map = transforms.ToPILImage()(normal_map).convert("RGB")

    depth_map = depth_maps_tensor[0].cpu().numpy()
    depth_map = Image.fromarray(depth_map).convert("L")

    similarity_map = similarity_tensor[0, :, :, 0].cpu()
    similarity_map = transforms.ToPILImage()(similarity_map).convert("L")


    flat_renderer = init_renderer(cameras,
        shader=init_flat_texel_shader(
            camera=cameras,
            device=device),
        image_size=image_size,
        faces_per_pixel=faces_per_pixel
    )
    new_mask_image, update_mask_image, old_mask_image, exist_mask_image = build_diffusion_mask(
        (mesh, faces, verts_uvs),
        flat_renderer, exist_texture, similarity_texture_cache, selected_view_idx, device, image_size,
        smooth_mask=smooth_mask, view_threshold=view_threshold
    )
    # NOTE the view idx is the absolute idx in the sample space (i.e. `selected_view_idx`)
    # it should match with `similarity_texture_cache`

    (
        old_mask_tensor,
        update_mask_tensor,
        new_mask_tensor,
        all_mask_tensor,
        quad_mask_tensor
    ) = compose_quad_mask(new_mask_image, update_mask_image, old_mask_image, device)

    view_heat = compute_view_heat(similarity_tensor, quad_mask_tensor)
    view_heat *= view_punishments[selected_view_idx]

    # save intermediate results
    if save_intermediate:
        init_image.save(os.path.join(init_image_dir, "{}.png".format(view_idx)))
        normal_map.save(os.path.join(normal_map_dir, "{}.png".format(view_idx)))
        depth_map.save(os.path.join(depth_map_dir, "{}.png".format(view_idx)))
        similarity_map.save(os.path.join(similarity_map_dir, "{}.png".format(view_idx)))

        new_mask_image.save(os.path.join(mask_image_dir, "{}_new.png".format(view_idx)))
        update_mask_image.save(os.path.join(mask_image_dir, "{}_update.png".format(view_idx)))
        old_mask_image.save(os.path.join(mask_image_dir, "{}_old.png".format(view_idx)))
        exist_mask_image.save(os.path.join(mask_image_dir, "{}_exist.png".format(view_idx)))

        visualize_quad_mask(mask_image_dir, quad_mask_tensor, view_idx, view_heat, device)

    return (
        view_heat,
        renderer, cameras, fragments,
        init_image, normal_map, depth_map,
        init_images_tensor, normal_maps_tensor, depth_maps_tensor, similarity_tensor,
        old_mask_image, update_mask_image, new_mask_image,
        old_mask_tensor, update_mask_tensor, new_mask_tensor, all_mask_tensor, quad_mask_tensor
    )



@torch.no_grad()
def backproject_from_image(mesh, faces, verts_uvs, cameras,
    reference_image, new_mask_image, update_mask_image,
    init_texture, exist_texture,
    image_size, uv_size, faces_per_pixel,
    device):

    # construct pixel UVs
    renderer_scaled = init_renderer(cameras,
        shader=init_soft_phong_shader(
            camera=cameras,
            blend_params=BlendParams(),
            device=device),
        image_size=image_size,
        faces_per_pixel=faces_per_pixel
    )
    fragments_scaled = renderer_scaled.rasterizer(mesh)

    # get UV coordinates for each pixel
    faces_verts_uvs = verts_uvs[faces.textures_idx]

    pixel_uvs = interpolate_face_attributes(
        fragments_scaled.pix_to_face, fragments_scaled.bary_coords, faces_verts_uvs
    )  # NxHsxWsxKx2
    pixel_uvs = pixel_uvs.permute(0, 3, 1, 2, 4).reshape(pixel_uvs.shape[-2], pixel_uvs.shape[1], pixel_uvs.shape[2], 2)

    # the update mask has to be on top of the diffusion mask
    new_mask_image_tensor = transforms.ToTensor()(new_mask_image).to(device).unsqueeze(-1)
    update_mask_image_tensor = transforms.ToTensor()(update_mask_image).to(device).unsqueeze(-1)

    project_mask_image_tensor = torch.logical_or(update_mask_image_tensor, new_mask_image_tensor).float()
    project_mask_image = project_mask_image_tensor * 255.
    project_mask_image = Image.fromarray(project_mask_image[0, :, :, 0].cpu().numpy().astype(np.uint8))

    project_mask_image_scaled = project_mask_image.resize(
        (image_size, image_size),
        Image.Resampling.NEAREST
    )
    project_mask_image_tensor_scaled = transforms.ToTensor()(project_mask_image_scaled).to(device)

    pixel_uvs_masked = pixel_uvs[project_mask_image_tensor_scaled == 1]

    texture_locations_y, texture_locations_x = get_all_4_locations(
        (1 - pixel_uvs_masked[:, 1]).reshape(-1) * (uv_size - 1),
        pixel_uvs_masked[:, 0].reshape(-1) * (uv_size - 1)
    )

    K = pixel_uvs.shape[0]
    project_mask_image_tensor_scaled = project_mask_image_tensor_scaled[:, None, :, :, None].repeat(1, 4, 1, 1, 3)

    texture_values = torch.from_numpy(np.array(reference_image.resize((image_size, image_size))))
    texture_values = texture_values.to(device).unsqueeze(0).expand([4, -1, -1, -1]).unsqueeze(0).expand([K, -1, -1, -1, -1])

    texture_values_masked = texture_values.reshape(-1, 3)[project_mask_image_tensor_scaled.reshape(-1, 3) == 1].reshape(-1, 3)

    # texture
    texture_tensor = torch.from_numpy(np.array(init_texture)).to(device)
    texture_tensor[texture_locations_y, texture_locations_x, :] = texture_values_masked

    init_texture = Image.fromarray(texture_tensor.cpu().numpy().astype(np.uint8))

    # update texture cache
    exist_texture[texture_locations_y, texture_locations_x] = 1

    return init_texture, project_mask_image, exist_texture
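Side note (not part of the committed file): get_all_4_locations splats each continuous UV sample onto the four surrounding integer texel locations, which is how the back-projection fills the texture without interpolation gaps. A minimal standalone sketch with toy coordinates:

import torch

uv_size = 8
values_y = torch.tensor([2.3, 5.0])                 # toy continuous texel rows
values_x = torch.tensor([4.7, 1.5])                 # toy continuous texel columns

# Same pairing as get_all_4_locations: (y0,x0), (y0,x1), (y1,x0), (y1,x1).
ys = torch.cat([values_y.floor(), values_y.floor(), values_y.ceil(), values_y.ceil()]).long()
xs = torch.cat([values_x.floor(), values_x.ceil(), values_x.floor(), values_x.ceil()]).long()

texture = torch.zeros(uv_size, uv_size)
texture[ys, xs] = 1.0                               # mark every texel touched by the samples
print(texture)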
text2tex/lib/render_helper.py
ADDED
@@ -0,0 +1,108 @@
import os
import torch

import cv2

import numpy as np

from PIL import Image

from torchvision import transforms
from pytorch3d.ops import interpolate_face_attributes
from pytorch3d.renderer import (
    RasterizationSettings,
    MeshRendererWithFragments,
    MeshRasterizer,
)

# customized
import sys
sys.path.append(".")


def init_renderer(camera, shader, image_size, faces_per_pixel):
    raster_settings = RasterizationSettings(image_size=image_size, faces_per_pixel=faces_per_pixel)
    renderer = MeshRendererWithFragments(
        rasterizer=MeshRasterizer(
            cameras=camera,
            raster_settings=raster_settings
        ),
        shader=shader
    )

    return renderer


@torch.no_grad()
def render(mesh, renderer, pad_value=10):
    def phong_normal_shading(meshes, fragments) -> torch.Tensor:
        faces = meshes.faces_packed()  # (F, 3)
        vertex_normals = meshes.verts_normals_packed()  # (V, 3)
        faces_normals = vertex_normals[faces]
        pixel_normals = interpolate_face_attributes(
            fragments.pix_to_face, fragments.bary_coords, faces_normals
        )

        return pixel_normals

    def similarity_shading(meshes, fragments):
        faces = meshes.faces_packed()  # (F, 3)
        vertex_normals = meshes.verts_normals_packed()  # (V, 3)
        faces_normals = vertex_normals[faces]
        vertices = meshes.verts_packed()  # (V, 3)
        face_positions = vertices[faces]
        view_directions = torch.nn.functional.normalize((renderer.shader.cameras.get_camera_center().reshape(1, 1, 3) - face_positions), p=2, dim=2)
        cosine_similarity = torch.nn.CosineSimilarity(dim=2)(faces_normals, view_directions)
        pixel_similarity = interpolate_face_attributes(
            fragments.pix_to_face, fragments.bary_coords, cosine_similarity.unsqueeze(-1)
        )

        return pixel_similarity

    def get_relative_depth_map(fragments, pad_value=pad_value):
        absolute_depth = fragments.zbuf[..., 0]  # B, H, W
        no_depth = -1

        depth_min, depth_max = absolute_depth[absolute_depth != no_depth].min(), absolute_depth[absolute_depth != no_depth].max()
        target_min, target_max = 50, 255

        depth_value = absolute_depth[absolute_depth != no_depth]
        depth_value = depth_max - depth_value  # reverse values

        depth_value /= (depth_max - depth_min)
        depth_value = depth_value * (target_max - target_min) + target_min

        relative_depth = absolute_depth.clone()
        relative_depth[absolute_depth != no_depth] = depth_value
        relative_depth[absolute_depth == no_depth] = pad_value  # not completely black

        return relative_depth


    images, fragments = renderer(mesh)
    normal_maps = phong_normal_shading(mesh, fragments).squeeze(-2)
    similarity_maps = similarity_shading(mesh, fragments).squeeze(-2)  # -1 - 1
    depth_maps = get_relative_depth_map(fragments)

    # normalize similarity mask to 0 - 1
    similarity_maps = torch.abs(similarity_maps)  # 0 - 1

    # HACK erode, eliminate isolated dots
    non_zero_similarity = (similarity_maps > 0).float()
    non_zero_similarity = (non_zero_similarity * 255.).cpu().numpy().astype(np.uint8)[0]
    non_zero_similarity = cv2.erode(non_zero_similarity, kernel=np.ones((3, 3), np.uint8), iterations=2)
    non_zero_similarity = torch.from_numpy(non_zero_similarity).to(similarity_maps.device).unsqueeze(0) / 255.
    similarity_maps = non_zero_similarity.unsqueeze(-1) * similarity_maps

    return images, normal_maps, similarity_maps, depth_maps, fragments


@torch.no_grad()
def check_visible_faces(mesh, fragments):
    pix_to_face = fragments.pix_to_face

    # Indices of unique visible faces
    visible_map = pix_to_face.unique()  # (num_visible_faces)

    return visible_map
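Side note (not part of the committed file): get_relative_depth_map reverses the valid z-buffer values and rescales them to [50, 255], filling empty pixels with a small pad value so the background is not pure black. A minimal standalone sketch with a toy depth buffer:

import torch

absolute_depth = torch.tensor([[1.0, 2.0], [3.0, -1.0]])   # toy zbuf; -1 = no hit
no_depth, pad_value = -1, 10
target_min, target_max = 50, 255

valid = absolute_depth != no_depth
d = absolute_depth[valid]
d = (d.max() - d) / (d.max() - d.min())                    # reverse and normalise to [0, 1]
d = d * (target_max - target_min) + target_min             # map to [50, 255]

relative_depth = absolute_depth.clone()
relative_depth[valid] = d
relative_depth[~valid] = pad_value
print(relative_depth)                                      # [[255.0, 152.5], [50.0, 10.0]]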