gokaygokay committed
Commit e62f618 · 1 Parent(s): 565c7be
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +2 -3
  2. app.py +0 -284
  3. assets/logo.png +0 -0
  4. assets/overview_3.png +0 -0
  5. assets/radar.png +0 -0
  6. assets/runtime.png +0 -0
  7. assets/teaser.png +0 -3
  8. demos/example_000.png +0 -0
  9. demos/example_001.png +0 -0
  10. demos/example_002.png +0 -0
  11. demos/example_003.png +0 -3
  12. demos/example_list.txt +0 -2
  13. infer/__init__.py +0 -28
  14. infer/gif_render.py +0 -55
  15. infer/image_to_views.py +0 -81
  16. infer/rembg.py +0 -26
  17. infer/text_to_image.py +0 -80
  18. infer/utils.py +0 -77
  19. infer/views_to_mesh.py +0 -94
  20. mvd/__init__.py +0 -0
  21. mvd/hunyuan3d_mvd_lite_pipeline.py +0 -493
  22. mvd/hunyuan3d_mvd_std_pipeline.py +0 -471
  23. mvd/utils.py +0 -85
  24. requirements.txt +0 -22
  25. scripts/image_to_3d.sh +0 -8
  26. scripts/image_to_3d_demo.sh +0 -8
  27. scripts/image_to_3d_fast.sh +0 -6
  28. scripts/image_to_3d_fast_demo.sh +0 -6
  29. scripts/text_to_3d.sh +0 -7
  30. scripts/text_to_3d_demo.sh +0 -7
  31. scripts/text_to_3d_fast.sh +0 -6
  32. scripts/text_to_3d_fast_demo.sh +0 -6
  33. svrm/.DS_Store +0 -0
  34. svrm/configs/2024-10-24T22-36-18-project.yaml +0 -32
  35. svrm/configs/svrm.yaml +0 -32
  36. svrm/ldm/.DS_Store +0 -0
  37. svrm/ldm/models/svrm.py +0 -263
  38. svrm/ldm/modules/attention.py +0 -457
  39. svrm/ldm/modules/encoders/__init__.py +0 -0
  40. svrm/ldm/modules/encoders/dinov2/__init__.py +0 -0
  41. svrm/ldm/modules/encoders/dinov2/hub/__init__.py +0 -0
  42. svrm/ldm/modules/encoders/dinov2/hub/backbones.py +0 -156
  43. svrm/ldm/modules/encoders/dinov2/hub/utils.py +0 -39
  44. svrm/ldm/modules/encoders/dinov2/layers/__init__.py +0 -11
  45. svrm/ldm/modules/encoders/dinov2/layers/attention.py +0 -89
  46. svrm/ldm/modules/encoders/dinov2/layers/block.py +0 -269
  47. svrm/ldm/modules/encoders/dinov2/layers/dino_head.py +0 -58
  48. svrm/ldm/modules/encoders/dinov2/layers/drop_path.py +0 -34
  49. svrm/ldm/modules/encoders/dinov2/layers/layer_scale.py +0 -27
  50. svrm/ldm/modules/encoders/dinov2/layers/mlp.py +0 -40
README.md CHANGED
@@ -1,11 +1,10 @@
 ---
-title: Hunyuan3D-1.0
+title: Image Procesing
 emoji: 😻
 colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 4.42.0
+sdk_version: 5.3.0
 app_file: app.py
 pinned: false
-short_description: Text-to-3D and Image-to-3D Generation
 ---
app.py DELETED
@@ -1,284 +0,0 @@
1
- import os
2
- import warnings
3
- from huggingface_hub import hf_hub_download
4
- import gradio as gr
5
- from glob import glob
6
- import shutil
7
- import torch
8
- import numpy as np
9
- from PIL import Image
10
- from einops import rearrange
11
- import argparse
12
-
13
- # Suppress warnings
14
- warnings.simplefilter('ignore', category=UserWarning)
15
- warnings.simplefilter('ignore', category=FutureWarning)
16
- warnings.simplefilter('ignore', category=DeprecationWarning)
17
-
18
- def download_models():
19
- # Create weights directory if it doesn't exist
20
- os.makedirs("weights", exist_ok=True)
21
- os.makedirs("weights/hunyuanDiT", exist_ok=True)
22
-
23
- # Download Hunyuan3D-1 model
24
- try:
25
- hf_hub_download(
26
- repo_id="tencent/Hunyuan3D-1",
27
- local_dir="./weights",
28
- resume_download=True
29
- )
30
- print("Successfully downloaded Hunyuan3D-1 model")
31
- except Exception as e:
32
- print(f"Error downloading Hunyuan3D-1: {e}")
33
-
34
- # Download HunyuanDiT model
35
- try:
36
- hf_hub_download(
37
- repo_id="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
38
- local_dir="./weights/hunyuanDiT",
39
- resume_download=True
40
- )
41
- print("Successfully downloaded HunyuanDiT model")
42
- except Exception as e:
43
- print(f"Error downloading HunyuanDiT: {e}")
44
-
45
- # Download models before starting the app
46
- download_models()
47
-
48
- # Parse arguments
49
- parser = argparse.ArgumentParser()
50
- parser.add_argument("--use_lite", default=False, action="store_true")
51
- parser.add_argument("--mv23d_cfg_path", default="./svrm/configs/svrm.yaml", type=str)
52
- parser.add_argument("--mv23d_ckt_path", default="weights/svrm/svrm.safetensors", type=str)
53
- parser.add_argument("--text2image_path", default="weights/hunyuanDiT", type=str)
54
- parser.add_argument("--save_memory", default=False, action="store_true")
55
- parser.add_argument("--device", default="cuda:0", type=str)
56
- args = parser.parse_args()
57
-
58
- # Constants
59
- CONST_PORT = 8080
60
- CONST_MAX_QUEUE = 1
61
- CONST_SERVER = '0.0.0.0'
62
-
63
- CONST_HEADER = '''
64
- <h2><b>Official 🤗 Gradio Demo</b></h2>
65
- <h2><a href='https://github.com/tencent/Hunyuan3D-1' target='_blank'>
66
- <b>Hunyuan3D-1.0: A Unified Framework for Text-to-3D and Image-to-3D Generation</b></a></h2>
67
- '''
68
-
69
- # Helper functions
70
- def get_example_img_list():
71
- print('Loading example img list ...')
72
- return sorted(glob('./demos/example_*.png'))
73
-
74
- def get_example_txt_list():
75
- print('Loading example txt list ...')
76
- txt_list = []
77
- for line in open('./demos/example_list.txt'):
78
- txt_list.append(line.strip())
79
- return txt_list
80
-
81
- example_is = get_example_img_list()
82
- example_ts = get_example_txt_list()
83
-
84
- # Import required workers
85
- from infer import seed_everything, save_gif
86
- from infer import Text2Image, Removebg, Image2Views, Views2Mesh, GifRenderer
87
-
88
- # Initialize workers
89
- worker_xbg = Removebg()
90
- print(f"loading {args.text2image_path}")
91
- worker_t2i = Text2Image(
92
- pretrain=args.text2image_path,
93
- device=args.device,
94
- save_memory=args.save_memory
95
- )
96
- worker_i2v = Image2Views(
97
- use_lite=args.use_lite,
98
- device=args.device
99
- )
100
- worker_v23 = Views2Mesh(
101
- args.mv23d_cfg_path,
102
- args.mv23d_ckt_path,
103
- use_lite=args.use_lite,
104
- device=args.device
105
- )
106
- worker_gif = GifRenderer(args.device)
107
-
108
- # Pipeline stages
109
- def stage_0_t2i(text, image, seed, step):
110
- os.makedirs('./outputs/app_output', exist_ok=True)
111
- exists = set(int(_) for _ in os.listdir('./outputs/app_output') if not _.startswith("."))
112
- cur_id = min(set(range(30)) - exists) if len(exists) < 30 else 0
113
-
114
- if os.path.exists(f"./outputs/app_output/{(cur_id + 1) % 30}"):
115
- shutil.rmtree(f"./outputs/app_output/{(cur_id + 1) % 30}")
116
- save_folder = f'./outputs/app_output/{cur_id}'
117
- os.makedirs(save_folder, exist_ok=True)
118
-
119
- dst = save_folder + '/img.png'
120
-
121
- if not text:
122
- if image is None:
123
- return dst, save_folder
124
- image.save(dst)
125
- return dst, save_folder
126
-
127
- image = worker_t2i(text, seed, step)
128
- image.save(dst)
129
- dst = worker_xbg(image, save_folder)
130
- return dst, save_folder
131
-
132
- def stage_1_xbg(image, save_folder):
133
- if isinstance(image, str):
134
- image = Image.open(image)
135
- dst = save_folder + '/img_nobg.png'
136
- rgba = worker_xbg(image)
137
- rgba.save(dst)
138
- return dst
139
-
140
- def stage_2_i2v(image, seed, step, save_folder):
141
- if isinstance(image, str):
142
- image = Image.open(image)
143
- gif_dst = save_folder + '/views.gif'
144
- res_img, pils = worker_i2v(image, seed, step)
145
- save_gif(pils, gif_dst)
146
- views_img, cond_img = res_img[0], res_img[1]
147
- img_array = np.asarray(views_img, dtype=np.uint8)
148
- show_img = rearrange(img_array, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
149
- show_img = show_img[worker_i2v.order, ...]
150
- show_img = rearrange(show_img, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
151
- show_img = Image.fromarray(show_img)
152
- return views_img, cond_img, show_img
153
-
154
- def stage_3_v23(views_pil, cond_pil, seed, save_folder, target_face_count=30000,
155
- do_texture_mapping=True, do_render=True):
156
- do_texture_mapping = do_texture_mapping or do_render
157
- obj_dst = save_folder + '/mesh_with_colors.obj'
158
- glb_dst = save_folder + '/mesh.glb'
159
- worker_v23(
160
- views_pil,
161
- cond_pil,
162
- seed=seed,
163
- save_folder=save_folder,
164
- target_face_count=target_face_count,
165
- do_texture_mapping=do_texture_mapping
166
- )
167
- return obj_dst, glb_dst
168
-
169
- def stage_4_gif(obj_dst, save_folder, do_render_gif=True):
170
- if not do_render_gif:
171
- return None
172
- gif_dst = save_folder + '/output.gif'
173
- worker_gif(
174
- save_folder + '/mesh.obj',
175
- gif_dst_path=gif_dst
176
- )
177
- return gif_dst
178
-
179
- # Gradio Interface
180
- with gr.Blocks() as demo:
181
- gr.Markdown(CONST_HEADER)
182
-
183
- with gr.Row(variant="panel"):
184
- with gr.Column(scale=2):
185
- with gr.Tab("Text to 3D"):
186
- with gr.Column():
187
- text = gr.TextArea('一只黑白相间的熊猫在白色背景上居中坐着,呈现出卡通风格和可爱氛围。',
188
- lines=1, max_lines=10, label='Input text')
189
- with gr.Row():
190
- textgen_seed = gr.Number(value=0, label="T2I seed", precision=0)
191
- textgen_step = gr.Number(value=25, label="T2I step", precision=0)
192
- textgen_SEED = gr.Number(value=0, label="Gen seed", precision=0)
193
- textgen_STEP = gr.Number(value=50, label="Gen step", precision=0)
194
- textgen_max_faces = gr.Number(value=90000, label="max number of faces", precision=0)
195
-
196
- with gr.Row():
197
- textgen_do_texture_mapping = gr.Checkbox(label="texture mapping", value=False)
198
- textgen_do_render_gif = gr.Checkbox(label="Render gif", value=False)
199
- textgen_submit = gr.Button("Generate", variant="primary")
200
-
201
- gr.Examples(examples=example_ts, inputs=[text], label="Txt examples")
202
-
203
- with gr.Tab("Image to 3D"):
204
- with gr.Column():
205
- input_image = gr.Image(label="Input image", width=256, height=256,
206
- type="pil", image_mode="RGBA", sources="upload")
207
- with gr.Row():
208
- imggen_SEED = gr.Number(value=0, label="Gen seed", precision=0)
209
- imggen_STEP = gr.Number(value=50, label="Gen step", precision=0)
210
- imggen_max_faces = gr.Number(value=90000, label="max number of faces", precision=0)
211
-
212
- with gr.Row():
213
- imggen_do_texture_mapping = gr.Checkbox(label="texture mapping", value=False)
214
- imggen_do_render_gif = gr.Checkbox(label="Render gif", value=False)
215
- imggen_submit = gr.Button("Generate", variant="primary")
216
-
217
- gr.Examples(examples=example_is, inputs=[input_image], label="Img examples")
218
-
219
- with gr.Column(scale=3):
220
- with gr.Tab("rembg image"):
221
- rem_bg_image = gr.Image(label="No background image", width=256, height=256,
222
- type="pil", image_mode="RGBA")
223
-
224
- with gr.Tab("Multi views"):
225
- result_image = gr.Image(label="Multi views", type="pil")
226
- with gr.Tab("Obj"):
227
- result_3dobj = gr.Model3D(label="Output obj")
228
- with gr.Tab("Glb"):
229
- result_3dglb = gr.Model3D(label="Output glb")
230
- with gr.Tab("GIF"):
231
- result_gif = gr.Image(label="Rendered GIF")
232
-
233
- # States
234
- none = gr.State(None)
235
- save_folder = gr.State()
236
- cond_image = gr.State()
237
- views_image = gr.State()
238
- text_image = gr.State()
239
-
240
- # Event handlers
241
- textgen_submit.click(
242
- fn=stage_0_t2i,
243
- inputs=[text, none, textgen_seed, textgen_step],
244
- outputs=[rem_bg_image, save_folder],
245
- ).success(
246
- fn=stage_2_i2v,
247
- inputs=[rem_bg_image, textgen_SEED, textgen_STEP, save_folder],
248
- outputs=[views_image, cond_image, result_image],
249
- ).success(
250
- fn=stage_3_v23,
251
- inputs=[views_image, cond_image, textgen_SEED, save_folder, textgen_max_faces,
252
- textgen_do_texture_mapping, textgen_do_render_gif],
253
- outputs=[result_3dobj, result_3dglb],
254
- ).success(
255
- fn=stage_4_gif,
256
- inputs=[result_3dglb, save_folder, textgen_do_render_gif],
257
- outputs=[result_gif],
258
- )
259
-
260
- imggen_submit.click(
261
- fn=stage_0_t2i,
262
- inputs=[none, input_image, textgen_seed, textgen_step],
263
- outputs=[text_image, save_folder],
264
- ).success(
265
- fn=stage_1_xbg,
266
- inputs=[text_image, save_folder],
267
- outputs=[rem_bg_image],
268
- ).success(
269
- fn=stage_2_i2v,
270
- inputs=[rem_bg_image, imggen_SEED, imggen_STEP, save_folder],
271
- outputs=[views_image, cond_image, result_image],
272
- ).success(
273
- fn=stage_3_v23,
274
- inputs=[views_image, cond_image, imggen_SEED, save_folder, imggen_max_faces,
275
- imggen_do_texture_mapping, imggen_do_render_gif],
276
- outputs=[result_3dobj, result_3dglb],
277
- ).success(
278
- fn=stage_4_gif,
279
- inputs=[result_3dglb, save_folder, imggen_do_render_gif],
280
- outputs=[result_gif],
281
- )
282
-
283
- demo.queue(max_size=CONST_MAX_QUEUE)
284
- demo.launch(server_name=CONST_SERVER, server_port=CONST_PORT)
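
A side note on the deleted download_models() above (my observation, not part of the commit): huggingface_hub's hf_hub_download fetches a single file and expects a filename argument, so pulling down whole weight repositories the way that function intends is normally done with snapshot_download instead. A minimal sketch, reusing the repo IDs from the deleted code:

```python
# Hedged sketch -- not from the commit. Downloads the full repositories into
# the same local directories the deleted app.py expects.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="tencent/Hunyuan3D-1", local_dir="./weights")
snapshot_download(
    repo_id="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
    local_dir="./weights/hunyuanDiT",
)
```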
 
assets/logo.png DELETED
Binary file (314 kB)
 
assets/overview_3.png DELETED
Binary file (271 kB)
 
assets/radar.png DELETED
Binary file (122 kB)
 
assets/runtime.png DELETED
Binary file (38.4 kB)
 
assets/teaser.png DELETED

Git LFS Details

  • SHA256: af24eeebe39864d377b7ef8e11521a8b7cba964c14032cc28bd0d95bd5219c00
  • Pointer size: 132 Bytes
  • Size of remote file: 3.1 MB
demos/example_000.png DELETED
Binary file (659 kB)
 
demos/example_001.png DELETED
Binary file (817 kB)
 
demos/example_002.png DELETED
Binary file (339 kB)
 
demos/example_003.png DELETED

Git LFS Details

  • SHA256: d947e0ef10baf761abb78d2842519ae7428bc6eadab26a159510ddcaf2a47e67
  • Pointer size: 132 Bytes
  • Size of remote file: 1.07 MB
demos/example_list.txt DELETED
@@ -1,2 +0,0 @@
1
- a pot of green plants grows in a red flower pot.
2
- a lovely rabbit eating carrots
 
infer/__init__.py DELETED
@@ -1,28 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- from .utils import seed_everything, timing_decorator, auto_amp_inference
24
- from .rembg import Removebg
25
- from .text_to_image import Text2Image
26
- from .image_to_views import Image2Views, save_gif
27
- from .views_to_mesh import Views2Mesh
28
- from .gif_render import GifRenderer
 
infer/gif_render.py DELETED
@@ -1,55 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- from svrm.ldm.vis_util import render
24
- from .utils import seed_everything, timing_decorator
25
-
26
- class GifRenderer():
27
- '''
28
- render frame(s) of mesh using pytorch3d
29
- '''
30
- def __init__(self, device="cuda:0"):
31
- self.device = device
32
-
33
- @timing_decorator("gif render")
34
- def __call__(
35
- self,
36
- obj_filename,
37
- elev=0,
38
- azim=0,
39
- resolution=512,
40
- gif_dst_path='',
41
- n_views=120,
42
- fps=30,
43
- rgb=True
44
- ):
45
- render(
46
- obj_filename,
47
- elev=elev,
48
- azim=azim,
49
- resolution=resolution,
50
- gif_dst_path=gif_dst_path,
51
- n_views=n_views,
52
- fps=fps,
53
- device=self.device,
54
- rgb=rgb
55
- )
 
infer/image_to_views.py DELETED
@@ -1,81 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import os
24
- import time
25
- import torch
26
- import random
27
- import numpy as np
28
- from PIL import Image
29
- from einops import rearrange
30
- from PIL import Image, ImageSequence
31
-
32
- from .utils import seed_everything, timing_decorator, auto_amp_inference
33
- from .utils import get_parameter_number, set_parameter_grad_false
34
- from mvd.hunyuan3d_mvd_std_pipeline import HunYuan3D_MVD_Std_Pipeline
35
- from mvd.hunyuan3d_mvd_lite_pipeline import Hunyuan3d_MVD_Lite_Pipeline
36
-
37
-
38
- def save_gif(pils, save_path, df=False):
39
- # save a list of PIL.Image to gif
40
- spf = 4000 / len(pils)
41
- os.makedirs(os.path.dirname(save_path), exist_ok=True)
42
- pils[0].save(save_path, format="GIF", save_all=True, append_images=pils[1:], duration=spf, loop=0)
43
- return save_path
44
-
45
-
46
- class Image2Views():
47
- def __init__(self, device="cuda:0", use_lite=False):
48
- self.device = device
49
- if use_lite:
50
- self.pipe = Hunyuan3d_MVD_Lite_Pipeline.from_pretrained(
51
- "./weights/mvd_lite",
52
- torch_dtype = torch.float16,
53
- use_safetensors = True,
54
- )
55
- else:
56
- self.pipe = HunYuan3D_MVD_Std_Pipeline.from_pretrained(
57
- "./weights/mvd_std",
58
- torch_dtype = torch.float16,
59
- use_safetensors = True,
60
- )
61
- self.pipe = self.pipe.to(device)
62
- self.order = [0, 1, 2, 3, 4, 5] if use_lite else [0, 2, 4, 5, 3, 1]
63
- set_parameter_grad_false(self.pipe.unet)
64
- print('image2views unet model', get_parameter_number(self.pipe.unet))
65
-
66
- @torch.no_grad()
67
- @timing_decorator("image to views")
68
- @auto_amp_inference
69
- def __call__(self, pil_img, seed=0, steps=50, guidance_scale=2.0, guidance_curve=lambda t:2.0):
70
- seed_everything(seed)
71
- generator = torch.Generator(device=self.device)
72
- res_img = self.pipe(pil_img,
73
- num_inference_steps=steps,
74
- guidance_scale=guidance_scale,
75
- guidance_curve=guidance_curve,
76
- generat=generator).images
77
- show_image = rearrange(np.asarray(res_img[0], dtype=np.uint8), '(n h) (m w) c -> (n m) h w c', n=3, m=2)
78
- pils = [res_img[1]]+[Image.fromarray(show_image[idx]) for idx in self.order]
79
- torch.cuda.empty_cache()
80
- return res_img, pils
81
-
 
 
infer/rembg.py DELETED
@@ -1,26 +0,0 @@
1
- from rembg import remove, new_session
2
- from .utils import timing_decorator
3
-
4
- class Removebg():
5
- def __init__(self, name="u2net"):
6
- '''
7
- name: rembg
8
- '''
9
- self.session = new_session(name)
10
-
11
- @timing_decorator("remove background")
12
- def __call__(self, rgb_img, force=False):
13
- '''
14
- inputs:
15
- rgb_img: PIL.Image, with RGB mode expected
16
- force: bool, input is RGBA mode
17
- return:
18
- rgba_img: PIL.Image with RGBA mode
19
- '''
20
- if rgb_img.mode == "RGBA":
21
- if force:
22
- rgb_img = rgb_img.convert("RGB")
23
- else:
24
- return rgb_img
25
- rgba_img = remove(rgb_img, session=self.session)
26
- return rgba_img
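
The wrapper above is a thin layer over the rembg package; a self-contained usage sketch with the same default model ("input.png" is a placeholder path, not from the repo):

```python
from PIL import Image
from rembg import new_session, remove

session = new_session("u2net")                  # same default model name as Removebg()
img = Image.open("input.png").convert("RGB")    # placeholder input image
rgba = remove(img, session=session)             # PIL image in, RGBA PIL image out
rgba.save("output_nobg.png")
```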
 
infer/text_to_image.py DELETED
@@ -1,80 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import torch
24
- from .utils import seed_everything, timing_decorator, auto_amp_inference
25
- from .utils import get_parameter_number, set_parameter_grad_false
26
- from diffusers import HunyuanDiTPipeline, AutoPipelineForText2Image
27
-
28
- class Text2Image():
29
- def __init__(self, pretrain="weights/hunyuanDiT", device="cuda:0", save_memory=False):
30
- '''
31
- save_memory: if GPU memory is low, can set it
32
- '''
33
- self.save_memory = save_memory
34
- self.device = device
35
- self.pipe = AutoPipelineForText2Image.from_pretrained(
36
- pretrain,
37
- torch_dtype = torch.float16,
38
- enable_pag = True,
39
- pag_applied_layers = ["blocks.(16|17|18|19)"]
40
- )
41
- set_parameter_grad_false(self.pipe.transformer)
42
- print('text2image transformer model', get_parameter_number(self.pipe.transformer))
43
- if not save_memory:
44
- self.pipe = self.pipe.to(device)
45
- self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态,残缺,多余的手指,变异的手," \
46
- "画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学,糟糕的比例,多余的肢体,克隆的脸," \
47
- "毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿,额外的手臂,额外的腿,融合的手指,手指太多,长脖子"
48
-
49
- @torch.no_grad()
50
- @timing_decorator('text to image')
51
- @auto_amp_inference
52
- def __call__(self, *args, **kwargs):
53
- if self.save_memory:
54
- self.pipe = self.pipe.to(self.device)
55
- torch.cuda.empty_cache()
56
- res = self.call(*args, **kwargs)
57
- self.pipe = self.pipe.to("cpu")
58
- else:
59
- res = self.call(*args, **kwargs)
60
- torch.cuda.empty_cache()
61
- return res
62
-
63
- def call(self, prompt, seed=0, steps=25):
64
- '''
65
- inputs:
66
- prompr: str
67
- seed: int
68
- steps: int
69
- return:
70
- rgb: PIL.Image
71
- '''
72
- prompt = prompt + ",白色背景,3D风格,最佳质量"
73
- seed_everything(seed)
74
- generator = torch.Generator(device=self.device)
75
- if seed is not None: generator = generator.manual_seed(int(seed))
76
- rgb = self.pipe(prompt=prompt, negative_prompt=self.neg_txt, num_inference_steps=steps,
77
- pag_scale=1.3, width=1024, height=1024, generator=generator, return_dict=False)[0][0]
78
- torch.cuda.empty_cache()
79
- return rgb
80
-
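
Two notes on the deleted Text2Image class. The hard-coded Chinese strings translate roughly to a prompt suffix of ", white background, 3D style, best quality" and a negative prompt listing the usual quality/anatomy artifact terms (text, close-up, cropped, out of frame, worst/low quality, JPEG artifacts, extra fingers, poorly drawn hands/face, deformed, blurry, bad anatomy, bad proportions, extra limbs, fused fingers, long neck, and so on). Its save_memory branch keeps the pipeline on the CPU between calls and moves it to the GPU only around each generation; a generic sketch of that pattern (my own illustration, where pipe is any diffusers pipeline):

```python
import torch

def low_vram_call(pipe, device, *args, **kwargs):
    # Move the pipeline to the GPU only for the duration of one call,
    # trading latency for a much smaller resident VRAM footprint.
    pipe.to(device)
    torch.cuda.empty_cache()
    try:
        return pipe(*args, **kwargs)
    finally:
        pipe.to("cpu")
        torch.cuda.empty_cache()
```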
 
infer/utils.py DELETED
@@ -1,77 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import os
24
- import time
25
- import random
26
- import numpy as np
27
- import torch
28
- from torch.cuda.amp import autocast, GradScaler
29
- from functools import wraps
30
-
31
- def seed_everything(seed):
32
- '''
33
- seed everthing
34
- '''
35
- random.seed(seed)
36
- np.random.seed(seed)
37
- torch.manual_seed(seed)
38
- os.environ["PL_GLOBAL_SEED"] = str(seed)
39
-
40
- def timing_decorator(category: str):
41
- '''
42
- timing_decorator: record time
43
- '''
44
- def decorator(func):
45
- func.call_count = 0
46
- @wraps(func)
47
- def wrapper(*args, **kwargs):
48
- start_time = time.time()
49
- result = func(*args, **kwargs)
50
- end_time = time.time()
51
- elapsed_time = end_time - start_time
52
- func.call_count += 1
53
- print(f"[HunYuan3D]-[{category}], cost time: {elapsed_time:.4f}s") # huiwen
54
- return result
55
- return wrapper
56
- return decorator
57
-
58
- def auto_amp_inference(func):
59
- '''
60
- with torch.cuda.amp.autocast()"
61
- xxx
62
- '''
63
- @wraps(func)
64
- def wrapper(*args, **kwargs):
65
- with autocast():
66
- output = func(*args, **kwargs)
67
- return output
68
- return wrapper
69
-
70
- def get_parameter_number(model):
71
- total_num = sum(p.numel() for p in model.parameters())
72
- trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
73
- return {'Total': total_num, 'Trainable': trainable_num}
74
-
75
- def set_parameter_grad_false(model):
76
- for p in model.parameters():
77
- p.requires_grad = False
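
How the deleted inference classes combine these helpers (a hypothetical DemoWorker, assuming the pre-commit infer package is importable): torch.no_grad() sits outermost, then the timer, then autocast, so each call is timed, gradient-free, and runs under mixed precision.

```python
import torch
from infer.utils import seed_everything, timing_decorator, auto_amp_inference

class DemoWorker:
    # Hypothetical worker -- illustrates the decorator order only.
    @torch.no_grad()
    @timing_decorator("demo step")
    @auto_amp_inference
    def __call__(self, x, seed=0):
        seed_everything(seed)
        return x * 2
```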
 
infer/views_to_mesh.py DELETED
@@ -1,94 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import os
24
- import time
25
- import torch
26
- import random
27
- import numpy as np
28
- from PIL import Image
29
- from einops import rearrange
30
- from PIL import Image, ImageSequence
31
-
32
- from .utils import seed_everything, timing_decorator, auto_amp_inference
33
- from .utils import get_parameter_number, set_parameter_grad_false
34
- from svrm.predictor import MV23DPredictor
35
-
36
-
37
- class Views2Mesh():
38
- def __init__(self, mv23d_cfg_path, mv23d_ckt_path, device="cuda:0", use_lite=False):
39
- '''
40
- mv23d_cfg_path: config yaml file
41
- mv23d_ckt_path: path to ckpt
42
- use_lite:
43
- '''
44
- self.mv23d_predictor = MV23DPredictor(mv23d_ckt_path, mv23d_cfg_path, device=device)
45
- self.mv23d_predictor.model.eval()
46
- self.order = [0, 1, 2, 3, 4, 5] if use_lite else [0, 2, 4, 5, 3, 1]
47
- set_parameter_grad_false(self.mv23d_predictor.model)
48
- print('view2mesh model', get_parameter_number(self.mv23d_predictor.model))
49
-
50
- @torch.no_grad()
51
- @timing_decorator("views to mesh")
52
- @auto_amp_inference
53
- def __call__(
54
- self,
55
- views_pil=None,
56
- cond_pil=None,
57
- gif_pil=None,
58
- seed=0,
59
- target_face_count = 10000,
60
- do_texture_mapping = True,
61
- save_folder='./outputs/test'
62
- ):
63
- '''
64
- can set views_pil, cond_pil simutaously or set gif_pil only
65
- seed: int
66
- target_face_count: int
67
- save_folder: path to save mesh files
68
- '''
69
- save_dir = save_folder
70
- os.makedirs(save_dir, exist_ok=True)
71
-
72
- if views_pil is not None and cond_pil is not None:
73
- show_image = rearrange(np.asarray(views_pil, dtype=np.uint8),
74
- '(n h) (m w) c -> (n m) h w c', n=3, m=2)
75
- views = [Image.fromarray(show_image[idx]) for idx in self.order]
76
- image_list = [cond_pil]+ views
77
- image_list = [img.convert('RGB') for img in image_list]
78
- elif gif_pil is not None:
79
- image_list = [img.convert('RGB') for img in ImageSequence.Iterator(gif_pil)]
80
-
81
- image_input = image_list[0]
82
- image_list = image_list[1:] + image_list[:1]
83
-
84
- seed_everything(seed)
85
- self.mv23d_predictor.predict(
86
- image_list,
87
- save_dir = save_dir,
88
- image_input = image_input,
89
- target_face_count = target_face_count,
90
- do_texture_mapping = do_texture_mapping
91
- )
92
- torch.cuda.empty_cache()
93
- return save_dir
94
-
 
 
mvd/__init__.py DELETED
File without changes
mvd/hunyuan3d_mvd_lite_pipeline.py DELETED
@@ -1,493 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import math
24
- import numpy
25
- import torch
26
- import inspect
27
- import warnings
28
- from PIL import Image
29
- from einops import rearrange
30
- import torch.nn.functional as F
31
- from diffusers.utils.torch_utils import randn_tensor
32
- from diffusers.configuration_utils import FrozenDict
33
- from diffusers.image_processor import VaeImageProcessor
34
- from typing import Any, Callable, Dict, List, Optional, Union
35
- from diffusers.models import AutoencoderKL, UNet2DConditionModel
36
- from diffusers.schedulers import KarrasDiffusionSchedulers
37
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline
38
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
39
- from diffusers import DDPMScheduler, EulerAncestralDiscreteScheduler, ImagePipelineOutput
40
- from diffusers.loaders import (
41
- FromSingleFileMixin,
42
- LoraLoaderMixin,
43
- TextualInversionLoaderMixin
44
- )
45
- from transformers import (
46
- CLIPImageProcessor,
47
- CLIPTextModel,
48
- CLIPTokenizer,
49
- CLIPVisionModelWithProjection
50
- )
51
- from diffusers.models.attention_processor import (
52
- Attention,
53
- AttnProcessor,
54
- XFormersAttnProcessor,
55
- AttnProcessor2_0
56
- )
57
-
58
- from .utils import to_rgb_image, white_out_background, recenter_img
59
-
60
-
61
- EXAMPLE_DOC_STRING = """
62
- Examples:
63
- ```py
64
- >>> import torch
65
- >>> from here import Hunyuan3d_MVD_Qing_Pipeline
66
-
67
- >>> pipe = Hunyuan3d_MVD_Qing_Pipeline.from_pretrained(
68
- ... "Tencent-Hunyuan-3D/MVD-Qing", torch_dtype=torch.float16
69
- ... )
70
- >>> pipe.to("cuda")
71
-
72
- >>> img = Image.open("demo.png")
73
- >>> res_img = pipe(img).images[0]
74
- """
75
-
76
- def unscale_latents(latents): return latents / 0.75 + 0.22
77
- def unscale_image (image ): return image / 0.50 * 0.80
78
-
79
-
80
- def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
81
- std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
82
- std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
83
- noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
84
- noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
85
- return noise_cfg
86
-
87
-
88
-
89
- class ReferenceOnlyAttnProc(torch.nn.Module):
90
- # reference attention
91
- def __init__(self, chained_proc, enabled=False, name=None):
92
- super().__init__()
93
- self.enabled = enabled
94
- self.chained_proc = chained_proc
95
- self.name = name
96
-
97
- def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, mode="w", ref_dict=None):
98
- if encoder_hidden_states is None: encoder_hidden_states = hidden_states
99
- if self.enabled:
100
- if mode == 'w':
101
- ref_dict[self.name] = encoder_hidden_states
102
- elif mode == 'r':
103
- encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict.pop(self.name)], dim=1)
104
- res = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask)
105
- return res
106
-
107
-
108
- # class RowWiseAttnProcessor2_0:
109
- # def __call__(self, attn,
110
- # hidden_states,
111
- # encoder_hidden_states=None,
112
- # attention_mask=None,
113
- # temb=None,
114
- # num_views=6,
115
- # *args,
116
- # **kwargs):
117
- # residual = hidden_states
118
- # if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, temb)
119
-
120
- # input_ndim = hidden_states.ndim
121
- # if input_ndim == 4:
122
- # batch_size, channel, height, width = hidden_states.shape
123
- # hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
124
-
125
- # if encoder_hidden_states is None:
126
- # batch_size, sequence_length, _ = hidden_states.shape
127
- # else:
128
- # batch_size, sequence_length, _ = encoder_hidden_states.shape
129
-
130
- # if attention_mask is not None:
131
- # attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
132
- # attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
133
- # if attn.group_norm is not None: hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
134
-
135
- # query = attn.to_q(hidden_states)
136
- # if encoder_hidden_states is None: encoder_hidden_states = hidden_states
137
- # elif attn.norm_cross: encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
138
-
139
- # # encoder_hidden_states [B, 6hw+hw, C] if ref att
140
- # key = attn.to_k(encoder_hidden_states) # [B, Vhw+hw, C]
141
- # value = attn.to_v(encoder_hidden_states) # [B, Vhw+hw, C]
142
-
143
- # mv_flag = hidden_states.shape[1] < encoder_hidden_states.shape[1] and encoder_hidden_states.shape[1] != 77
144
- # if mv_flag:
145
- # target_size = int(math.sqrt(hidden_states.shape[1] // num_views))
146
- # assert target_size ** 2 * num_views == hidden_states.shape[1]
147
-
148
- # gen_key = key[:, :num_views*target_size*target_size, :]
149
- # ref_key = key[:, num_views*target_size*target_size:, :]
150
- # gen_value = value[:, :num_views*target_size*target_size, :]
151
- # ref_value = value[:, num_views*target_size*target_size:, :]
152
-
153
- # # rowwise attention
154
- # query, gen_key, gen_value = \
155
- # rearrange( query, "b (v1 h v2 w) c -> (b h) (v1 v2 w) c",
156
- # v1=num_views//2, v2=2, h=target_size, w=target_size), \
157
- # rearrange( gen_key, "b (v1 h v2 w) c -> (b h) (v1 v2 w) c",
158
- # v1=num_views//2, v2=2, h=target_size, w=target_size), \
159
- # rearrange(gen_value, "b (v1 h v2 w) c -> (b h) (v1 v2 w) c",
160
- # v1=num_views//2, v2=2, h=target_size, w=target_size)
161
-
162
- # inner_dim = key.shape[-1]
163
- # ref_size = int(math.sqrt(ref_key.shape[1]))
164
- # ref_key_expanded = ref_key.view(batch_size, 1, ref_size * ref_size, inner_dim)
165
- # ref_key_expanded = ref_key_expanded.expand(-1, target_size, -1, -1).contiguous()
166
- # ref_key_expanded = ref_key_expanded.view(batch_size * target_size, ref_size * ref_size, inner_dim)
167
- # key = torch.cat([ gen_key, ref_key_expanded], dim=1)
168
-
169
- # ref_value_expanded = ref_value.view(batch_size, 1, ref_size * ref_size, inner_dim)
170
- # ref_value_expanded = ref_value_expanded.expand(-1, target_size, -1, -1).contiguous()
171
- # ref_value_expanded = ref_value_expanded.view(batch_size * target_size, ref_size * ref_size, inner_dim)
172
- # value = torch.cat([gen_value, ref_value_expanded], dim=1)
173
- # h = target_size
174
- # else:
175
- # target_size = int(math.sqrt(hidden_states.shape[1]))
176
- # h = 1
177
- # num_views = 1
178
-
179
- # inner_dim = key.shape[-1]
180
- # head_dim = inner_dim // attn.heads
181
-
182
- # query = query.view(batch_size * h, -1, attn.heads, head_dim).transpose(1, 2)
183
- # key = key.view(batch_size * h, -1, attn.heads, head_dim).transpose(1, 2)
184
- # value = value.view(batch_size * h, -1, attn.heads, head_dim).transpose(1, 2)
185
-
186
- # hidden_states = F.scaled_dot_product_attention(query, key, value,
187
- # attn_mask=attention_mask,
188
- # dropout_p=0.0,
189
- # is_causal=False)
190
- # hidden_states = hidden_states.transpose(1, 2).reshape(batch_size * h,
191
- # -1,
192
- # attn.heads * head_dim).to(query.dtype)
193
- # hidden_states = attn.to_out[1](attn.to_out[0](hidden_states))
194
-
195
- # if mv_flag: hidden_states = rearrange(hidden_states, "(b h) (v1 v2 w) c -> b (v1 h v2 w) c",
196
- # b=batch_size, v1=num_views//2,
197
- # v2=2, h=target_size, w=target_size)
198
-
199
- # if input_ndim == 4:
200
- # hidden_states = hidden_states.transpose(-1, -2)
201
- # hidden_states = hidden_states.reshape(batch_size,
202
- # channel,
203
- # target_size,
204
- # target_size)
205
- # if attn.residual_connection: hidden_states = hidden_states + residual
206
- # hidden_states = hidden_states / attn.rescale_output_factor
207
- # return hidden_states
208
-
209
-
210
- class RefOnlyNoisedUNet(torch.nn.Module):
211
- def __init__(self, unet, train_sched, val_sched):
212
- super().__init__()
213
- self.unet = unet
214
- self.train_sched = train_sched
215
- self.val_sched = val_sched
216
-
217
- unet_lora_attn_procs = dict()
218
- for name, _ in unet.attn_processors.items():
219
- unet_lora_attn_procs[name] = ReferenceOnlyAttnProc(AttnProcessor2_0(),
220
- enabled=name.endswith("attn1.processor"),
221
- name=name)
222
- unet.set_attn_processor(unet_lora_attn_procs)
223
-
224
- def __getattr__(self, name: str):
225
- try:
226
- return super().__getattr__(name)
227
- except AttributeError:
228
- return getattr(self.unet, name)
229
-
230
- def forward(self, sample, timestep, encoder_hidden_states, *args, cross_attention_kwargs, **kwargs):
231
- cond_lat = cross_attention_kwargs['cond_lat']
232
- noise = torch.randn_like(cond_lat)
233
- if self.training:
234
- noisy_cond_lat = self.train_sched.add_noise(cond_lat, noise, timestep)
235
- noisy_cond_lat = self.train_sched.scale_model_input(noisy_cond_lat, timestep)
236
- else:
237
- noisy_cond_lat = self.val_sched.add_noise(cond_lat, noise, timestep.reshape(-1))
238
- noisy_cond_lat = self.val_sched.scale_model_input(noisy_cond_lat, timestep.reshape(-1))
239
-
240
- ref_dict = {}
241
- self.unet(noisy_cond_lat,
242
- timestep,
243
- encoder_hidden_states,
244
- *args,
245
- cross_attention_kwargs=dict(mode="w", ref_dict=ref_dict),
246
- **kwargs)
247
- return self.unet(sample,
248
- timestep,
249
- encoder_hidden_states,
250
- *args,
251
- cross_attention_kwargs=dict(mode="r", ref_dict=ref_dict),
252
- **kwargs)
253
-
254
-
255
- class Hunyuan3d_MVD_Lite_Pipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin):
256
- def __init__(
257
- self,
258
- vae: AutoencoderKL,
259
- text_encoder: CLIPTextModel,
260
- tokenizer: CLIPTokenizer,
261
- unet: UNet2DConditionModel,
262
- scheduler: KarrasDiffusionSchedulers,
263
- vision_encoder: CLIPVisionModelWithProjection,
264
- feature_extractor_clip: CLIPImageProcessor,
265
- feature_extractor_vae: CLIPImageProcessor,
266
- ramping_coefficients: Optional[list] = None,
267
- safety_checker=None,
268
- ):
269
- DiffusionPipeline.__init__(self)
270
- self.register_modules(
271
- vae=vae,
272
- unet=unet,
273
- tokenizer=tokenizer,
274
- scheduler=scheduler,
275
- text_encoder=text_encoder,
276
- vision_encoder=vision_encoder,
277
- feature_extractor_vae=feature_extractor_vae,
278
- feature_extractor_clip=feature_extractor_clip)
279
- '''
280
- rewrite the stable diffusion pipeline
281
- vae: vae
282
- unet: unet
283
- tokenizer: tokenizer
284
- scheduler: scheduler
285
- text_encoder: text_encoder
286
- vision_encoder: vision_encoder
287
- feature_extractor_vae: feature_extractor_vae
288
- feature_extractor_clip: feature_extractor_clip
289
- '''
290
- self.register_to_config(ramping_coefficients=ramping_coefficients)
291
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
292
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
293
-
294
- def prepare_extra_step_kwargs(self, generator, eta):
295
- extra_step_kwargs = {}
296
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
297
- if accepts_eta: extra_step_kwargs["eta"] = eta
298
-
299
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
300
- if accepts_generator: extra_step_kwargs["generator"] = generator
301
- return extra_step_kwargs
302
-
303
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
304
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
305
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
306
- latents = latents * self.scheduler.init_noise_sigma
307
- return latents
308
-
309
- @torch.no_grad()
310
- def _encode_prompt(
311
- self,
312
- prompt,
313
- device,
314
- num_images_per_prompt,
315
- do_classifier_free_guidance,
316
- negative_prompt=None,
317
- prompt_embeds: Optional[torch.FloatTensor] = None,
318
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
319
- lora_scale: Optional[float] = None,
320
- ):
321
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
322
- self._lora_scale = lora_scale
323
-
324
- if prompt is not None and isinstance(prompt, str):
325
- batch_size = 1
326
- elif prompt is not None and isinstance(prompt, list):
327
- batch_size = len(prompt)
328
- else:
329
- batch_size = prompt_embeds.shape[0]
330
-
331
- if prompt_embeds is None:
332
- if isinstance(self, TextualInversionLoaderMixin):
333
- prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
334
-
335
- text_inputs = self.tokenizer(
336
- prompt,
337
- padding="max_length",
338
- max_length=self.tokenizer.model_max_length,
339
- truncation=True,
340
- return_tensors="pt",
341
- )
342
- text_input_ids = text_inputs.input_ids
343
-
344
- if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
345
- attention_mask = text_inputs.attention_mask.to(device)
346
- else:
347
- attention_mask = None
348
-
349
- prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)[0]
350
-
351
- if self.text_encoder is not None:
352
- prompt_embeds_dtype = self.text_encoder.dtype
353
- elif self.unet is not None:
354
- prompt_embeds_dtype = self.unet.dtype
355
- else:
356
- prompt_embeds_dtype = prompt_embeds.dtype
357
-
358
- prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
359
- bs_embed, seq_len, _ = prompt_embeds.shape
360
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
361
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
362
-
363
- if do_classifier_free_guidance and negative_prompt_embeds is None:
364
- uncond_tokens: List[str]
365
- if negative_prompt is None: uncond_tokens = [""] * batch_size
366
- elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError()
367
- elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt]
368
- elif batch_size != len(negative_prompt): raise ValueError()
369
- else: uncond_tokens = negative_prompt
370
- if isinstance(self, TextualInversionLoaderMixin):
371
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
372
-
373
- max_length = prompt_embeds.shape[1]
374
- uncond_input = self.tokenizer(uncond_tokens,
375
- padding="max_length",
376
- max_length=max_length,
377
- truncation=True,
378
- return_tensors="pt")
379
-
380
- if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
381
- attention_mask = uncond_input.attention_mask.to(device)
382
- else:
383
- attention_mask = None
384
-
385
- negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(device), attention_mask=attention_mask)
386
- negative_prompt_embeds = negative_prompt_embeds[0]
387
-
388
- if do_classifier_free_guidance:
389
- seq_len = negative_prompt_embeds.shape[1]
390
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
391
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
392
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
393
- prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
394
-
395
- return prompt_embeds
396
-
397
- @torch.no_grad()
398
- def encode_condition_image(self, image: torch.Tensor): return self.vae.encode(image).latent_dist.sample()
399
-
400
- @torch.no_grad()
401
- def __call__(self, image=None,
402
- width=640,
403
- height=960,
404
- num_inference_steps=75,
405
- return_dict=True,
406
- generator=None,
407
- **kwargs):
408
- batch_size = 1
409
- num_images_per_prompt = 1
410
- output_type = 'pil'
411
- do_classifier_free_guidance = True
412
- guidance_rescale = 0.
413
- if isinstance(self.unet, UNet2DConditionModel):
414
- self.unet = RefOnlyNoisedUNet(self.unet, None, self.scheduler).eval()
415
-
416
- cond_image = recenter_img(image)
417
- cond_image = to_rgb_image(image)
418
- image = cond_image
419
- image_1 = self.feature_extractor_vae(images=image, return_tensors="pt").pixel_values
420
- image_2 = self.feature_extractor_clip(images=image, return_tensors="pt").pixel_values
421
- image_1 = image_1.to(device=self.vae.device, dtype=self.vae.dtype)
422
- image_2 = image_2.to(device=self.vae.device, dtype=self.vae.dtype)
423
-
424
- cond_lat = self.encode_condition_image(image_1)
425
- negative_lat = self.encode_condition_image(torch.zeros_like(image_1))
426
- cond_lat = torch.cat([negative_lat, cond_lat])
427
- cross_attention_kwargs = dict(cond_lat=cond_lat)
428
-
429
- global_embeds = self.vision_encoder(image_2, output_hidden_states=False).image_embeds.unsqueeze(-2)
430
- encoder_hidden_states = self._encode_prompt('', self.device, num_images_per_prompt, False)
431
- ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1)
432
- prompt_embeds = torch.cat([encoder_hidden_states, encoder_hidden_states + global_embeds * ramp])
433
-
434
- device = self._execution_device
435
- self.scheduler.set_timesteps(num_inference_steps, device=device)
436
- timesteps = self.scheduler.timesteps
437
- num_channels_latents = self.unet.config.in_channels
438
- latents = self.prepare_latents(batch_size * num_images_per_prompt,
439
- num_channels_latents,
440
- height,
441
- width,
442
- prompt_embeds.dtype,
443
- device,
444
- generator,
445
- None)
446
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0)
447
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
448
-
449
- # set adaptive cfg
450
- # the image order is:
451
- # [0, 60,
452
- # 120, 180,
453
- # 240, 300]
454
- # the cfg is set as 3, 2.5, 2, 1.5
455
-
456
- tmp_guidance_scale = torch.ones_like(latents)
457
- tmp_guidance_scale[:, :, :40, :40] = 3
458
- tmp_guidance_scale[:, :, :40, 40:] = 2.5
459
- tmp_guidance_scale[:, :, 40:80, :40] = 2
460
- tmp_guidance_scale[:, :, 40:80, 40:] = 1.5
461
- tmp_guidance_scale[:, :, 80:120, :40] = 2
462
- tmp_guidance_scale[:, :, 80:120, 40:] = 2.5
463
-
464
- with self.progress_bar(total=num_inference_steps) as progress_bar:
465
- for i, t in enumerate(timesteps):
466
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
467
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
468
-
469
- noise_pred = self.unet(latent_model_input, t,
470
- encoder_hidden_states=prompt_embeds,
471
- cross_attention_kwargs=cross_attention_kwargs,
472
- return_dict=False)[0]
473
-
474
- adaptive_guidance_scale = (2 + 16 * (t / 1000) ** 5) / 3
475
- if do_classifier_free_guidance:
476
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
477
- noise_pred = noise_pred_uncond + \
478
- tmp_guidance_scale * adaptive_guidance_scale * \
479
- (noise_pred_text - noise_pred_uncond)
480
-
481
- if do_classifier_free_guidance and guidance_rescale > 0.0:
482
- noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
483
-
484
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
485
- if i==len(timesteps)-1 or ((i+1)>num_warmup_steps and (i+1)%self.scheduler.order==0):
486
- progress_bar.update()
487
-
488
- latents = unscale_latents(latents)
489
- image = unscale_image(self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0])
490
- image = self.image_processor.postprocess(image, output_type='pil')[0]
491
- image = [image, cond_image]
492
- return ImagePipelineOutput(images=image) if return_dict else (image,)
493
-
 
 
mvd/hunyuan3d_mvd_std_pipeline.py DELETED
@@ -1,471 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the respective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import inspect
24
- from typing import Any, Dict, Optional
25
- from typing import Any, Dict, List, Optional, Tuple, Union
26
-
27
- import os
28
- import torch
29
- import numpy as np
30
- from PIL import Image
31
-
32
- import diffusers
33
- from diffusers.image_processor import VaeImageProcessor
34
- from diffusers.utils.import_utils import is_xformers_available
35
- from diffusers.schedulers import KarrasDiffusionSchedulers
36
- from diffusers.utils.torch_utils import randn_tensor
37
- from diffusers.utils.import_utils import is_xformers_available
38
- from diffusers.models.attention_processor import (
39
- Attention,
40
- AttnProcessor,
41
- XFormersAttnProcessor,
42
- AttnProcessor2_0
43
- )
44
- from diffusers import (
45
- AutoencoderKL,
46
- DDPMScheduler,
47
- DiffusionPipeline,
48
- EulerAncestralDiscreteScheduler,
49
- UNet2DConditionModel,
50
- ImagePipelineOutput
51
- )
52
- import transformers
53
- from transformers import (
54
- CLIPImageProcessor,
55
- CLIPTextModel,
56
- CLIPTokenizer,
57
- CLIPVisionModelWithProjection,
58
- CLIPTextModelWithProjection
59
- )
60
-
61
- from .utils import to_rgb_image, white_out_background, recenter_img
62
-
63
- EXAMPLE_DOC_STRING = """
64
- Examples:
65
- ```py
66
- >>> import torch
- >>> from PIL import Image
67
- >>> from mvd.hunyuan3d_mvd_std_pipeline import HunYuan3D_MVD_Std_Pipeline
68
-
69
- >>> pipe = HunYuan3D_MVD_Std_Pipeline.from_pretrained(
70
- ... "Tencent-Hunyuan-3D/MVD-XL", torch_dtype=torch.float16
71
- ... )
72
- >>> pipe.to("cuda")
73
-
74
- >>> img = Image.open("demo.png")
75
- >>> res_img = pipe(img).images[0]
76
- ```
77
- """
78
-
79
-
80
-
81
- def scale_latents(latents): return (latents - 0.22) * 0.75
82
- def unscale_latents(latents): return (latents / 0.75) + 0.22
83
- def scale_image(image): return (image - 0.5) / 0.5
84
- def scale_image_2(image): return (image * 0.5) / 0.8
85
- def unscale_image(image): return (image * 0.5) + 0.5
86
- def unscale_image_2(image): return (image * 0.8) / 0.5
87
-
88
-
89
-
90
-
91
- class ReferenceOnlyAttnProc(torch.nn.Module):
92
- def __init__(self, chained_proc, enabled=False, name=None):
93
- super().__init__()
94
- self.enabled = enabled
95
- self.chained_proc = chained_proc
96
- self.name = name
97
-
98
- def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, mode="w", ref_dict=None):
99
- encoder_hidden_states = hidden_states if encoder_hidden_states is None else encoder_hidden_states
100
- if self.enabled:
101
- if mode == 'w': ref_dict[self.name] = encoder_hidden_states
102
- elif mode == 'r': encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict.pop(self.name)], dim=1)
103
- else: raise Exception(f"mode should not be {mode}")
104
- return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask)
105
-
106
-
107
- class RefOnlyNoisedUNet(torch.nn.Module):
108
- def __init__(self, unet, scheduler) -> None:
109
- super().__init__()
110
- self.unet = unet
111
- self.scheduler = scheduler
112
-
113
- unet_attn_procs = dict()
114
- for name, _ in unet.attn_processors.items():
115
- if torch.__version__ >= '2.0': default_attn_proc = AttnProcessor2_0()
116
- elif is_xformers_available(): default_attn_proc = XFormersAttnProcessor()
117
- else: default_attn_proc = AttnProcessor()
118
- unet_attn_procs[name] = ReferenceOnlyAttnProc(
119
- default_attn_proc, enabled=name.endswith("attn1.processor"), name=name
120
- )
121
- unet.set_attn_processor(unet_attn_procs)
122
-
123
- def __getattr__(self, name: str):
124
- try:
125
- return super().__getattr__(name)
126
- except AttributeError:
127
- return getattr(self.unet, name)
128
-
129
- def forward(
130
- self,
131
- sample: torch.FloatTensor,
132
- timestep: Union[torch.Tensor, float, int],
133
- encoder_hidden_states: torch.Tensor,
134
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
135
- class_labels: Optional[torch.Tensor] = None,
136
- down_block_res_samples: Optional[Tuple[torch.Tensor]] = None,
137
- mid_block_res_sample: Optional[Tuple[torch.Tensor]] = None,
138
- added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
139
- return_dict: bool = True,
140
- **kwargs
141
- ):
142
-
143
- dtype = self.unet.dtype
144
-
145
- # cond_lat add same level noise
146
- cond_lat = cross_attention_kwargs['cond_lat']
147
- noise = torch.randn_like(cond_lat)
148
-
149
- noisy_cond_lat = self.scheduler.add_noise(cond_lat, noise, timestep.reshape(-1))
150
- noisy_cond_lat = self.scheduler.scale_model_input(noisy_cond_lat, timestep.reshape(-1))
151
-
152
- ref_dict = {}
153
-
154
- _ = self.unet(
155
- noisy_cond_lat,
156
- timestep,
157
- encoder_hidden_states = encoder_hidden_states,
158
- class_labels = class_labels,
159
- cross_attention_kwargs = dict(mode="w", ref_dict=ref_dict),
160
- added_cond_kwargs = added_cond_kwargs,
161
- return_dict = return_dict,
162
- **kwargs
163
- )
164
-
165
- res = self.unet(
166
- sample,
167
- timestep,
168
- encoder_hidden_states,
169
- class_labels=class_labels,
170
- cross_attention_kwargs = dict(mode="r", ref_dict=ref_dict),
171
- down_block_additional_residuals = [
172
- sample.to(dtype=dtype) for sample in down_block_res_samples
173
- ] if down_block_res_samples is not None else None,
174
- mid_block_additional_residual = (
175
- mid_block_res_sample.to(dtype=dtype)
176
- if mid_block_res_sample is not None else None),
177
- added_cond_kwargs = added_cond_kwargs,
178
- return_dict = return_dict,
179
- **kwargs
180
- )
181
- return res
182
-
183
-
184
-
185
- class HunYuan3D_MVD_Std_Pipeline(diffusers.DiffusionPipeline):
186
- def __init__(
187
- self,
188
- vae: AutoencoderKL,
189
- unet: UNet2DConditionModel,
190
- scheduler: KarrasDiffusionSchedulers,
191
- feature_extractor_vae: CLIPImageProcessor,
192
- vision_processor: CLIPImageProcessor,
193
- vision_encoder: CLIPVisionModelWithProjection,
194
- vision_encoder_2: CLIPVisionModelWithProjection,
195
- ramping_coefficients: Optional[list] = None,
196
- add_watermarker: Optional[bool] = None,
197
- safety_checker = None,
198
- ):
199
- DiffusionPipeline.__init__(self)
200
-
201
- self.register_modules(
202
- vae=vae, unet=unet, scheduler=scheduler, safety_checker=None, feature_extractor_vae=feature_extractor_vae,
203
- vision_processor=vision_processor, vision_encoder=vision_encoder, vision_encoder_2=vision_encoder_2,
204
- )
205
- self.register_to_config( ramping_coefficients = ramping_coefficients)
206
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
207
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
208
- self.default_sample_size = self.unet.config.sample_size
209
- self.watermark = None
210
- self.prepare_init = False
211
-
212
- def prepare(self):
213
- assert isinstance(self.unet, UNet2DConditionModel), "unet should be UNet2DConditionModel"
214
- self.unet = RefOnlyNoisedUNet(self.unet, self.scheduler).eval()
215
- self.prepare_init = True
216
-
217
- def encode_image(self, image: torch.Tensor, scale_factor: bool = False):
218
- latent = self.vae.encode(image).latent_dist.sample()
219
- return (latent * self.vae.config.scaling_factor) if scale_factor else latent
220
-
221
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
222
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
223
- shape = (
224
- batch_size,
225
- num_channels_latents,
226
- int(height) // self.vae_scale_factor,
227
- int(width) // self.vae_scale_factor,
228
- )
229
- if isinstance(generator, list) and len(generator) != batch_size:
230
- raise ValueError(
231
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
232
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
233
- )
234
-
235
- if latents is None:
236
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
237
- else:
238
- latents = latents.to(device)
239
-
240
- # scale the initial noise by the standard deviation required by the scheduler
241
- latents = latents * self.scheduler.init_noise_sigma
242
- return latents
243
-
244
- def _get_add_time_ids(
245
- self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
246
- ):
247
- add_time_ids = list(original_size + crops_coords_top_left + target_size)
248
-
249
- passed_add_embed_dim = (
250
- self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
251
- )
252
- expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
253
-
254
- if expected_add_embed_dim != passed_add_embed_dim:
255
- raise ValueError(
256
- f"Model expects an added time embedding vector of length {expected_add_embed_dim}, " \
257
- f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config." \
258
- f" Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
259
- )
260
-
261
- add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
262
- return add_time_ids
263
-
264
- def prepare_extra_step_kwargs(self, generator, eta):
265
- # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
266
- # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
267
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
268
- # and should be between [0, 1]
269
-
270
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
271
- extra_step_kwargs = {}
272
- if accepts_eta: extra_step_kwargs["eta"] = eta
273
-
274
- # check if the scheduler accepts generator
275
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
276
- if accepts_generator: extra_step_kwargs["generator"] = generator
277
- return extra_step_kwargs
278
-
279
- @property
280
- def guidance_scale(self):
281
- return self._guidance_scale
282
-
283
- @property
284
- def interrupt(self):
285
- return self._interrupt
286
-
287
- @property
288
- def do_classifier_free_guidance(self):
289
- return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
290
-
291
- @torch.no_grad()
292
- def __call__(
293
- self,
294
- image: Image.Image = None,
295
- guidance_scale = 2.0,
296
- output_type: Optional[str] = "pil",
297
- num_inference_steps: int = 50,
298
- return_dict: bool = True,
299
- eta: float = 0.0,
300
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
301
- crops_coords_top_left: Tuple[int, int] = (0, 0),
302
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
303
- latent: torch.Tensor = None,
304
- guidance_curve = None,
305
- **kwargs
306
- ):
307
- if not self.prepare_init:
308
- self.prepare()
309
-
310
- here = dict(device=self.vae.device, dtype=self.vae.dtype)
311
-
312
- batch_size = 1
313
- num_images_per_prompt = 1
314
- width, height = 512 * 2, 512 * 3
315
- target_size = original_size = (height, width)
316
-
317
- self._guidance_scale = guidance_scale
318
- self._cross_attention_kwargs = cross_attention_kwargs
319
- self._interrupt = False
320
-
321
- device = self._execution_device
322
-
323
- # Prepare timesteps
324
- self.scheduler.set_timesteps(num_inference_steps, device=device)
325
- timesteps = self.scheduler.timesteps
326
-
327
- # Prepare latent variables
328
- num_channels_latents = self.unet.config.in_channels
329
- latents = self.prepare_latents(
330
- batch_size * num_images_per_prompt,
331
- num_channels_latents,
332
- height,
333
- width,
334
- self.vae.dtype,
335
- device,
336
- generator,
337
- latents=latent,
338
- )
339
-
340
- # Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
341
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
342
-
343
-
344
- # Prepare added time ids & embeddings
345
- text_encoder_projection_dim = 1280
346
- add_time_ids = self._get_add_time_ids(
347
- original_size,
348
- crops_coords_top_left,
349
- target_size,
350
- dtype=self.vae.dtype,
351
- text_encoder_projection_dim=text_encoder_projection_dim,
352
- )
353
- negative_add_time_ids = add_time_ids
354
-
355
- # hw: preprocess
356
- cond_image = recenter_img(image)
357
- cond_image = to_rgb_image(image)
358
- image_vae = self.feature_extractor_vae(images=cond_image, return_tensors="pt").pixel_values.to(**here)
359
- image_clip = self.vision_processor(images=cond_image, return_tensors="pt").pixel_values.to(**here)
360
-
361
- # hw: get cond_lat from cond_img using vae
362
- cond_lat = self.encode_image(image_vae, scale_factor=False)
363
- negative_lat = self.encode_image(torch.zeros_like(image_vae), scale_factor=False)
364
- cond_lat = torch.cat([negative_lat, cond_lat])
365
-
366
- # hw: get visual global embedding using clip
367
- global_embeds_1 = self.vision_encoder(image_clip, output_hidden_states=False).image_embeds.unsqueeze(-2)
368
- global_embeds_2 = self.vision_encoder_2(image_clip, output_hidden_states=False).image_embeds.unsqueeze(-2)
369
- global_embeds = torch.concat([global_embeds_1, global_embeds_2], dim=-1)
370
-
371
- ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1)
372
- prompt_embeds = self.uc_text_emb.to(**here)
373
- pooled_prompt_embeds = self.uc_text_emb_2.to(**here)
374
-
375
- prompt_embeds = prompt_embeds + global_embeds * ramp
376
- add_text_embeds = pooled_prompt_embeds
377
-
378
- if self.do_classifier_free_guidance:
379
- negative_prompt_embeds = torch.zeros_like(prompt_embeds)
380
- negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
381
- prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
382
- add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
383
- add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
384
-
385
- prompt_embeds = prompt_embeds.to(device)
386
- add_text_embeds = add_text_embeds.to(device)
387
- add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
388
-
389
- # Denoising loop
390
- num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
391
- timestep_cond = None
392
- self._num_timesteps = len(timesteps)
393
-
394
- if guidance_curve is None:
395
- guidance_curve = lambda t: guidance_scale
396
-
397
- with self.progress_bar(total=num_inference_steps) as progress_bar:
398
- for i, t in enumerate(timesteps):
399
- if self.interrupt:
400
- continue
401
-
402
- # expand the latents if we are doing classifier free guidance
403
- latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
404
-
405
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
406
-
407
- # predict the noise residual
408
- added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
409
-
410
- noise_pred = self.unet(
411
- latent_model_input,
412
- t,
413
- encoder_hidden_states=prompt_embeds,
414
- timestep_cond=timestep_cond,
415
- cross_attention_kwargs=dict(cond_lat=cond_lat),
416
- added_cond_kwargs=added_cond_kwargs,
417
- return_dict=False,
418
- )[0]
419
-
420
- # perform guidance
421
-
422
- # cur_guidance_scale = self.guidance_scale
423
- cur_guidance_scale = guidance_curve(t) # 1.5 + 2.5 * ((t/1000)**2)
424
-
425
- if self.do_classifier_free_guidance:
426
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
427
- noise_pred = noise_pred_uncond + cur_guidance_scale * (noise_pred_text - noise_pred_uncond)
428
-
429
- # cur_guidance_scale_topleft = (cur_guidance_scale - 1.0) * 4 + 1.0
430
- # noise_pred_top_left = noise_pred_uncond +
431
- # cur_guidance_scale_topleft * (noise_pred_text - noise_pred_uncond)
432
- # _, _, h, w = noise_pred.shape
433
- # noise_pred[:, :, :h//3, :w//2] = noise_pred_top_left[:, :, :h//3, :w//2]
434
-
435
- # compute the previous noisy sample x_t -> x_t-1
436
- latents_dtype = latents.dtype
437
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
438
-
439
- # call the callback, if provided
440
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
441
- progress_bar.update()
442
-
443
- latents = unscale_latents(latents)
444
-
445
- if output_type=="latent":
446
- image = latents
447
- else:
448
- image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
449
- image = unscale_image(unscale_image_2(image)).clamp(0, 1)
450
- image = [
451
- Image.fromarray((image[0]*255+0.5).clamp_(0, 255).permute(1, 2, 0).cpu().numpy().astype("uint8")),
452
- # self.image_processor.postprocess(image, output_type=output_type)[0],
453
- cond_image.resize((512, 512))
454
- ]
455
-
456
- if not return_dict: return (image,)
457
- return ImagePipelineOutput(images=image)
458
-
459
- def save_pretrained(self, save_directory):
460
- # uc_text_emb.pt and uc_text_emb_2.pt are precomputed offline and saved alongside the checkpoint
461
- super().save_pretrained(save_directory)
462
- torch.save(self.uc_text_emb, os.path.join(save_directory, "uc_text_emb.pt"))
463
- torch.save(self.uc_text_emb_2, os.path.join(save_directory, "uc_text_emb_2.pt"))
464
-
465
- @classmethod
466
- def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
467
- # uc_text_emb.pt and uc_text_emb_2.pt are precomputed offline and saved alongside the checkpoint
468
- pipeline = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
469
- pipeline.uc_text_emb = torch.load(os.path.join(pretrained_model_name_or_path, "uc_text_emb.pt"))
470
- pipeline.uc_text_emb_2 = torch.load(os.path.join(pretrained_model_name_or_path, "uc_text_emb_2.pt"))
471
- return pipeline
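For reference, a hypothetical end-to-end call of the standard pipeline deleted above; the checkpoint directory and input path are assumptions, but the class name, module path, and return layout (a list of [multi-view grid, resized condition image]) follow the code shown here.

```python
# Hypothetical usage sketch for HunYuan3D_MVD_Std_Pipeline (paths are assumptions).
import torch
from PIL import Image
from mvd.hunyuan3d_mvd_std_pipeline import HunYuan3D_MVD_Std_Pipeline

pipe = HunYuan3D_MVD_Std_Pipeline.from_pretrained(
    "./weights/mvd_std",            # assumed local checkpoint dir holding uc_text_emb*.pt
    torch_dtype=torch.float16,
).to("cuda")

cond = Image.open("./demos/example_000.png")      # RGBA input; recentering uses the alpha mask
views, cond_resized = pipe(cond, guidance_scale=2.0, num_inference_steps=50).images
views.save("./outputs/test/views.png")            # 1024x1536 grid of the six generated views
```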
 
mvd/utils.py DELETED
@@ -1,85 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the respective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import numpy as np
24
- from PIL import Image
25
-
26
- def to_rgb_image(maybe_rgba: Image.Image):
27
- '''
28
- convert a PIL.Image to rgb mode with white background
29
- maybe_rgba: PIL.Image
30
- return: PIL.Image
31
- '''
32
- if maybe_rgba.mode == 'RGB':
33
- return maybe_rgba
34
- elif maybe_rgba.mode == 'RGBA':
35
- rgba = maybe_rgba
36
- img = np.random.randint(255, 256, size=[rgba.size[1], rgba.size[0], 3], dtype=np.uint8)
37
- img = Image.fromarray(img, 'RGB')
38
- img.paste(rgba, mask=rgba.getchannel('A'))
39
- return img
40
- else:
41
- raise ValueError("Unsupported image type.", maybe_rgba.mode)
42
-
43
- def white_out_background(pil_img, is_gray_fg=True):
44
- data = pil_img.getdata()
45
- new_data = []
46
- # convert fore-ground white to gray
47
- for r, g, b, a in data:
48
- if a < 16:
49
- new_data.append((255, 255, 255, 0)) # low-alpha pixels become transparent white background
50
- else:
51
- is_white = is_gray_fg and (r>235) and (g>235) and (b>235)
52
- new_r = 235 if is_white else r
53
- new_g = 235 if is_white else g
54
- new_b = 235 if is_white else b
55
- new_data.append((new_r, new_g, new_b, a))
56
- pil_img.putdata(new_data)
57
- return pil_img
58
-
59
- def recenter_img(img, size=512, color=(255,255,255)):
60
- img = white_out_background(img)
61
- mask = np.array(img)[..., 3]
62
- image = np.array(img)[..., :3]
63
-
64
- H, W, C = image.shape
65
- coords = np.nonzero(mask)
66
- x_min, x_max = coords[0].min(), coords[0].max()
67
- y_min, y_max = coords[1].min(), coords[1].max()
68
- h = x_max - x_min
69
- w = y_max - y_min
70
- if h == 0 or w == 0: raise ValueError
71
- roi = image[x_min:x_max, y_min:y_max]
72
-
73
- border_ratio = 0.15 # 0.2
74
- pad_h = int(h * border_ratio)
75
- pad_w = int(w * border_ratio)
76
-
77
- result_tmp = np.full((h + pad_h, w + pad_w, C), color, dtype=np.uint8)
78
- result_tmp[pad_h // 2: pad_h // 2 + h, pad_w // 2: pad_w // 2 + w] = roi
79
-
80
- cur_h, cur_w = result_tmp.shape[:2]
81
- side = max(cur_h, cur_w)
82
- result = np.full((side, side, C), color, dtype=np.uint8)
83
- result[(side-cur_h)//2:(side-cur_h)//2+cur_h, (side-cur_w)//2:(side - cur_w)//2+cur_w,:] = result_tmp
84
- result = Image.fromarray(result)
85
- return result.resize((size, size), Image.LANCZOS) if size else result
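The three helpers above form the image-preprocessing path of the multi-view pipelines: white_out_background flattens near-white foreground and clears low-alpha pixels, recenter_img crops to the alpha bounding box, pads by roughly 15% and squares the result, and to_rgb_image composites RGBA onto white. A minimal sketch of chaining them on a raw RGBA input (the file path is an assumption):

```python
# Sketch of the preprocessing chain in mvd/utils.py; the input path is an assumption.
from PIL import Image
from mvd.utils import recenter_img, to_rgb_image

rgba = Image.open("./demos/example_000.png").convert("RGBA")
recentered = recenter_img(rgba, size=512)   # crop to alpha bbox, pad ~15%, resize to 512x512 (RGB)
rgb = to_rgb_image(rgba)                    # direct RGBA -> RGB composite on a white background
```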
 
requirements.txt DELETED
@@ -1,22 +0,0 @@
1
- --find-links https://download.pytorch.org/whl/cu118
2
- torch==2.2.0
3
- torchvision==0.17.0
4
- diffusers
5
- transformers
6
- rembg
7
- tqdm
8
- omegaconf
9
- matplotlib
10
- opencv-python
11
- imageio
12
- jaxtyping
13
- einops
14
- SentencePiece
15
- accelerate
16
- trimesh
17
- PyMCubes
18
- xatlas
19
- libigl
20
- git+https://github.com/facebookresearch/pytorch3d@stable
21
- git+https://github.com/NVlabs/nvdiffrast
22
- open3d
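The pins above target the cu118 wheel index, so torch 2.2.0 / torchvision 0.17.0 are expected to report a CUDA 11.8 build. A quick sanity check (illustrative, not part of the repository):

```python
# Environment sanity check for the pinned stack in requirements.txt.
import torch

assert torch.__version__.startswith("2.2.0"), torch.__version__
print("CUDA available:", torch.cuda.is_available())
print("CUDA build:", torch.version.cuda)   # the cu118 wheels report "11.8"
```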
 
scripts/image_to_3d.sh DELETED
@@ -1,8 +0,0 @@
1
- # image to 3d
2
-
3
- python main.py \
4
- --image_prompt ./demos/example_000.png \
5
- --save_folder ./outputs/test/ \
6
- --max_faces_num 90000 \
7
- --do_texture \
8
- --do_render
 
scripts/image_to_3d_demo.sh DELETED
@@ -1,8 +0,0 @@
1
- # image to 3d
2
-
3
- python main.py \
4
- --image_prompt ./demos/example_000.png \
5
- --save_folder ./outputs/test/ \
6
- --max_faces_num 90000 \
7
- --do_texture_mapping \
8
- --do_render
 
scripts/image_to_3d_fast.sh DELETED
@@ -1,6 +0,0 @@
1
- # image to 3d fast
2
- python main.py \
3
- --image_prompt ./demos/example_000.png \
4
- --save_folder ./outputs/test/ \
5
- --max_faces_num 10000 \
6
- --use_lite
 
scripts/image_to_3d_fast_demo.sh DELETED
@@ -1,6 +0,0 @@
1
- # image to 3d fast
2
- python main.py \
3
- --image_prompt ./demos/example_000.png \
4
- --save_folder ./outputs/test/ \
5
- --max_faces_num 10000 \
6
- --use_lite
 
scripts/text_to_3d.sh DELETED
@@ -1,7 +0,0 @@
1
- # text to 3d
2
- python main.py \
3
- --text_prompt "a lovely cat" \
4
- --save_folder ./outputs/test/ \
5
- --max_faces_num 90000 \
6
- --do_texture \
7
- --do_render
 
scripts/text_to_3d_demo.sh DELETED
@@ -1,7 +0,0 @@
1
- # text to 3d
2
- python main.py \
3
- --text_prompt "a lovely rabbit" \
4
- --save_folder ./outputs/test/ \
5
- --max_faces_num 90000 \
6
- --do_texture_mapping \
7
- --do_render
 
scripts/text_to_3d_fast.sh DELETED
@@ -1,6 +0,0 @@
1
- # text to 3d fast
2
- python main.py \
3
- --text_prompt "a Cantonese-style teacup" \
4
- --save_folder ./outputs/test/ \
5
- --max_faces_num 10000 \
6
- --use_lite
 
scripts/text_to_3d_fast_demo.sh DELETED
@@ -1,6 +0,0 @@
1
- # text to 3d fast
2
- python main.py \
3
- --text_prompt "a Cantonese-style teacup" \
4
- --save_folder ./outputs/test/ \
5
- --max_faces_num 10000 \
6
- --use_lite
 
svrm/.DS_Store DELETED
Binary file (6.15 kB)
 
svrm/configs/2024-10-24T22-36-18-project.yaml DELETED
@@ -1,32 +0,0 @@
1
- model:
2
- base_learning_rate: 3.0e-05
3
- target: svrm.ldm.models.svrm.SVRMModel
4
- params:
5
-
6
- img_encoder_config:
7
- target: svrm.ldm.modules.encoders.dinov2_mod.FrozenDinoV2ImageEmbedder
8
- params:
9
- version: dinov2_vitb14
10
-
11
- img_to_triplane_config:
12
- target: svrm.ldm.modules.translator.img_to_triplane.ImgToTriplaneModel
13
- params:
14
- pos_emb_size: 64
15
- pos_emb_dim: 1024
16
- cam_cond_dim: 20
17
- n_heads: 16
18
- d_head: 64
19
- depth: 16
20
- context_dim: 768
21
- triplane_dim: 120
22
- use_fp16: true
23
- use_bf16: false
24
- upsample_time: 2
25
-
26
- render_config:
27
- target: svrm.ldm.modules.rendering_neus.synthesizer.TriplaneSynthesizer
28
- params:
29
- triplane_dim: 120
30
- samples_per_ray: 128
31
-
32
-
 
svrm/configs/svrm.yaml DELETED
@@ -1,32 +0,0 @@
1
- model:
2
- base_learning_rate: 3.0e-05
3
- target: svrm.ldm.models.svrm.SVRMModel
4
- params:
5
-
6
- img_encoder_config:
7
- target: svrm.ldm.modules.encoders.dinov2_mod.FrozenDinoV2ImageEmbedder
8
- params:
9
- version: dinov2_vitb14
10
-
11
- img_to_triplane_config:
12
- target: svrm.ldm.modules.translator.img_to_triplane.ImgToTriplaneModel
13
- params:
14
- pos_emb_size: 64
15
- pos_emb_dim: 1024
16
- cam_cond_dim: 20
17
- n_heads: 16
18
- d_head: 64
19
- depth: 16
20
- context_dim: 768
21
- triplane_dim: 120
22
- use_fp16: true
23
- use_bf16: false
24
- upsample_time: 2
25
-
26
- render_config:
27
- target: svrm.ldm.modules.rendering_neus.synthesizer.TriplaneSynthesizer
28
- params:
29
- triplane_dim: 120
30
- samples_per_ray: 128
31
-
32
-
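Both configs above (the timestamped project file and svrm.yaml are identical) describe the SVRM model as nested target/params pairs that are resolved at runtime by instantiate_from_config, imported in svrm/ldm/models/svrm.py from ..util. A minimal sketch of that resolution pattern, written from the common latent-diffusion convention rather than copied from the deleted util module:

```python
# Sketch of target/params resolution for the configs above (mirrors the usual
# latent-diffusion helper; not a copy of the deleted svrm/ldm/util.py).
import importlib
from omegaconf import OmegaConf

def get_obj_from_str(path: str):
    module, cls = path.rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)

def instantiate_from_config(config):
    return get_obj_from_str(config["target"])(**config.get("params", dict()))

cfg = OmegaConf.load("svrm/configs/svrm.yaml")
model = instantiate_from_config(cfg.model)   # SVRMModel, which instantiates its own sub-configs
```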
 
svrm/ldm/.DS_Store DELETED
Binary file (6.15 kB)
 
svrm/ldm/models/svrm.py DELETED
@@ -1,263 +0,0 @@
1
- # Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
2
- # The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
3
-
4
- # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
5
- # The below software and/or models in this distribution may have been
6
- # modified by THL A29 Limited ("Tencent Modifications").
7
- # All Tencent Modifications are Copyright (C) THL A29 Limited.
8
-
9
- # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
10
- # except for the third-party components listed below.
11
- # Hunyuan 3D does not impose any additional limitations beyond what is outlined
12
- # in the repsective licenses of these third-party components.
13
- # Users must comply with all terms and conditions of original licenses of these third-party
14
- # components and must ensure that the usage of the third party components adheres to
15
- # all relevant laws and regulations.
16
-
17
- # For avoidance of doubts, Hunyuan 3D means the large language models and
18
- # their software and algorithms, including trained model weights, parameters (including
19
- # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
20
- # fine-tuning enabling code and other elements of the foregoing made publicly available
21
- # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
22
-
23
- import os
24
- import time
25
- import math
26
- import cv2
27
- import numpy as np
28
- import itertools
29
- import shutil
30
- from tqdm import tqdm
31
- import torch
32
- import torch.nn.functional as F
33
- from einops import rearrange
34
- try:
35
- import trimesh
36
- import mcubes
37
- import xatlas
38
- import open3d as o3d
39
- except ImportError:
40
- raise ImportError("failed to import 3d libraries")
41
-
42
- from ..modules.rendering_neus.mesh import Mesh
43
- from ..modules.rendering_neus.rasterize import NVDiffRasterizerContext
44
-
45
- from ..utils.ops import scale_tensor
46
- from ..util import count_params, instantiate_from_config
47
- from ..vis_util import render
48
-
49
-
50
- def unwrap_uv(v_pos, t_pos_idx):
51
- print("Using xatlas to perform UV unwrapping, may take a while ...")
52
- atlas = xatlas.Atlas()
53
- atlas.add_mesh(v_pos, t_pos_idx)
54
- atlas.generate(xatlas.ChartOptions(), xatlas.PackOptions())
55
- _, indices, uvs = atlas.get_mesh(0)
56
- indices = indices.astype(np.int64, casting="same_kind")
57
- return uvs, indices
58
-
59
-
60
- def uv_padding(image, hole_mask, uv_padding_size = 2):
61
- return cv2.inpaint(
62
- (image.detach().cpu().numpy() * 255).astype(np.uint8),
63
- (hole_mask.detach().cpu().numpy() * 255).astype(np.uint8),
64
- uv_padding_size,
65
- cv2.INPAINT_TELEA
66
- )
67
-
68
- def refine_mesh(vtx_refine, faces_refine):
69
- mesh = o3d.geometry.TriangleMesh(
70
- vertices=o3d.utility.Vector3dVector(vtx_refine),
71
- triangles=o3d.utility.Vector3iVector(faces_refine))
72
-
73
- mesh = mesh.remove_unreferenced_vertices()
74
- mesh = mesh.remove_duplicated_triangles()
75
- mesh = mesh.remove_duplicated_vertices()
76
-
77
- voxel_size = max(mesh.get_max_bound() - mesh.get_min_bound())
78
-
79
- mesh = mesh.simplify_vertex_clustering(
80
- voxel_size=0.007, # 0.005
81
- contraction=o3d.geometry.SimplificationContraction.Average)
82
-
83
- mesh = mesh.filter_smooth_simple(number_of_iterations=2)
84
-
85
- vtx_refine = np.asarray(mesh.vertices).astype(np.float32)
86
- faces_refine = np.asarray(mesh.triangles)
87
- return vtx_refine, faces_refine, mesh
88
-
89
-
90
- class SVRMModel(torch.nn.Module):
91
- def __init__(
92
- self,
93
- img_encoder_config,
94
- img_to_triplane_config,
95
- render_config,
96
- device = "cuda:0",
97
- **kwargs
98
- ):
99
- super().__init__()
100
-
101
- self.img_encoder = instantiate_from_config(img_encoder_config).half()
102
- self.img_to_triplane_decoder = instantiate_from_config(img_to_triplane_config).half()
103
- self.render = instantiate_from_config(render_config).half()
104
- self.device = device
105
- count_params(self, verbose=True)
106
-
107
- @torch.no_grad()
108
- def export_mesh_with_uv(
109
- self,
110
- data,
111
- mesh_size: int = 384,
112
- ctx = None,
113
- context_type = 'cuda',
114
- texture_res = 1024,
115
- target_face_count = 10000,
116
- do_texture_mapping = True,
117
- out_dir = 'outputs/test'
118
- ):
119
- """
120
- color_type: 0 for ray texture, 1 for vertices texture
121
- """
122
- st = time.time()
123
- here = {'device': self.device, 'dtype': torch.float16}
124
- input_view_image = data["input_view"].to(**here) # [b, m, c, h, w]
125
- input_view_cam = data["input_view_cam"].to(**here) # [b, m, 20]
126
-
127
- batch_size, input_view_num, *_ = input_view_image.shape
128
- assert batch_size == 1, "batch size should be 1"
129
-
130
- input_view_image = rearrange(input_view_image, 'b m c h w -> (b m) c h w')
131
- input_view_cam = rearrange(input_view_cam, 'b m d -> (b m) d')
132
- input_view_feat = self.img_encoder(input_view_image, input_view_cam)
133
- input_view_feat = rearrange(input_view_feat, '(b m) l d -> b (l m) d', m=input_view_num)
134
-
135
- # -- decoder
136
- torch.cuda.empty_cache()
137
- triplane_gen = self.img_to_triplane_decoder(input_view_feat) # [b, 3, tri_dim, h, w]
138
- del input_view_feat
139
- torch.cuda.empty_cache()
140
-
141
- # --- triplane nerf render
142
-
143
- cur_triplane = triplane_gen[0:1]
144
-
145
- aabb = torch.tensor([[-0.6, -0.6, -0.6], [0.6, 0.6, 0.6]]).unsqueeze(0).to(**here)
146
- grid_out = self.render.forward_grid(planes=cur_triplane, grid_size=mesh_size, aabb=aabb)
147
-
148
- print(f"=====> LRM forward time: {time.time() - st}")
149
- st = time.time()
150
-
151
- vtx, faces = mcubes.marching_cubes(0. - grid_out['sdf'].squeeze(0).squeeze(-1).cpu().float().numpy(), 0)
152
-
153
- bbox = aabb[0].cpu().numpy()
154
- vtx = vtx / (mesh_size - 1)
155
- vtx = vtx * (bbox[1] - bbox[0]) + bbox[0]
156
-
157
- # refine mesh
158
- vtx_refine, faces_refine, mesh = refine_mesh(vtx, faces)
159
-
160
- # reduce faces
161
- if faces_refine.shape[0] > target_face_count:
162
- print(f"reduce face: {faces_refine.shape[0]} -> {target_face_count}")
163
- mesh = o3d.geometry.TriangleMesh(
164
- vertices = o3d.utility.Vector3dVector(vtx_refine),
165
- triangles = o3d.utility.Vector3iVector(faces_refine)
166
- )
167
-
168
- # Function to simplify mesh using Quadric Error Metric Decimation by Garland and Heckbert
169
- mesh = mesh.simplify_quadric_decimation(target_face_count, boundary_weight=1.0)
170
-
171
- mesh = Mesh(
172
- v_pos = torch.from_numpy(np.asarray(mesh.vertices)).to(self.device),
173
- t_pos_idx = torch.from_numpy(np.asarray(mesh.triangles)).to(self.device),
174
- v_rgb = torch.from_numpy(np.asarray(mesh.vertex_colors)).to(self.device)
175
- )
176
- vtx_refine = mesh.v_pos.cpu().numpy()
177
- faces_refine = mesh.t_pos_idx.cpu().numpy()
178
-
179
- vtx_colors = self.render.forward_points(cur_triplane, torch.tensor(vtx_refine).unsqueeze(0).to(**here))
180
- vtx_colors = vtx_colors['rgb'].float().squeeze(0).cpu().numpy()
181
-
182
- color_ratio = 0.8 # increase brightness
183
- with open(f'{out_dir}/mesh_with_colors.obj', 'w') as fid:
184
- verts = vtx_refine[:, [1,2,0]]
185
- for pidx, pp in enumerate(verts):
186
- color = vtx_colors[pidx]
187
- color = [color[0]**color_ratio, color[1]**color_ratio, color[2]**color_ratio]
188
- fid.write('v %f %f %f %f %f %f\n' % (pp[0], pp[1], pp[2], color[0], color[1], color[2]))
189
- for i, f in enumerate(faces_refine):
190
- f1 = f + 1
191
- fid.write('f %d %d %d\n' % (f1[0], f1[1], f1[2]))
192
-
193
- mesh = trimesh.load_mesh(f'{out_dir}/mesh_with_colors.obj')
194
- print(f"=====> generate mesh with vertex shading time: {time.time() - st}")
195
- st = time.time()
196
-
197
- if not do_texture_mapping:
198
- shutil.copy(f'{out_dir}/mesh_with_colors.obj', f'{out_dir}/mesh.obj')
199
- mesh.export(f'{out_dir}/mesh.glb', file_type='glb')
200
- return None
201
-
202
- ########## export texture ########
203
- st = time.time()
204
-
205
- # uv unwrap
206
- vtx_tex, t_tex_idx = unwrap_uv(vtx_refine, faces_refine)
207
- vtx_refine = torch.from_numpy(vtx_refine).to(self.device)
208
- faces_refine = torch.from_numpy(faces_refine).to(self.device)
209
- t_tex_idx = torch.from_numpy(t_tex_idx).to(self.device)
210
- uv_clip = torch.from_numpy(vtx_tex * 2.0 - 1.0).to(self.device)
211
-
212
- # rasterize
213
- ctx = NVDiffRasterizerContext(context_type, cur_triplane.device) if ctx is None else ctx
214
- rast = ctx.rasterize_one(
215
- torch.cat([
216
- uv_clip,
217
- torch.zeros_like(uv_clip[..., 0:1]),
218
- torch.ones_like(uv_clip[..., 0:1])
219
- ], dim=-1),
220
- t_tex_idx,
221
- (texture_res, texture_res)
222
- )[0]
223
- hole_mask = ~(rast[:, :, 3] > 0)
224
-
225
- # Interpolate world space position
226
- gb_pos = ctx.interpolate_one(vtx_refine, rast[None, ...], faces_refine)[0][0]
227
- with torch.no_grad():
228
- gb_mask_pos_scale = scale_tensor(gb_pos.unsqueeze(0).view(1, -1, 3), (-1, 1), (-1, 1))
229
- tex_map = self.render.forward_points(cur_triplane, gb_mask_pos_scale)['rgb']
230
- tex_map = tex_map.float().squeeze(0) # (0, 1)
231
- tex_map = tex_map.view((texture_res, texture_res, 3))
232
- img = uv_padding(tex_map, hole_mask)
233
- img = ((img/255.0) ** color_ratio) * 255 # increase brightness
234
- img = img.clip(0, 255).astype(np.uint8)
235
-
236
- verts = vtx_refine.cpu().numpy()[:, [1,2,0]]
237
- faces = faces_refine.cpu().numpy()
238
-
239
- with open(f'{out_dir}/texture.mtl', 'w') as fid:
240
- fid.write('newmtl material_0\n')
241
- fid.write("Ka 1.000 1.000 1.000\n")
242
- fid.write("Kd 1.000 1.000 1.000\n")
243
- fid.write("Ks 0.000 0.000 0.000\n")
244
- fid.write("d 1.0\n")
245
- fid.write("illum 2\n")
246
- fid.write(f'map_Kd texture.png\n')
247
-
248
- with open(f'{out_dir}/mesh.obj', 'w') as fid:
249
- fid.write(f'mtllib texture.mtl\n')
250
- for pidx, pp in enumerate(verts):
251
- fid.write('v %f %f %f\n' % (pp[0], pp[1], pp[2]))
252
- for pidx, pp in enumerate(vtx_tex):
253
- fid.write('vt %f %f\n' % (pp[0], 1 - pp[1]))
254
- fid.write('usemtl material_0\n')
255
- for i, f in enumerate(faces):
256
- f1 = f + 1
257
- f2 = t_tex_idx[i] + 1
258
- fid.write('f %d/%d %d/%d %d/%d\n' % (f1[0], f2[0], f1[1], f2[1], f1[2], f2[2],))
259
-
260
- cv2.imwrite(f'{out_dir}/texture.png', img[..., [2, 1, 0]])
261
- mesh = trimesh.load_mesh(f'{out_dir}/mesh.obj')
262
- mesh.export(f'{out_dir}/mesh.glb', file_type='glb')
263
-
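The geometry extraction in export_mesh_with_uv boils down to: query the triplane renderer on a dense grid inside the [-0.6, 0.6]^3 AABB, run marching cubes on the negated SDF, and map voxel indices back to world coordinates before the Open3D clean-up and optional UV texturing. A compact sketch of that grid-to-mesh step (the AABB value is taken from the code above; everything else is illustrative):

```python
# Sketch of the SDF-grid -> mesh step in SVRMModel.export_mesh_with_uv.
import numpy as np
import mcubes

def sdf_grid_to_mesh(sdf: np.ndarray, aabb=((-0.6, -0.6, -0.6), (0.6, 0.6, 0.6))):
    """sdf: (N, N, N) array; returns world-space vertices and triangle indices."""
    verts, faces = mcubes.marching_cubes(-sdf, 0.0)   # sign flip matches `0. - grid_out['sdf']`
    lo, hi = np.asarray(aabb[0]), np.asarray(aabb[1])
    verts = verts / (sdf.shape[0] - 1)                # voxel index -> [0, 1]
    verts = verts * (hi - lo) + lo                    # [0, 1] -> AABB coordinates
    return verts, faces
```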
 
svrm/ldm/modules/attention.py DELETED
@@ -1,457 +0,0 @@
1
- from inspect import isfunction
2
- import math
3
- import torch
4
- import torch.nn.functional as F
5
- from torch import nn, einsum
6
- from einops import rearrange, repeat
7
- import numpy as np
- from typing import Any, Optional
8
-
9
- FLASH_IS_AVAILABLE = XFORMERS_IS_AVAILBLE = False
10
- try:
11
- from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
12
- FLASH_IS_AVAILABLE = True
13
- except:
14
- try:
15
- import xformers
16
- import xformers.ops
17
- XFORMERS_IS_AVAILBLE = True
18
- except:
19
- pass
20
-
21
- def exists(val):
22
- return val is not None
23
-
24
-
25
- def uniq(arr):
26
- return{el: True for el in arr}.keys()
27
-
28
-
29
- def default(val, d):
30
- if exists(val):
31
- return val
32
- return d() if isfunction(d) else d
33
-
34
-
35
- def max_neg_value(t):
36
- return -torch.finfo(t.dtype).max
37
-
38
-
39
- def init_(tensor):
40
- dim = tensor.shape[-1]
41
- std = 1 / math.sqrt(dim)
42
- tensor.uniform_(-std, std)
43
- return tensor
44
-
45
- def checkpoint(func, inputs, params, flag):
46
- """
47
- Evaluate a function without caching intermediate activations, allowing for
48
- reduced memory at the expense of extra compute in the backward pass.
49
- :param func: the function to evaluate.
50
- :param inputs: the argument sequence to pass to `func`.
51
- :param params: a sequence of parameters `func` depends on but does not
52
- explicitly take as arguments.
53
- :param flag: if False, disable gradient checkpointing.
54
- """
55
- if flag:
56
- args = tuple(inputs) + tuple(params)
57
- return CheckpointFunction.apply(func, len(inputs), *args)
58
- else:
59
- return func(*inputs)
60
-
61
-
62
- class CheckpointFunction(torch.autograd.Function):
63
- @staticmethod
64
- def forward(ctx, run_function, length, *args):
65
- ctx.run_function = run_function
66
- ctx.input_tensors = list(args[:length])
67
- ctx.input_params = list(args[length:])
68
-
69
- with torch.no_grad():
70
- output_tensors = ctx.run_function(*ctx.input_tensors)
71
- return output_tensors
72
-
73
- @staticmethod
74
- def backward(ctx, *output_grads):
75
- ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
76
- with torch.enable_grad():
77
- # Fixes a bug where the first op in run_function modifies the
78
- # Tensor storage in place, which is not allowed for detach()'d
79
- # Tensors.
80
- shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
81
- output_tensors = ctx.run_function(*shallow_copies)
82
- input_grads = torch.autograd.grad(
83
- output_tensors,
84
- ctx.input_tensors + ctx.input_params,
85
- output_grads,
86
- allow_unused=True,
87
- )
88
- del ctx.input_tensors
89
- del ctx.input_params
90
- del output_tensors
91
- return (None, None) + input_grads
92
-
93
-
94
- # feedforward
95
- class GEGLU(nn.Module):
96
- def __init__(self, dim_in, dim_out):
97
- super().__init__()
98
- self.proj = nn.Linear(dim_in, dim_out * 2)
99
-
100
- def forward(self, x):
101
- x, gate = self.proj(x).chunk(2, dim=-1)
102
- return x * F.gelu(gate)
103
-
104
-
105
- class FeedForward(nn.Module):
106
- def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
107
- super().__init__()
108
- inner_dim = int(dim * mult)
109
- dim_out = default(dim_out, dim)
110
- project_in = nn.Sequential(
111
- nn.Linear(dim, inner_dim),
112
- nn.GELU()
113
- ) if not glu else GEGLU(dim, inner_dim)
114
-
115
- self.net = nn.Sequential(
116
- project_in,
117
- nn.Dropout(dropout),
118
- nn.Linear(inner_dim, dim_out)
119
- )
120
-
121
- def forward(self, x):
122
- return self.net(x)
123
-
124
-
125
- def zero_module(module):
126
- """
127
- Zero out the parameters of a module and return it.
128
- """
129
- for p in module.parameters():
130
- p.detach().zero_()
131
- return module
132
-
133
-
134
- def Normalize(in_channels):
135
- return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
136
-
137
-
138
- class LinearAttention(nn.Module):
139
- def __init__(self, dim, heads=4, dim_head=32):
140
- super().__init__()
141
- self.heads = heads
142
- hidden_dim = dim_head * heads
143
- self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
144
- self.to_out = nn.Conv2d(hidden_dim, dim, 1)
145
-
146
- def forward(self, x):
147
- b, c, h, w = x.shape
148
- qkv = self.to_qkv(x)
149
- q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
150
- k = k.softmax(dim=-1)
151
- context = torch.einsum('bhdn,bhen->bhde', k, v)
152
- out = torch.einsum('bhde,bhdn->bhen', context, q)
153
- out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
154
- return self.to_out(out)
155
-
156
-
157
- class SpatialSelfAttention(nn.Module):
158
- def __init__(self, in_channels):
159
- super().__init__()
160
- self.in_channels = in_channels
161
-
162
- self.norm = Normalize(in_channels)
163
- self.q = torch.nn.Conv2d(in_channels,
164
- in_channels,
165
- kernel_size=1,
166
- stride=1,
167
- padding=0)
168
- self.k = torch.nn.Conv2d(in_channels,
169
- in_channels,
170
- kernel_size=1,
171
- stride=1,
172
- padding=0)
173
- self.v = torch.nn.Conv2d(in_channels,
174
- in_channels,
175
- kernel_size=1,
176
- stride=1,
177
- padding=0)
178
- self.proj_out = torch.nn.Conv2d(in_channels,
179
- in_channels,
180
- kernel_size=1,
181
- stride=1,
182
- padding=0)
183
-
184
- def forward(self, x):
185
- h_ = x
186
- h_ = self.norm(h_)
187
- q = self.q(h_)
188
- k = self.k(h_)
189
- v = self.v(h_)
190
-
191
- # compute attention
192
- b,c,h,w = q.shape
193
- q = rearrange(q, 'b c h w -> b (h w) c')
194
- k = rearrange(k, 'b c h w -> b c (h w)')
195
- w_ = torch.einsum('bij,bjk->bik', q, k)
196
-
197
- w_ = w_ * (int(c)**(-0.5))
198
- w_ = torch.nn.functional.softmax(w_, dim=2)
199
-
200
- # attend to values
201
- v = rearrange(v, 'b c h w -> b c (h w)')
202
- w_ = rearrange(w_, 'b i j -> b j i')
203
- h_ = torch.einsum('bij,bjk->bik', v, w_)
204
- h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
205
- h_ = self.proj_out(h_)
206
-
207
- return x+h_
208
-
209
-
210
- class CrossAttention(nn.Module):
211
- def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
212
- super().__init__()
213
- inner_dim = dim_head * heads
214
- context_dim = default(context_dim, query_dim)
215
- self.scale = dim_head ** -0.5
216
- self.heads = heads
217
- self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
218
- self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
219
- self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
220
-
221
- self.to_out = nn.Sequential(
222
- nn.Linear(inner_dim, query_dim),
223
- nn.Dropout(dropout)
224
- )
225
-
226
- def forward(self, x, context=None, mask=None):
227
- h = self.heads
228
- q = self.to_q(x)
229
- context = default(context, x)
230
- k = self.to_k(context)
231
- v = self.to_v(context)
232
- q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
233
- sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
234
- if exists(mask):
235
- mask = rearrange(mask, 'b ... -> b (...)')
236
- max_neg_value = -torch.finfo(sim.dtype).max
237
- mask = repeat(mask, 'b j -> (b h) () j', h=h)
238
- sim.masked_fill_(~mask, max_neg_value)
239
- # attention, what we cannot get enough of
240
- attn = sim.softmax(dim=-1)
241
- out = einsum('b i j, b j d -> b i d', attn, v) # [b*h, n, d]
242
- out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
243
- return self.to_out(out)
244
-
245
-
246
- class FlashAttention(nn.Module):
247
- def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
248
- super().__init__()
249
- print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
250
- f"{heads} heads.")
251
- inner_dim = dim_head * heads
252
- context_dim = default(context_dim, query_dim)
253
- self.scale = dim_head ** -0.5
254
- self.heads = heads
255
- self.dropout = dropout
256
- self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
257
- self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
258
- self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
259
- self.to_out = nn.Sequential(
260
- nn.Linear(inner_dim, query_dim),
261
- nn.Dropout(dropout)
262
- )
263
-
264
- def forward(self, x, context=None, mask=None):
265
- context = default(context, x)
266
- h = self.heads
267
- dtype = torch.bfloat16 # torch.half
268
- q = self.to_q(x).to(dtype)
269
- k = self.to_k(context).to(dtype)
270
- v = self.to_v(context).to(dtype)
271
- q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b n h d', h=h), (q, k, v)) # q is [b, 3079, 16, 64]
272
- out = flash_attn_func(q, k, v, dropout_p=self.dropout, softmax_scale=None, causal=False, window_size=(-1, -1)) # out is same shape to q
273
- out = rearrange(out, 'b n h d -> b n (h d)', h=h)
274
- return self.to_out(out.float())
275
-
276
- class MemoryEfficientCrossAttention(nn.Module):
277
- # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
278
- def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
279
- super().__init__()
280
- print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
281
- f"{heads} heads.")
282
- inner_dim = dim_head * heads
283
- context_dim = default(context_dim, query_dim)
284
-
285
- self.heads = heads
286
- self.dim_head = dim_head
287
-
288
- self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
289
- self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
290
- self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
291
-
292
- self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
293
- self.attention_op: Optional[Any] = None
294
-
295
- def forward(self, x, context=None, mask=None):
296
- q = self.to_q(x)
297
- context = default(context, x)
298
- k = self.to_k(context)
299
- v = self.to_v(context)
300
-
301
- b, _, _ = q.shape
302
- q, k, v = map(
303
- lambda t: t.unsqueeze(3)
304
- .reshape(b, t.shape[1], self.heads, self.dim_head)
305
- .permute(0, 2, 1, 3)
306
- .reshape(b * self.heads, t.shape[1], self.dim_head)
307
- .contiguous(),
308
- (q, k, v),
309
- )
310
-
311
- # actually compute the attention, what we cannot get enough of
312
- out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)
313
-
314
- if exists(mask):
315
- raise NotImplementedError
316
- out = (
317
- out.unsqueeze(0)
318
- .reshape(b, self.heads, out.shape[1], self.dim_head)
319
- .permute(0, 2, 1, 3)
320
- .reshape(b, out.shape[1], self.heads * self.dim_head)
321
- )
322
- return self.to_out(out)
323
-
324
- class BasicTransformerBlock(nn.Module):
325
- def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
326
- disable_self_attn=False):
327
- super().__init__()
328
- self.disable_self_attn = disable_self_attn
329
- self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
330
- context_dim=context_dim if self.disable_self_attn else None) # is a self-attention if not self.disable_self_attn
331
- self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
332
- self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
333
- heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
334
- self.norm1 = Fp32LayerNorm(dim)
335
- self.norm2 = Fp32LayerNorm(dim)
336
- self.norm3 = Fp32LayerNorm(dim)
337
- self.checkpoint = checkpoint
338
-
339
- def forward(self, x, context=None):
340
- return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
341
-
342
- def _forward(self, x, context=None):
343
- x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x
344
- x = self.attn2(self.norm2(x), context=context) + x
345
- x = self.ff(self.norm3(x)) + x
346
- return x
347
-
348
- ATTENTION_MODES = {
349
- "softmax": CrossAttention, # vanilla attention
350
- "softmax-xformers": MemoryEfficientCrossAttention,
351
- "softmax-flash": FlashAttention
352
- }
353
-
354
- def modulate(x, shift, scale):
355
- return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
356
-
357
-
358
- class Fp32LayerNorm(nn.LayerNorm):
359
- def __init__(self, *args, **kwargs):
360
- super().__init__(*args, **kwargs)
361
- def forward(self, x):
362
- return super().forward(x.float()).type(x.dtype)
363
-
364
-
365
- class AdaNorm(nn.Module):
366
- def __init__(self, dim):
367
- super().__init__()
368
- self.adaLN_modulation = nn.Sequential(
369
- nn.SiLU(),
370
- nn.Linear(dim, 2 * dim, bias=True)
371
- )
372
- self.norm = Fp32LayerNorm(dim, elementwise_affine=False, eps=1e-6)
373
-
374
- def forward(self, x, c): # x is fp32, c is fp16
375
- shift, scale = self.adaLN_modulation(c.float()).chunk(2, dim=1) # bf16
376
- x = modulate(self.norm(x), shift, scale) # fp32
377
- return x
378
-
379
-
380
- class BasicTransformerBlockLRM(nn.Module):
381
- def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, \
382
- checkpoint=True):
383
- super().__init__()
384
-
385
- attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
386
- attn_mode = "softmax-flash" if FLASH_IS_AVAILABLE else attn_mode
387
- assert attn_mode in ATTENTION_MODES
388
- attn_cls = ATTENTION_MODES[attn_mode]
389
-
390
- self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, \
391
- context_dim=context_dim) # cross-attn
392
- self.attn2 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, \
393
- context_dim=None) # self-attn
394
-
395
- self.norm1 = Fp32LayerNorm(dim)
396
- self.norm2 = Fp32LayerNorm(dim)
397
- self.norm3 = Fp32LayerNorm(dim)
398
-
399
- self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
400
- self.checkpoint = checkpoint
401
-
402
- def forward(self, x, context=None, cam_emb=None): # (torch.float32, torch.float32, torch.bfloat16)
403
- return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
404
-
405
-
406
- def _forward(self, x, context=None, cam_emb=None):
407
-
408
- x = self.attn1(self.norm1(x), context=context) + x # cross-attn
409
- x = self.attn2(self.norm2(x), context=None) + x # self-attn
410
- x = self.ff(self.norm3(x)) + x
411
-
412
- return x
413
-
414
- class ImgToTriplaneTransformer(nn.Module):
415
- """
416
- Transformer block for image-like data.
417
- First, project the input (aka embedding)
418
- and reshape to b, t, d.
419
- Then apply standard transformer action.
420
- Finally, reshape to image
421
- """
422
- def __init__(self, query_dim, n_heads, d_head, depth=1, dropout=0., context_dim=None, triplane_size=64):
423
- super().__init__()
424
-
425
- self.transformer_blocks = nn.ModuleList([
426
- BasicTransformerBlockLRM(query_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
427
- for d in range(depth)])
428
-
429
- self.norm = Fp32LayerNorm(query_dim, eps=1e-6)
430
-
431
- self.initialize_weights()
432
-
433
- def initialize_weights(self):
434
- # Initialize transformer layers:
435
- def _basic_init(module):
436
- if isinstance(module, nn.Linear):
437
- torch.nn.init.xavier_uniform_(module.weight)
438
- if module.bias is not None:
439
- nn.init.constant_(module.bias, 0)
440
- elif isinstance(module, nn.LayerNorm):
441
- if module.bias is not None:
442
- nn.init.constant_(module.bias, 0)
443
- if module.weight is not None:
444
- nn.init.constant_(module.weight, 1.0)
445
- self.apply(_basic_init)
446
-
447
- def forward(self, x, context=None, cam_emb=None):
448
- # note: if no context is given, cross-attention defaults to self-attention
449
- for block in self.transformer_blocks:
450
- x = block(x, context=context)
451
- x = self.norm(x)
452
- return x
453
-
454
-
455
-
456
-
457
-
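All attention variants in this file share one interface: queries come from x of shape (batch, tokens, query_dim), keys/values from an optional context of shape (batch, context_tokens, context_dim), and the output keeps the query shape. A small shape check against the vanilla CrossAttention class (the tensor sizes are illustrative assumptions, though 1024/768/16/64 match the triplane config):

```python
# Shape check for CrossAttention (illustrative sizes; not part of the original file).
import torch
from svrm.ldm.modules.attention import CrossAttention

attn = CrossAttention(query_dim=1024, context_dim=768, heads=16, dim_head=64)
x = torch.randn(1, 256, 1024)          # e.g. a slice of triplane tokens as queries
context = torch.randn(1, 77, 768)      # e.g. image-encoder tokens as keys/values
out = attn(x, context=context)
assert out.shape == x.shape            # attention preserves the query token layout
```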
 
svrm/ldm/modules/encoders/__init__.py DELETED
File without changes
svrm/ldm/modules/encoders/dinov2/__init__.py DELETED
File without changes
svrm/ldm/modules/encoders/dinov2/hub/__init__.py DELETED
File without changes
svrm/ldm/modules/encoders/dinov2/hub/backbones.py DELETED
@@ -1,156 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-from enum import Enum
-from typing import Union
-
-import torch
-
-from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
-
-
-class Weights(Enum):
-    LVD142M = "LVD142M"
-
-
-def _make_dinov2_model(
-    *,
-    arch_name: str = "vit_large",
-    img_size: int = 518,
-    patch_size: int = 14,
-    init_values: float = 1.0,
-    ffn_layer: str = "mlp",
-    block_chunks: int = 0,
-    num_register_tokens: int = 0,
-    interpolate_antialias: bool = False,
-    interpolate_offset: float = 0.1,
-    pretrained: bool = True,
-    weights: Union[Weights, str] = Weights.LVD142M,
-    **kwargs,
-):
-    from ..models import vision_transformer as vits
-
-    if isinstance(weights, str):
-        try:
-            weights = Weights[weights]
-        except KeyError:
-            raise AssertionError(f"Unsupported weights: {weights}")
-
-    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
-    vit_kwargs = dict(
-        img_size=img_size,
-        patch_size=patch_size,
-        init_values=init_values,
-        ffn_layer=ffn_layer,
-        block_chunks=block_chunks,
-        num_register_tokens=num_register_tokens,
-        interpolate_antialias=interpolate_antialias,
-        interpolate_offset=interpolate_offset,
-    )
-    vit_kwargs.update(**kwargs)
-    model = vits.__dict__[arch_name](**vit_kwargs)
-
-    if pretrained:
-        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
-        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
-        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
-        model.load_state_dict(state_dict, strict=True)
-
-    return model
-
-
-def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
-
-
-def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
-
-
-def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
-
-
-def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(
-        arch_name="vit_giant2",
-        ffn_layer="swiglufused",
-        weights=weights,
-        pretrained=pretrained,
-        **kwargs,
-    )
-
-
-def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(
-        arch_name="vit_small",
-        pretrained=pretrained,
-        weights=weights,
-        num_register_tokens=4,
-        interpolate_antialias=True,
-        interpolate_offset=0.0,
-        **kwargs,
-    )
-
-
-def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(
-        arch_name="vit_base",
-        pretrained=pretrained,
-        weights=weights,
-        num_register_tokens=4,
-        interpolate_antialias=True,
-        interpolate_offset=0.0,
-        **kwargs,
-    )
-
-
-def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(
-        arch_name="vit_large",
-        pretrained=pretrained,
-        weights=weights,
-        num_register_tokens=4,
-        interpolate_antialias=True,
-        interpolate_offset=0.0,
-        **kwargs,
-    )
-
-
-def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
-    """
-    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
-    """
-    return _make_dinov2_model(
-        arch_name="vit_giant2",
-        ffn_layer="swiglufused",
-        weights=weights,
-        pretrained=pretrained,
-        num_register_tokens=4,
-        interpolate_antialias=True,
-        interpolate_offset=0.0,
-        **kwargs,
-    )
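For orientation, a small standalone sketch (not part of the commit) of how the factory above resolves a pretrained checkpoint URL; the name helper mirrors `_make_dinov2_model_name` from hub/utils.py below.

```python
def make_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
    # mirrors _make_dinov2_model_name: "vit_base" -> "vitb", plus optional register suffix
    compact = arch_name.replace("_", "")[:4]
    suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
    return f"dinov2_{compact}{patch_size}{suffix}"

base = "https://dl.fbaipublicfiles.com/dinov2"
print(f"{base}/{make_name('vit_base', 14)}/{make_name('vit_base', 14, 4)}_pretrain.pth")
# https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_pretrain.pth
```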
svrm/ldm/modules/encoders/dinov2/hub/utils.py DELETED
@@ -1,39 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-import itertools
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
-
-
-def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
-    compact_arch_name = arch_name.replace("_", "")[:4]
-    registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
-    return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
-
-
-class CenterPadding(nn.Module):
-    def __init__(self, multiple):
-        super().__init__()
-        self.multiple = multiple
-
-    def _get_pad(self, size):
-        new_size = math.ceil(size / self.multiple) * self.multiple
-        pad_size = new_size - size
-        pad_size_left = pad_size // 2
-        pad_size_right = pad_size - pad_size_left
-        return pad_size_left, pad_size_right
-
-    @torch.inference_mode()
-    def forward(self, x):
-        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
-        output = F.pad(x, pads)
-        return output
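A minimal sketch (my own, assuming a standard BCHW image batch) of the arithmetic `CenterPadding` applies so that height and width become multiples of the patch size, with the padding split evenly on both sides.

```python
import math
import torch
import torch.nn.functional as F

def center_pad(x, multiple=14):
    pads = []
    for size in x.shape[:1:-1]:      # iterate W, then H, the order F.pad expects
        new_size = math.ceil(size / multiple) * multiple
        left = (new_size - size) // 2
        pads += [left, new_size - size - left]
    return F.pad(x, pads)

x = torch.randn(1, 3, 300, 301)
print(center_pad(x).shape)           # torch.Size([1, 3, 308, 308])
```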
svrm/ldm/modules/encoders/dinov2/layers/__init__.py DELETED
@@ -1,11 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-from .dino_head import DINOHead
-from .mlp import Mlp
-from .patch_embed import PatchEmbed
-from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
-from .block import NestedTensorBlockMod
-from .attention import MemEffAttention
svrm/ldm/modules/encoders/dinov2/layers/attention.py DELETED
@@ -1,89 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-# References:
-#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
-#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
-
-import logging
-import os
-import warnings
-
-from torch import Tensor
-from torch import nn
-
-
-logger = logging.getLogger("dinov2")
-
-
-XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
-try:
-    if XFORMERS_ENABLED:
-        from xformers.ops import memory_efficient_attention, unbind
-
-        XFORMERS_AVAILABLE = True
-        warnings.warn("xFormers is available (Attention)")
-    else:
-        warnings.warn("xFormers is disabled (Attention)")
-        raise ImportError
-except ImportError:
-    XFORMERS_AVAILABLE = False
-    warnings.warn("xFormers is not available (Attention)")
-
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int = 8,
-        qkv_bias: bool = False,
-        proj_bias: bool = True,
-        attn_drop: float = 0.0,
-        proj_drop: float = 0.0,
-    ) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = head_dim**-0.5
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim, bias=proj_bias)
-        self.proj_drop = nn.Dropout(proj_drop)
-
-    def forward(self, x: Tensor) -> Tensor:
-        B, N, C = x.shape
-        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-
-        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
-        attn = q @ k.transpose(-2, -1)
-
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-
-class MemEffAttention(Attention):
-    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
-        if not XFORMERS_AVAILABLE:
-            if attn_bias is not None:
-                raise AssertionError("xFormers is required for using nested tensors")
-            return super().forward(x)
-
-        B, N, C = x.shape
-        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
-
-        q, k, v = unbind(qkv, 2)
-
-        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
-        x = x.reshape([B, N, C])
-
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
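A quick shape-check sketch (my own, not from the repo) of the contract both attention paths above share: a `(B, N, C)` token sequence goes in, the channel dimension is split across heads for scaled dot-product attention, and `(B, N, C)` comes out.

```python
import torch
import torch.nn as nn

B, N, C, heads = 2, 16, 64, 8
x = torch.randn(B, N, C)
qkv = nn.Linear(C, 3 * C, bias=False)(x).reshape(B, N, 3, heads, C // heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]                           # each (B, heads, N, C // heads)
attn = ((C // heads) ** -0.5 * q) @ k.transpose(-2, -1)    # scaled dot-product scores
out = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(B, N, C)
print(out.shape)                                           # torch.Size([2, 16, 64])
```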
svrm/ldm/modules/encoders/dinov2/layers/block.py DELETED
@@ -1,269 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-# References:
-#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
-#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
-
-import os
-import logging
-import warnings
-from typing import Callable, List, Any, Tuple, Dict
-
-import torch
-from torch import nn, Tensor
-
-from .attention import Attention, MemEffAttention
-from .drop_path import DropPath
-from .layer_scale import LayerScale
-from .mlp import Mlp
-
-from ....attention import AdaNorm
-
-
-logger = logging.getLogger("dinov2")
-
-
-XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
-try:
-    if XFORMERS_ENABLED:
-        from xformers.ops import fmha, scaled_index_add, index_select_cat
-
-        XFORMERS_AVAILABLE = True
-        warnings.warn("xFormers is available (Block)")
-    else:
-        warnings.warn("xFormers is disabled (Block)")
-        raise ImportError
-except ImportError:
-    XFORMERS_AVAILABLE = False
-
-    warnings.warn("xFormers is not available (Block)")
-
-
-class BlockMod(nn.Module):
-    '''
-    using Modified Block, see below
-    '''
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = False,
-        proj_bias: bool = True,
-        ffn_bias: bool = True,
-        drop: float = 0.0,
-        attn_drop: float = 0.0,
-        init_values=None,
-        drop_path: float = 0.0,
-        act_layer: Callable[..., nn.Module] = nn.GELU,
-        norm_layer: Callable[..., nn.Module] = AdaNorm,
-        attn_class: Callable[..., nn.Module] = Attention,
-        ffn_layer: Callable[..., nn.Module] = Mlp,
-    ) -> None:
-        super().__init__()
-        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
-        self.norm1 = norm_layer(dim)
-        self.attn = attn_class(
-            dim,
-            num_heads=num_heads,
-            qkv_bias=qkv_bias,
-            proj_bias=proj_bias,
-            attn_drop=attn_drop,
-            proj_drop=drop,
-        )
-        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
-        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
-
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = ffn_layer(
-            in_features=dim,
-            hidden_features=mlp_hidden_dim,
-            act_layer=act_layer,
-            drop=drop,
-            bias=ffn_bias,
-        )
-        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
-        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
-
-        self.sample_drop_ratio = drop_path
-
-    def forward(self, x: Tensor, cam_emb: Tensor) -> Tensor:
-        def attn_residual_func(x: Tensor, cam_emb: Tensor = None) -> Tensor:
-            return self.ls1(self.attn(self.norm1(x, cam_emb)))
-
-        def ffn_residual_func(x: Tensor, cam_emb: Tensor = None) -> Tensor:
-            return self.ls2(self.mlp(self.norm2(x, cam_emb)))
-
-        if self.training and self.sample_drop_ratio > 0.1:
-            # the overhead is compensated only for a drop path rate larger than 0.1
-            x = drop_add_residual_stochastic_depth(
-                x,
-                residual_func=attn_residual_func,
-                sample_drop_ratio=self.sample_drop_ratio,
-            )
-            x = drop_add_residual_stochastic_depth(
-                x,
-                residual_func=ffn_residual_func,
-                sample_drop_ratio=self.sample_drop_ratio,
-            )
-        elif self.training and self.sample_drop_ratio > 0.0:
-            x = x + self.drop_path1(attn_residual_func(x, cam_emb))
-            x = x + self.drop_path1(ffn_residual_func(x, cam_emb))  # FIXME: drop_path2
-        else:
-            x = x + attn_residual_func(x, cam_emb)
-            x = x + ffn_residual_func(x, cam_emb)
-        return x
-
-
-def drop_add_residual_stochastic_depth(
-    x: Tensor,
-    residual_func: Callable[[Tensor], Tensor],
-    sample_drop_ratio: float = 0.0,
-) -> Tensor:
-    # drop_add_residual_stochastic_depth_list
-
-    # 1) extract subset using permutation
-    b, n, d = x.shape
-    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
-    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
-    x_subset = x[brange]
-
-    # 2) apply residual_func to get residual
-    residual = residual_func(x_subset)
-
-    x_flat = x.flatten(1)
-    residual = residual.flatten(1)
-
-    residual_scale_factor = b / sample_subset_size
-
-    # 3) add the residual
-    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
-    return x_plus_residual.view_as(x)
-
-
-def get_branges_scales(x, sample_drop_ratio=0.0):
-    # get_branges_scales
-    b, n, d = x.shape
-    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
-    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
-    residual_scale_factor = b / sample_subset_size
-    return brange, residual_scale_factor
-
-
-def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
-    # add residuals
-    if scaling_vector is None:
-        x_flat = x.flatten(1)
-        residual = residual.flatten(1)
-        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
-    else:
-        x_plus_residual = scaled_index_add(
-            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
-        )
-    return x_plus_residual
-
-
-attn_bias_cache: Dict[Tuple, Any] = {}
-
-
-def get_attn_bias_and_cat(x_list, branges=None):
-    """
-    this will perform the index select, cat the tensors, and provide the attn_bias from cache
-    """
-    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
-    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
-    if all_shapes not in attn_bias_cache.keys():
-        seqlens = []
-        for b, x in zip(batch_sizes, x_list):
-            for _ in range(b):
-                seqlens.append(x.shape[1])
-        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
-        attn_bias._batch_sizes = batch_sizes
-        attn_bias_cache[all_shapes] = attn_bias
-
-    if branges is not None:
-        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
-    else:
-        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
-        cat_tensors = torch.cat(tensors_bs1, dim=1)
-
-    return attn_bias_cache[all_shapes], cat_tensors
-
-
-def drop_add_residual_stochastic_list(
-    x_list: List[Tensor],
-    residual_func: Callable[[Tensor, Any], Tensor],
-    sample_drop_ratio: float = 0.0,
-    scaling_vector=None,
-) -> Tensor:
-    # 1) generate random set of indices for dropping samples in the batch
-    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
-    branges = [s[0] for s in branges_scales]
-    residual_scale_factors = [s[1] for s in branges_scales]
-
-    # 2) get attention bias and index+concat the tensors
-    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
-
-    # 3) apply residual_func to get residual, and split the result
-    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
-
-    outputs = []
-    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
-        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
-    return outputs
-
-
-class NestedTensorBlockMod(BlockMod):
-    def forward_nested(self, x_list: List[Tensor], cam_emb_list: List[Tensor]) -> List[Tensor]:
-        """
-        x_list contains a list of tensors to nest together and run
-        """
-        assert isinstance(self.attn, MemEffAttention)
-
-        if self.training and self.sample_drop_ratio > 0.0:
-
-            def attn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
-                return self.attn(self.norm1(x, cam_emb), attn_bias=attn_bias)
-
-            def ffn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
-                return self.mlp(self.norm2(x, cam_emb))
-
-            x_list = drop_add_residual_stochastic_list(
-                x_list,
-                residual_func=attn_residual_func,
-                sample_drop_ratio=self.sample_drop_ratio,
-                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
-            )
-            x_list = drop_add_residual_stochastic_list(
-                x_list,
-                residual_func=ffn_residual_func,
-                sample_drop_ratio=self.sample_drop_ratio,
-                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
-            )
-            return x_list
-        else:
-
-            def attn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
-                return self.ls1(self.attn(self.norm1(x, cam_emb), attn_bias=attn_bias))
-
-            def ffn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
-                return self.ls2(self.mlp(self.norm2(x, cam_emb)))
-
-            attn_bias, x = get_attn_bias_and_cat(x_list)
-            x = x + attn_residual_func(x, attn_bias=attn_bias)
-            x = x + ffn_residual_func(x)
-            return attn_bias.split(x)
-
-    def forward(self, x_or_x_list, cam_emb_or_cam_emb_list):
-        if isinstance(x_or_x_list, Tensor) and isinstance(cam_emb_or_cam_emb_list, Tensor):
-            return super().forward(x_or_x_list, cam_emb_or_cam_emb_list)
-        elif isinstance(x_or_x_list, list) and isinstance(cam_emb_or_cam_emb_list, list):
-            if not XFORMERS_AVAILABLE:
-                raise AssertionError("xFormers is required for using nested tensors")
-            return self.forward_nested(x_or_x_list, cam_emb_or_cam_emb_list)
-        else:
-            raise AssertionError
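A standalone sketch (my own) of the batch-level stochastic depth used in `drop_add_residual_stochastic_depth` above: only a random subset of the batch runs the residual branch, and the added residual is rescaled by `b / subset_size` so its expected contribution is preserved.

```python
import torch

def stochastic_residual(x, residual_func, drop_ratio):
    b = x.shape[0]
    keep = max(int(b * (1 - drop_ratio)), 1)
    idx = torch.randperm(b, device=x.device)[:keep]        # random subset of the batch
    residual = residual_func(x[idx]).flatten(1)
    out = torch.index_add(x.flatten(1), 0, idx, residual, alpha=b / keep)
    return out.view_as(x)

x = torch.randn(8, 4, 16)
y = stochastic_residual(x, lambda t: 0.1 * t, drop_ratio=0.25)
print(y.shape)   # torch.Size([8, 4, 16]); 6 of 8 samples received the (rescaled) residual
```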
svrm/ldm/modules/encoders/dinov2/layers/dino_head.py DELETED
@@ -1,58 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-import torch
-import torch.nn as nn
-from torch.nn.init import trunc_normal_
-from torch.nn.utils import weight_norm
-
-
-class DINOHead(nn.Module):
-    def __init__(
-        self,
-        in_dim,
-        out_dim,
-        use_bn=False,
-        nlayers=3,
-        hidden_dim=2048,
-        bottleneck_dim=256,
-        mlp_bias=True,
-    ):
-        super().__init__()
-        nlayers = max(nlayers, 1)
-        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
-        self.apply(self._init_weights)
-        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
-        self.last_layer.weight_g.data.fill_(1)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=0.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-
-    def forward(self, x):
-        x = self.mlp(x)
-        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
-        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
-        x = self.last_layer(x)
-        return x
-
-
-def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
-    if nlayers == 1:
-        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
-    else:
-        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
-        if use_bn:
-            layers.append(nn.BatchNorm1d(hidden_dim))
-        layers.append(nn.GELU())
-        for _ in range(nlayers - 2):
-            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
-            if use_bn:
-                layers.append(nn.BatchNorm1d(hidden_dim))
-            layers.append(nn.GELU())
-        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
-        return nn.Sequential(*layers)
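A minimal sketch (my own) of the projection stack `_build_mlp` assembles for the defaults above (`nlayers=3`, `hidden_dim=2048`, `bottleneck_dim=256`, no batch norm), before the weight-normalized last layer maps the L2-normalized bottleneck to `out_dim`; the 384 input width is just an illustrative `in_dim`.

```python
import torch.nn as nn

head_mlp = nn.Sequential(
    nn.Linear(384, 2048), nn.GELU(),    # in_dim -> hidden_dim
    nn.Linear(2048, 2048), nn.GELU(),   # (nlayers - 2) hidden blocks
    nn.Linear(2048, 256),               # hidden_dim -> bottleneck_dim
)
print(head_mlp)
```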
svrm/ldm/modules/encoders/dinov2/layers/drop_path.py DELETED
@@ -1,34 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-# References:
-#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
-#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
-
-
-from torch import nn
-
-
-def drop_path(x, drop_prob: float = 0.0, training: bool = False):
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
-    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-    if keep_prob > 0.0:
-        random_tensor.div_(keep_prob)
-    output = x * random_tensor
-    return output
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
-
-    def __init__(self, drop_prob=None):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training)
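A quick numerical check (my own sketch) of the rescaling in `drop_path`: surviving samples are divided by `keep_prob`, so the expected activation magnitude is unchanged even though some samples are zeroed.

```python
import torch

x = torch.ones(10000, 1)
keep_prob = 0.8
mask = x.new_empty(x.shape[0], 1).bernoulli_(keep_prob).div_(keep_prob)
print((x * mask).mean().item())   # close to 1.0 in expectation
```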
svrm/ldm/modules/encoders/dinov2/layers/layer_scale.py DELETED
@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
-
-from typing import Union
-
-import torch
-from torch import Tensor
-from torch import nn
-
-
-class LayerScale(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        init_values: Union[float, Tensor] = 1e-5,
-        inplace: bool = False,
-    ) -> None:
-        super().__init__()
-        self.inplace = inplace
-        self.gamma = nn.Parameter(init_values * torch.ones(dim))
-
-    def forward(self, x: Tensor) -> Tensor:
-        return x.mul_(self.gamma) if self.inplace else x * self.gamma
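LayerScale in one line (my own sketch): a learnable per-channel gain initialized near zero, so each residual branch starts as a small perturbation of the identity and grows only if training pushes `gamma` up.

```python
import torch

dim, init = 8, 1e-5
gamma = torch.nn.Parameter(init * torch.ones(dim))
x = torch.randn(2, 4, dim)
print((x * gamma).abs().max().item())   # tiny at initialization; gamma is learned during training
```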
svrm/ldm/modules/encoders/dinov2/layers/mlp.py DELETED
@@ -1,40 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the Apache License, Version 2.0
-# found in the LICENSE file in the root directory of this source tree.
-
-# References:
-#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
-#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
-
-
-from typing import Callable, Optional
-
-from torch import Tensor, nn
-
-
-class Mlp(nn.Module):
-    def __init__(
-        self,
-        in_features: int,
-        hidden_features: Optional[int] = None,
-        out_features: Optional[int] = None,
-        act_layer: Callable[..., nn.Module] = nn.GELU,
-        drop: float = 0.0,
-        bias: bool = True,
-    ) -> None:
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
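Shape check (my own sketch, with an illustrative 768-wide token and 4x expansion): the feed-forward block above expands to `hidden_features` and projects back, so the token shape is preserved.

```python
import torch
import torch.nn as nn

mlp = nn.Sequential(nn.Linear(768, 3072), nn.GELU(), nn.Linear(3072, 768))
print(mlp(torch.randn(2, 16, 768)).shape)   # torch.Size([2, 16, 768])
```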