Versatile-Diffusion

Build error

App Files Files Community

osanseviero

JamesXu committed on Nov 16, 2022

Commit

67a8158

0 Parent(s):

Duplicate from shi-labs/Versatile-Diffusion

Browse files

Co-authored-by: Xingqian Xu <[email protected]>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +37 -0
.gitignore +7 -0
README.md +15 -0
app.py +729 -0
assets/benz.jpg +3 -0
assets/boy_and_girl.jpg +3 -0
assets/church.jpg +3 -0
assets/firework.jpg +3 -0
assets/ghibli.jpg +3 -0
assets/horse.png +3 -0
assets/house_by_lake.jpg +3 -0
assets/matisse.jpg +3 -0
assets/night_light.jpg +3 -0
assets/penguin.png +3 -0
assets/san_diego.jpg +3 -0
assets/scream.jpg +3 -0
assets/space.jpg +3 -0
assets/tiger.jpg +3 -0
assets/train.jpg +3 -0
assets/vermeer.jpg +3 -0
configs/model/clip.yaml +50 -0
configs/model/openai_unet.yaml +72 -0
configs/model/optimus.yaml +102 -0
configs/model/sd.yaml +68 -0
configs/model/vd.yaml +61 -0
lib/__init__.py +0 -0
lib/cfg_helper.py +664 -0
lib/cfg_holder.py +28 -0
lib/data_factory/__init__.py +6 -0
lib/data_factory/common/__init__.py +6 -0
lib/data_factory/common/ds_base.py +272 -0
lib/data_factory/common/ds_estimator.py +39 -0
lib/data_factory/common/ds_formatter.py +37 -0
lib/data_factory/common/ds_loader.py +96 -0
lib/data_factory/common/ds_sampler.py +273 -0
lib/data_factory/common/ds_transform.py +177 -0
lib/evaluator/__init__.py +1 -0
lib/evaluator/eva_base.py +292 -0
lib/evaluator/eva_null.py +25 -0
lib/experiments/__init__.py +0 -0
lib/experiments/sd_default.py +441 -0
lib/log_service.py +166 -0
lib/model_zoo/__init__.py +4 -0
lib/model_zoo/attention.py +435 -0
lib/model_zoo/autoencoder.py +428 -0
lib/model_zoo/bert.py +142 -0
lib/model_zoo/clip.py +226 -0
lib/model_zoo/clip_justin/__init__.py +1 -0
lib/model_zoo/clip_justin/clip.py +237 -0
lib/model_zoo/clip_justin/model.py +436 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+__pycache__
+.vscode/
+src/
+data/
+data
+log/
+log

README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+---
+title: Versatile Diffusion
+emoji: null
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 3.9.1
+app_file: app.py
+pinned: false
+license: mit
+python_version: 3.8.5
+duplicated_from: shi-labs/Versatile-Diffusion
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,729 @@

+import gradio as gr
+import os
+import PIL
+from PIL import Image
+from pathlib import Path
+import numpy as np
+import numpy.random as npr
+from contextlib import nullcontext
+import torch
+import torchvision.transforms as tvtrans
+from lib.cfg_helper import model_cfg_bank
+from lib.model_zoo import get_model
+from lib.model_zoo.ddim_vd import DDIMSampler_VD, DDIMSampler_VD_DualContext
+from lib.model_zoo.ddim_dualcontext import DDIMSampler_DualContext
+from lib.experiments.sd_default import color_adjust
+# Default number of samples produced per request when a caller passes
+# n_samples=None: n_sample_image for image outputs, n_sample_text for text
+# outputs. n_sample_image is also used as the gallery grid width below.
+n_sample_image = 2
+n_sample_text = 4
+# Forwarded as gr.Examples(cache_examples=...) for every example panel.
+cache_examples = True
+class vd_inference(object):
+    def __init__(self, type='official'):
+        """Build the network, load its checkpoint and attach a DDIM sampler.
+
+        Args:
+            type: Model variant. ``'dc'`` / ``'2-flow'`` selects the
+                ``vd_dc_noema`` config with ``DDIMSampler_DualContext``;
+                ``'official'`` / ``'4-flow'`` selects ``vd_noema`` with
+                ``DDIMSampler_VD``. The name shadows the builtin but is kept
+                because it is part of the public signature.
+
+        Raises:
+            ValueError: If ``type`` is not one of the values listed above.
+        """
+        if type in ['dc', '2-flow']:
+            cfgm_name = 'vd_dc_noema'
+            sampler = DDIMSampler_DualContext
+            pth = 'pretrained/vd-dc.pth'
+        elif type in ['official', '4-flow']:
+            cfgm_name = 'vd_noema'
+            sampler = DDIMSampler_VD
+            pth = 'pretrained/vd-official.pth'
+        else:
+            # Without this branch an unknown type fell through and failed
+            # further down with an UnboundLocalError on cfgm_name.
+            raise ValueError('Unknown model type: {!r}'.format(type))
+        cfgm = model_cfg_bank()(cfgm_name)
+        net = get_model()(cfgm)
+        # The checkpoint is loaded onto the CPU first; the network is moved
+        # to the GPU afterwards when one is available.
+        sd = torch.load(pth, map_location='cpu')
+        # strict=False: key mismatches between checkpoint and network are
+        # not raised as errors.
+        net.load_state_dict(sd, strict=False)
+        self.use_cuda = torch.cuda.is_available()
+        if self.use_cuda:
+            net.to('cuda')
+        self.model_name = cfgm_name
+        self.net = net
+        self.sampler = sampler(net)
+    def regularize_image(self, x):
+        """Turn an input image into a 512x512 tensor on the model's device.
+
+        Args:
+            x: A file path (``str``), a ``PIL.Image.Image``, a ``numpy``
+                array, or a ``torch.Tensor``. Non-tensor inputs are resized
+                to 512x512 with bicubic resampling and converted with
+                torchvision's ``ToTensor``. Tensors are passed through
+                unchanged and must already have size 512 on dims 1 and 2.
+
+        Returns:
+            The tensor, moved to CUDA when ``self.use_cuda`` is set.
+
+        Raises:
+            AssertionError: If the input type is unsupported or the tensor's
+                dims 1 and 2 are not both 512. Raised explicitly so the
+                checks still run under ``python -O``.
+        """
+        if not isinstance(x, torch.Tensor):
+            # Pillow >= 9.1 exposes the filter as Image.Resampling.BICUBIC;
+            # older releases only provide Image.BICUBIC.
+            BICUBIC = getattr(PIL.Image, 'Resampling', PIL.Image).BICUBIC
+            if isinstance(x, str):
+                x = Image.open(x)
+            elif isinstance(x, np.ndarray):
+                x = PIL.Image.fromarray(x)
+            elif not isinstance(x, PIL.Image.Image):
+                raise AssertionError('Unknown image type')
+            x = x.resize([512, 512], resample=BICUBIC)
+            x = tvtrans.ToTensor()(x)
+        if not (x.shape[1] == 512 and x.shape[2] == 512):
+            raise AssertionError('Wrong image size')
+        if self.use_cuda:
+            x = x.to('cuda')
+        return x
+    def decode(self, z, xtype, ctype, color_adj='None', color_adj_to=None):
+        """Decode sampled latents into images or sentences.
+
+        Args:
+            z: Latents produced by the sampler.
+            xtype: ``'image'`` or ``'text'``; anything else returns ``None``.
+            ctype: Context type; colour adjustment is only applied when it
+                is ``'vision'``.
+            color_adj: Colour-calibration mode. ``None`` and ``'None'``
+                disable it; ``'Simple'`` sets ``simple=True`` on the adjuster.
+            color_adj_to: Reference passed to ``color_adjust`` as ``ref_to``.
+
+        Returns:
+            For images: a list with one entry per sample, either the
+            ``color_adjust`` results or PIL images. For text: a list of
+            strings with immediately repeated words collapsed.
+        """
+        model = self.net
+        if xtype == 'image':
+            images = model.autokl_decode(z)
+            wants_adjust = (color_adj is not None) and (color_adj != 'None')
+            if wants_adjust and (ctype == 'vision'):
+                keep_ratio = 0.5
+                use_simple = (color_adj == 'Simple')
+                # Each sample is mapped from [-1, 1] to [0, 1] before being
+                # handed to the colour adjuster.
+                return [
+                    color_adjust(ref_from=(img+1)/2, ref_to=color_adj_to)(
+                        (img+1)/2, keep=keep_ratio, simple=use_simple)
+                    for img in images]
+            to_pil = tvtrans.ToPILImage()
+            images = torch.clamp((images+1.0)/2.0, min=0.0, max=1.0)
+            return [to_pil(img) for img in images]
+        if xtype == 'text':
+            sentences = model.optimus_decode(z, temperature=1.0)
+            merged = []
+            for sentence in sentences:
+                words = sentence.split()
+                # Drop a word when it equals the word right before it.
+                kept = [w for pos, w in enumerate(words)
+                        if pos == 0 or w != words[pos-1]]
+                merged.append(' '.join(kept))
+            return merged
+        return None
+    def inference(self, xtype, cin, ctype, scale=7.5, n_samples=None, color_adj=None,):
+        """Run one of the four basic flows (text/image in, text/image out).
+
+        Args:
+            xtype: Output type, ``'image'`` or ``'text'``.
+            cin: Context input: a prompt string for text contexts, or an
+                image accepted by ``regularize_image`` for vision contexts.
+            ctype: Context type: ``'prompt'`` / ``'text'`` or
+                ``'vision'`` / ``'image'``.
+            scale: Unconditional guidance scale. When it equals 1.0 no
+                unconditional context is encoded.
+            n_samples: Number of samples; defaults to ``n_sample_image`` or
+                ``n_sample_text`` depending on ``xtype``.
+            color_adj: Colour-calibration mode forwarded to ``decode`` for
+                image outputs.
+
+        Returns:
+            The result of ``decode`` for the sampled latents.
+
+        Raises:
+            ValueError: If ``xtype`` or ``ctype`` is not a supported value.
+                These cases used to surface as an UnboundLocalError, a
+                TypeError or a silent ``None``.
+        """
+        if xtype not in ('image', 'text'):
+            raise ValueError('Unknown xtype: {!r}'.format(xtype))
+        if ctype not in ('prompt', 'text', 'vision', 'image'):
+            raise ValueError('Unknown ctype: {!r}'.format(ctype))
+        net = self.net
+        sampler = self.sampler
+        ddim_steps = 50
+        ddim_eta = 0.0
+        if n_samples is None:
+            n_samples = n_sample_image if xtype == 'image' else n_sample_text
+        if ctype in ('prompt', 'text'):
+            c = net.clip_encode_text(n_samples * [cin])
+            u = None
+            if scale != 1.0:
+                u = net.clip_encode_text(n_samples * [""])
+        else:
+            cin = self.regularize_image(cin)
+            # Rescale from [0, 1] to [-1, 1] and tile over the batch.
+            ctemp = (cin*2 - 1)[None].repeat(n_samples, 1, 1, 1)
+            c = net.clip_encode_vision(ctemp)
+            u = None
+            if scale != 1.0:
+                u = net.clip_encode_vision(torch.zeros_like(ctemp))
+        if xtype == 'image':
+            h, w = [512, 512]
+            shape = [n_samples, 4, h//8, w//8]
+        else:
+            shape = [n_samples, 768]
+        # Both output types share one sampler call; only the latent shape
+        # differs.
+        z, _ = sampler.sample(
+            steps=ddim_steps,
+            shape=shape,
+            conditioning=c,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=u,
+            xtype=xtype, ctype=ctype,
+            eta=ddim_eta,
+            verbose=False,)
+        if xtype == 'image':
+            return self.decode(z, xtype, ctype, color_adj=color_adj, color_adj_to=cin)
+        return self.decode(z, xtype, ctype)
+    def application_disensemble(self, cin, n_samples=None, level=0, color_adj=None,):
+        """Generate image variations with the vision context edited by level.
+
+        Args:
+            cin: Reference image (anything ``regularize_image`` accepts).
+            n_samples: Number of images; defaults to ``n_sample_image``.
+            level: 0 leaves the context untouched. -1 / -2 call
+                ``remove_low_rank`` (q=50, q_remove=1 / 2) and 1 / 2 call
+                ``find_low_rank`` (q=10 / 2) on every context token except
+                the first. Other non-zero values split and re-join the
+                context without editing it.
+            color_adj: Colour-calibration mode forwarded to ``decode``.
+
+        Returns:
+            Whatever ``decode`` returns for ``xtype='image'``.
+        """
+        model = self.net
+        guidance = 7.5
+        ddim = self.sampler
+        if n_samples is None:
+            n_samples = n_sample_image
+        cin = self.regularize_image(cin)
+        batch = (cin*2 - 1)[None].repeat(n_samples, 1, 1, 1)
+        cond = model.clip_encode_vision(batch)
+        uncond = None
+        if guidance != 1.0:
+            uncond = model.clip_encode_vision(torch.zeros_like(batch))
+        if not (level == 0):
+            # (level, editing method, extra keyword arguments)
+            edits = (
+                (-1, self.remove_low_rank, dict(q=50, q_remove=1)),
+                (-2, self.remove_low_rank, dict(q=50, q_remove=2)),
+                (1, self.find_low_rank, dict(q=10)),
+                (2, self.find_low_rank, dict(q=2)),
+            )
+            # Token 0 is kept as-is; only the remaining tokens are edited.
+            cond_first, cond_rest = cond[:, 0:1], cond[:, 1: ]
+            uncond_first, uncond_rest = uncond[:, 0:1], uncond[:, 1: ]
+            for candidate, edit, extra in edits:
+                if level == candidate:
+                    cond_rest = edit(cond_rest, demean=True, **extra)
+                    uncond_rest = edit(uncond_rest, demean=True, **extra)
+                    break
+            cond = torch.cat([cond_first, cond_rest], dim=1)
+            uncond = torch.cat([uncond_first, uncond_rest], dim=1)
+        side = 512 // 8
+        latent, _ = ddim.sample(
+            steps=50,
+            shape=[n_samples, 4, side, side],
+            conditioning=cond,
+            unconditional_guidance_scale=guidance,
+            unconditional_conditioning=uncond,
+            xtype='image', ctype='vision',
+            eta=0.0,
+            verbose=False,)
+        return self.decode(latent, 'image', 'vision', color_adj=color_adj, color_adj_to=cin)
+    def find_low_rank(self, x, demean=True, q=20, niter=10):
+        """Return a rank-``q`` reconstruction of every matrix in batch ``x``.
+
+        Args:
+            x: 3-D tensor; ``torch.pca_lowrank`` is applied to each matrix
+                along the first dimension.
+            demean: When true, the mean over the last dimension is
+                subtracted before the decomposition and added back after.
+            q: Number of components requested from ``torch.pca_lowrank``.
+            niter: Iteration count forwarded to ``torch.pca_lowrank``.
+
+        Returns:
+            A tensor shaped like ``x`` built as ``U @ diag(S) @ V^T``.
+        """
+        offset = x.mean(-1, keepdim=True) if demean else None
+        centered = x - offset if demean else x
+        left, sing, right = torch.pca_lowrank(
+            centered, q=q, center=False, niter=niter)
+        approx = torch.bmm(
+            torch.bmm(left, torch.diag_embed(sing)), right.transpose(1, 2))
+        if demean:
+            approx = approx + offset
+        return approx
+    def remove_low_rank(self, x, demean=True, q=20, niter=10, q_remove=10):
+        """Reconstruct ``x`` from ``q`` components minus the leading ones.
+
+        Args:
+            x: 3-D tensor; ``torch.pca_lowrank`` is applied to each matrix
+                along the first dimension.
+            demean: When true, the mean over the last dimension is
+                subtracted before the decomposition and added back after.
+            q: Number of components requested from ``torch.pca_lowrank``.
+            niter: Iteration count forwarded to ``torch.pca_lowrank``.
+            q_remove: Number of leading singular values set to zero before
+                the reconstruction.
+
+        Returns:
+            A tensor shaped like ``x`` built as ``U @ diag(S') @ V^T``,
+            where ``S'`` is ``S`` with its first ``q_remove`` entries zeroed.
+        """
+        offset = x.mean(-1, keepdim=True) if demean else None
+        centered = x - offset if demean else x
+        left, sing, right = torch.pca_lowrank(
+            centered, q=q, center=False, niter=niter)
+        # Zero the leading singular values in place.
+        sing[:, :q_remove].zero_()
+        rebuilt = torch.bmm(
+            torch.bmm(left, torch.diag_embed(sing)), right.transpose(1, 2))
+        if demean:
+            rebuilt = rebuilt + offset
+        return rebuilt
+    def application_dualguided(self, cim, ctx, n_samples=None, mixing=0.5, color_adj=None, ):
+        """Generate images guided jointly by a reference image and a prompt.
+
+        Args:
+            cim: Reference image (anything ``regularize_image`` accepts).
+            ctx: Text prompt.
+            n_samples: Number of images; defaults to ``n_sample_image``.
+            mixing: Forwarded to the sampler as ``mixed_ratio = 1 - mixing``.
+            color_adj: Colour-calibration mode forwarded to ``decode``.
+
+        Returns:
+            Whatever ``decode`` returns for ``xtype='image'``.
+        """
+        model = self.net
+        guidance = 7.5
+        dc_sampler = DDIMSampler_VD_DualContext(model)
+        if n_samples is None:
+            n_samples = n_sample_image
+        reference = self.regularize_image(cim)
+        image_batch = (reference*2 - 1)[None].repeat(n_samples, 1, 1, 1)
+        use_guidance = (guidance != 1.0)
+        # Vision context first, then text context (same call order as the
+        # encoders were invoked originally).
+        image_cond = model.clip_encode_vision(image_batch)
+        image_uncond = None
+        if use_guidance:
+            image_uncond = model.clip_encode_vision(torch.zeros_like(image_batch))
+        text_cond = model.clip_encode_text(n_samples * [ctx])
+        text_uncond = None
+        if use_guidance:
+            text_uncond = model.clip_encode_text(n_samples * [""])
+        side = 512 // 8
+        latent, _ = dc_sampler.sample_dc(
+            steps=50,
+            shape=[n_samples, 4, side, side],
+            first_conditioning=[image_uncond, image_cond],
+            second_conditioning=[text_uncond, text_cond],
+            unconditional_guidance_scale=guidance,
+            xtype='image',
+            first_ctype='vision',
+            second_ctype='prompt',
+            eta=0.0,
+            verbose=False,
+            mixed_ratio=(1-mixing), )
+        return self.decode(latent, 'image', 'vision', color_adj=color_adj, color_adj_to=reference)
+    def application_i2t2i(self, cim, ctx_n, ctx_p, n_samples=None, color_adj=None,):
+        """Edit an image via image -> text latent -> edited text -> image.
+
+        Steps, in order:
+          1. Sample text latents conditioned on the image's vision context.
+          2. Remove the component of those latents along the latent of
+             ``ctx_n`` and add the latent of ``ctx_p``.
+          3. Decode the edited latents to text, encode that text and
+             ``ctx_p`` as text context, and sample images with the
+             dual-context sampler.
+
+        Args:
+            cim: Reference image (anything ``regularize_image`` accepts).
+            ctx_n: Prompt whose direction is removed from the text latent.
+            ctx_p: Prompt whose latent is added to the text latent.
+            n_samples: Number of images; defaults to ``n_sample_image``.
+            color_adj: Colour-calibration mode forwarded to ``decode``.
+
+        Returns:
+            Whatever ``decode`` returns for ``xtype='image'``.
+        """
+        net = self.net
+        scale = 7.5
+        sampler = DDIMSampler_VD_DualContext(net)
+        ddim_steps = 50
+        ddim_eta = 0.0
+        prompt_temperature = 1.0
+        n_samples = n_sample_image if n_samples is None else n_samples
+        ctemp0 = self.regularize_image(cim)
+        # Rescale the reference from [0, 1] to [-1, 1] and tile it over the batch.
+        ctemp1 = ctemp0*2 - 1
+        ctemp1 = ctemp1[None].repeat(n_samples, 1, 1, 1)
+        cim = net.clip_encode_vision(ctemp1)
+        uim = None
+        if scale != 1.0:
+            # An all-zero image serves as the unconditional vision context.
+            dummy = torch.zeros_like(ctemp1)
+            uim = net.clip_encode_vision(dummy)
+        # Stage 1: image -> text latent.
+        n = 768
+        shape = [n_samples, n]
+        zt, _ = sampler.sample(
+            steps=ddim_steps,
+            shape=shape,
+            conditioning=cim,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=uim,
+            xtype='text', ctype='vision',
+            eta=ddim_eta,
+            verbose=False,)
+        # Stage 2: latent editing. Project zt onto the unit vector of the
+        # "remove" prompt, subtract that projection, then add the "add"
+        # prompt's latent.
+        ztn = net.optimus_encode([ctx_n])
+        ztp = net.optimus_encode([ctx_p])
+        ztn_norm = ztn / ztn.norm(dim=1)
+        zt_proj_mag = torch.matmul(zt, ztn_norm[0])
+        zt_perp = zt - zt_proj_mag[:, None] * ztn_norm
+        zt_newd = zt_perp + ztp
+        # Stage 3: decode the edited latents to sentences and build the text
+        # context by concatenating their encoding with ctx_p's along dim 1.
+        ctx_new = net.optimus_decode(zt_newd, temperature=prompt_temperature)
+        ctx_new = net.clip_encode_text(ctx_new)
+        ctx_p = net.clip_encode_text([ctx_p])
+        ctx_new = torch.cat([ctx_new, ctx_p.repeat(n_samples, 1, 1)], dim=1)
+        # The unconditional text context is doubled along dim 1 to match.
+        utx_new = net.clip_encode_text(n_samples * [""])
+        utx_new = torch.cat([utx_new, utx_new], dim=1)
+        # The vision context drops token 0 and keeps a q=10 low-rank
+        # reconstruction of the rest. NOTE(review): token 0 is presumably a
+        # global token (application_disensemble names it *_glb) - confirm.
+        cim_loc = cim[:, 1: ]
+        cim_loc_new = self.find_low_rank(cim_loc, demean=True, q=10)
+        cim_new = cim_loc_new
+        # NOTE(review): uim is None only if scale == 1.0, which cannot happen
+        # with the hard-coded scale above.
+        uim_new = uim[:, 1:]
+        h, w = [512, 512]
+        shape = [n_samples, 4, h//8, w//8]
+        z, _ = sampler.sample_dc(
+            steps=ddim_steps,
+            shape=shape,
+            first_conditioning=[uim_new, cim_new],
+            second_conditioning=[utx_new, ctx_new],
+            unconditional_guidance_scale=scale,
+            xtype='image',
+            first_ctype='vision',
+            second_ctype='prompt',
+            eta=ddim_eta,
+            verbose=False,
+            mixed_ratio=0.33, )
+        x = self.decode(z, 'image', 'vision', color_adj=color_adj, color_adj_to=ctemp0)
+        return x
+# Build the model once at import time. NOTE: this rebinds the name
+# `vd_inference` from the class to its single instance, so the class is no
+# longer reachable under that name after this line.
+vd_inference = vd_inference('official')
+def main(mode,
+         image=None,
+         prompt=None,
+         nprompt=None,
+         pprompt=None,
+         color_adj=None,
+         disentanglement_level=None,
+         dual_guided_mixing=None,
+         seed=0,):
+    """Dispatch one UI request to the matching ``vd_inference`` call.
+
+    Args:
+        mode: One of 'Text-to-Image', 'Image-Variation', 'Image-to-Text',
+            'Text-Variation', 'Disentanglement', 'Dual-Guided',
+            'Latent-I2T2I'.
+        image: Reference image for the image-conditioned modes.
+        prompt: Text prompt for the text-conditioned modes.
+        nprompt: Prompt to remove (Latent-I2T2I).
+        pprompt: Prompt to add (Latent-I2T2I).
+        color_adj: Colour-calibration mode.
+        disentanglement_level: Level for the Disentanglement mode.
+        dual_guided_mixing: Mixing value for the Dual-Guided mode.
+        seed: Random seed; negative values are treated as 0.
+
+    Returns:
+        ``(images, text)``. Exactly one element is filled for a successful
+        run; both are ``None`` when a required input is missing or empty.
+
+    Raises:
+        ValueError: If ``mode`` is not a known mode (was ``assert False``,
+            which is stripped under ``python -O``).
+    """
+    if seed<0:
+        seed = 0
+    np.random.seed(seed)
+    torch.manual_seed(seed+100)
+
+    def is_blank(text):
+        # A missing and an empty textbox are treated the same way.
+        return (text is None) or (text == "")
+
+    if mode == 'Text-to-Image':
+        if is_blank(prompt):
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.inference(
+                xtype = 'image',
+                cin = prompt,
+                ctype = 'prompt', )
+        return rv, None
+    elif mode == 'Image-Variation':
+        if image is None:
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.inference(
+                xtype = 'image',
+                cin = image,
+                ctype = 'vision',
+                color_adj = color_adj,)
+        return rv, None
+    elif mode == 'Image-to-Text':
+        if image is None:
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.inference(
+                xtype = 'text',
+                cin = image,
+                ctype = 'vision',)
+        return None, '\n'.join(rv)
+    elif mode == 'Text-Variation':
+        # An empty prompt is now rejected here as well, consistent with
+        # Text-to-Image; previously only None was rejected.
+        if is_blank(prompt):
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.inference(
+                xtype = 'text',
+                cin = prompt,
+                ctype = 'prompt',)
+        return None, '\n'.join(rv)
+    elif mode == 'Disentanglement':
+        if image is None:
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.application_disensemble(
+                cin = image,
+                level = disentanglement_level,
+                color_adj = color_adj,)
+        return rv, None
+    elif mode == 'Dual-Guided':
+        if (image is None) or is_blank(prompt):
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.application_dualguided(
+                cim = image,
+                ctx = prompt,
+                mixing = dual_guided_mixing,
+                color_adj = color_adj,)
+        return rv, None
+    elif mode == 'Latent-I2T2I':
+        if (image is None) or is_blank(nprompt) or is_blank(pprompt):
+            return None, None
+        with torch.no_grad():
+            rv = vd_inference.application_i2t2i(
+                cim = image,
+                ctx_n = nprompt,
+                ctx_p = pprompt,
+                color_adj = color_adj,)
+        return rv, None
+    else:
+        raise ValueError("No such mode!")
+def get_instruction(mode):
+    """Return the help text shown in the 'Info' box for ``mode``.
+
+    The lines of each entry are joined with newlines. Returns ``None`` for
+    an unknown mode. The wording is user-facing and is kept verbatim.
+    """
+    color_note = "Color Calibration provide an opinion to adjust image color according to reference image."
+    # (mode name, help lines) in the same order as the mode selector.
+    catalogue = (
+        ("Text-to-Image", [
+            "Generate image from text prompt."]),
+        ("Image-Variation", [
+            "Generate image conditioned on reference image.",
+            color_note]),
+        ("Image-to-Text", [
+            "Generate text from reference image."]),
+        ("Text-Variation", [
+            "Generate text from reference text prompt. (Model insufficiently trained, thus results are still experimental)"]),
+        ("Disentanglement", [
+            "Generate a variation of reference image that disentangled for semantic or style.",
+            color_note,
+            "Disentanglement level controls the level of focus towards semantic (-2, -1) or style (1 2). Level 0 serves as Image-Variation."]),
+        ("Dual-Guided", [
+            "Generate image from dual guidance of reference image and text prompt.",
+            color_note,
+            "Guidance Mixing provides linear balances between image and text context. (0 towards image, 1 towards text)"]),
+        ("Latent-I2T2I", [
+            "Generate image variations via image-to-text, text-latent-editing, and then text-to-image. (Still under exploration)",
+            color_note,
+            "Input prompt that will be substract from text/text latent code.",
+            "Input prompt that will be added to text/text latent code."]),
+    )
+    for name, lines in catalogue:
+        if mode == name:
+            return '\n'.join(lines)
+    return None
+#############
+# Interface #
+#############
+if True:
+    # The two output widgets are created before the Blocks context and are
+    # placed into the layout later through their .render() calls.
+    img_output = gr.Gallery(label="Image Result").style(grid=n_sample_image)
+    # Hidden initially; the mode handlers further down toggle which of the
+    # two outputs is visible.
+    txt_output = gr.Textbox(lines=4, label='Text Result', visible=False)
+    with gr.Blocks() as demo:
+        gr.HTML(
+            """
+            <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+            <h1 style="font-weight: 900; font-size: 3rem;">
+                Versatile Diffusion
+            </h1>
+            <br>
+            <h2 style="font-weight: 450; font-size: 1rem;">
+            We built <b>Versatile Diffusion (VD), the first unified multi-flow multimodal diffusion framework</b>, as a step towards <b>Universal Generative AI</b>.
+            VD can natively support image-to-text, image-variation, text-to-image, and text-variation,
+            and can be further extended to other applications such as
+            semantic-style disentanglement, image-text dual-guided generation, latent image-to-text-to-image editing, and more.
+            Future versions will support more modalities such as speech, music, video and 3D.
+            </h2>
+            <br>
+            <h3>Xingqian Xu, Atlas Wang, Eric Zhang, Kai Wang,
+            and <a href="https://www.humphreyshi.com/home">Humphrey Shi</a>
+            [<a href="https://arxiv.org/abs/2211.08332" style="color:blue;">arXiv</a>]
+            [<a href="https://github.com/SHI-Labs/Versatile-Diffusion" style="color:blue;">GitHub</a>]
+            </h3>
+            </div>
+            """)
+        # Mode selector; the initial value matches the widgets that start
+        # out visible below (text input only).
+        mode_input = gr.Radio([
+            "Text-to-Image", "Image-Variation", "Image-to-Text", "Text-Variation",
+            "Disentanglement", "Dual-Guided", "Latent-I2T2I"], value='Text-to-Image', label="VD Flows and Applications")
+        instruction = gr.Textbox(get_instruction("Text-to-Image"), label='Info')
+        with gr.Row():
+            with gr.Column():
+                # Left column: every possible input. Widgets that do not
+                # apply to the initial mode are created with visible=False.
+                img_input = gr.Image(label='Image Input', visible=False)
+                txt_input = gr.Textbox(lines=4, placeholder="Input prompt...", label='Text Input')
+                ntxt_input = gr.Textbox(label='Remove Prompt', visible=False)
+                ptxt_input = gr.Textbox(label='Add Prompt', visible=False)
+                coladj_input = gr.Radio(["None", "Simple"], value='Simple', label="Color Calibration", visible=False)
+                dislvl_input = gr.Slider(-2, 2, value=0, step=1, label="Disentanglement level", visible=False)
+                dguide_input = gr.Slider(0, 1, value=0.5, step=0.01, label="Guidance Mixing", visible=False)
+                seed_input = gr.Number(100, label="Seed", precision=0)
+                btn = gr.Button("Run")
+                # The input order below must match main()'s positional
+                # parameter order.
+                btn.click(
+                    main,
+                    inputs=[
+                        mode_input,
+                        img_input,
+                        txt_input,
+                        ntxt_input,
+                        ptxt_input,
+                        coladj_input,
+                        dislvl_input,
+                        dguide_input,
+                        seed_input, ],
+                    outputs=[img_output, txt_output])
+            with gr.Column():
+                # Right column: the pre-built output widgets.
+                img_output.render()
+                txt_output.render()
+        example_mode = [
+            "Text-to-Image",
+            "Image-Variation",
+            "Image-to-Text",
+            "Text-Variation",
+            "Disentanglement",
+            "Dual-Guided",
+            "Latent-I2T2I"]
+        def get_example(mode):
+            """Return the example rows for ``mode``, each prefixed with it.
+
+            Every row lists the values for the widgets returned by
+            ``get_example_iof(mode)``; the last value is the seed.
+
+            Raises:
+                ValueError: If ``mode`` has no examples.
+            """
+            village = 'a dream of a village in china, by Caspar David Friedrich, matte painting trending on artstation HQ'
+            nebula = 'a beautiful grand nebula in the universe'
+            penguin = 'heavy arms gundam penguin mech'
+            catalogue = (
+                ('Text-to-Image', [
+                    [village, 23],
+                    [nebula, 24],
+                    [penguin, 25]]),
+                ("Image-Variation", [
+                    ['assets/space.jpg', 'None', 26],
+                    ['assets/train.jpg', 'Simple', 27]]),
+                ("Image-to-Text", [
+                    ['assets/boy_and_girl.jpg', 28],
+                    ['assets/house_by_lake.jpg', 29]]),
+                ("Text-Variation", [
+                    [village, 32],
+                    [nebula, 33],
+                    [penguin, 34]]),
+                ("Disentanglement", [
+                    ['assets/vermeer.jpg', 'Simple', -2, 30],
+                    ['assets/matisse.jpg', 'Simple', 2, 31]]),
+                ("Dual-Guided", [
+                    ['assets/benz.jpg', 'cyberpunk 2077', 'Simple', 0.75, 22],
+                    ['assets/vermeer.jpg', 'a girl with a diamond necklace', 'Simple', 0.66, 21]]),
+                ("Latent-I2T2I", [
+                    ['assets/ghibli.jpg', 'white house', 'tall castle', 'Simple', 20],
+                    ['assets/matisse.jpg', 'fruits and bottles on the table', 'flowers on the table', 'Simple', 21]]),
+            )
+            for name, rows in catalogue:
+                if mode == name:
+                    return [[mode] + row for row in rows]
+            raise ValueError
+        def get_example_iof(mode):
+            """Return ``(inputs, outputs, fn)`` wiring for one example panel.
+
+            ``inputs`` always starts with ``mode_input`` followed by the
+            widgets that ``mode`` uses; ``fn`` receives their values
+            positionally, calls ``main`` and returns only the element of
+            its result that belongs to the single output widget.
+
+            Raises:
+                ValueError: If ``mode`` is unknown.
+            """
+            if mode == 'Text-to-Image':
+                widgets, targets = [txt_input, seed_input], [img_output]
+                run = lambda m, text, seed: \
+                    main(mode=m, prompt=text, seed=seed)[0]
+            elif mode == "Image-Variation":
+                widgets, targets = [img_input, coladj_input, seed_input], [img_output]
+                run = lambda m, img, adj, seed: \
+                    main(mode=m, image=img, color_adj=adj, seed=seed)[0]
+            elif mode == "Image-to-Text":
+                widgets, targets = [img_input, seed_input], [txt_output]
+                run = lambda m, img, seed: \
+                    main(mode=m, image=img, seed=seed)[1]
+            elif mode == "Text-Variation":
+                widgets, targets = [txt_input, seed_input], [txt_output]
+                run = lambda m, text, seed: \
+                    main(mode=m, prompt=text, seed=seed)[1]
+            elif mode == "Disentanglement":
+                widgets = [img_input, coladj_input, dislvl_input, seed_input]
+                targets = [img_output]
+                run = lambda m, img, adj, lvl, seed: \
+                    main(mode=m, image=img, color_adj=adj, disentanglement_level=lvl, seed=seed)[0]
+            elif mode == "Dual-Guided":
+                widgets = [img_input, txt_input, coladj_input, dguide_input, seed_input]
+                targets = [img_output]
+                run = lambda m, img, text, adj, mix, seed: \
+                    main(mode=m, image=img, prompt=text, color_adj=adj, dual_guided_mixing=mix, seed=seed)[0]
+            elif mode == "Latent-I2T2I":
+                widgets = [img_input, ntxt_input, ptxt_input, coladj_input, seed_input]
+                targets = [img_output]
+                run = lambda m, img, remove, add, adj, seed: \
+                    main(mode=m, image=img, nprompt=remove, pprompt=add, color_adj=adj, seed=seed)[0]
+            else:
+                raise ValueError
+            return [mode_input]+widgets, targets, run
+        # Two rows of example panels: the four basic flows first, then the
+        # three applications. The previous code duplicated this loop,
+        # called get_example_iof() three times per mode and, because of a
+        # trailing comma, wrapped each gr.Examples in a throwaway tuple.
+        for row_modes in (example_mode[0:4], example_mode[4:7]):
+            with gr.Row():
+                for emode in row_modes:
+                    ex_inputs, ex_outputs, ex_fn = get_example_iof(emode)
+                    with gr.Column():
+                        gr.Examples(
+                            label=emode+' Examples',
+                            examples=get_example(emode),
+                            inputs=ex_inputs,
+                            outputs=ex_outputs,
+                            fn=ex_fn,
+                            cache_examples=cache_examples)
+        # Refresh the instruction text whenever the selected flow changes.
+        mode_input.change(
+            fn=lambda selected: gr.update(value=get_instruction(selected)),
+            inputs=mode_input,
+            outputs=instruction,)
+        def _visibility_handler(modes, shown_when_listed=True):
+            """Return a handler that shows a widget iff the selected mode's
+            membership in ``modes`` equals ``shown_when_listed``."""
+            return lambda selected: gr.update(
+                visible=((selected in modes) == shown_when_listed))
+        # One change handler per widget, registered in this order.
+        for component, handler in [
+                (img_input, _visibility_handler(
+                    ['Text-to-Image', 'Text-Variation'], False)),
+                (txt_input, _visibility_handler(
+                    ['Text-to-Image', 'Text-Variation', 'Dual-Guided'])),
+                (ntxt_input, _visibility_handler(['Latent-I2T2I'])),
+                (ptxt_input, _visibility_handler(['Latent-I2T2I'])),
+                (coladj_input, _visibility_handler(
+                    ['Text-to-Image', 'Image-to-Text', 'Text-Variation'], False)),
+                (dislvl_input, _visibility_handler(['Disentanglement'])),
+                (dguide_input, _visibility_handler(['Dual-Guided'])),
+                (img_output, _visibility_handler(
+                    ['Image-to-Text', 'Text-Variation'], False)),
+                (txt_output, _visibility_handler(
+                    ['Image-to-Text', 'Text-Variation'])),
+        ]:
+            mode_input.change(
+                fn=handler,
+                inputs=mode_input,
+                outputs=component,)
+        gr.HTML(
+            """
+            <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+            <h3>
+            <b>Caution</b>:
+            We would like the raise the awareness of users of this demo of its potential issues and concerns.
+            Like previous large foundation models, Versatile Diffusion could be problematic in some cases, partially due to the imperfect training data and pretrained network (VAEs / context encoders) with limited scope.
+            In its future research phase, VD may do better on tasks such as text-to-image, image-to-text, etc., with the help of more powerful VAEs, more sophisticated network designs, and more cleaned data.
+            So far, we keep all features available for research testing both to show the great potential of the VD framework and to collect important feedback to improve the model in the future.
+            We welcome researchers and users to report issues with the HuggingFace community discussion feature or email the authors.
+            </h3>
+            <br>
+            <h3>
+            <b>Biases and content acknowledgement</b>:
+            Beware that VD may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography, and violence.
+            VD was trained on the LAION-2B dataset, which scraped non-curated online images and text, and may contained unintended exceptions as we removed illegal content.
+            VD in this demo is meant only for research purposes.
+            </h3>
+            </div>
+            """)
+    # demo.launch(share=True)
+    demo.launch(debug=True)

assets/benz.jpg ADDED Viewed

Git LFS Details

SHA256: bdfdfb603af2179878013b08500fdc78c5f20d70efd581f2ebfed1b65321f9a2
Pointer size: 131 Bytes
Size of remote file: 204 kB

assets/boy_and_girl.jpg ADDED Viewed

Git LFS Details

SHA256: aba3f4834a4f82fb65ff8e6c5e5a1b60d248d2e83d97321b98a0d24ba999390c
Pointer size: 131 Bytes
Size of remote file: 139 kB

assets/church.jpg ADDED Viewed

Git LFS Details

SHA256: ec3be4a83b1ceb43cfee1c5bd125f564e0b42a71c440e731fa9cecc2b761263d
Pointer size: 131 Bytes
Size of remote file: 338 kB

assets/firework.jpg ADDED Viewed

Git LFS Details

SHA256: 6040aeca347b2896de63b3bf9145e307ad06fa4ab0435609e1d7df5587c29bd6
Pointer size: 131 Bytes
Size of remote file: 279 kB

assets/ghibli.jpg ADDED Viewed

Git LFS Details

SHA256: 153e34326ce625f2a6c41d6922549ad690b63d8e18de43532e7fd9808cb9de8b
Pointer size: 131 Bytes
Size of remote file: 145 kB

assets/horse.png ADDED Viewed

Git LFS Details

SHA256: 27c5ba007e2984f2e8128df6418c780fefcbd940025ccece14a5a13894065457
Pointer size: 131 Bytes
Size of remote file: 395 kB

assets/house_by_lake.jpg ADDED Viewed

Git LFS Details

SHA256: 3d3dcc9f8d8eb90b69fb0a17967440b960bb1d545bcf85de37c4d08f9e5d4606
Pointer size: 131 Bytes
Size of remote file: 189 kB

assets/matisse.jpg ADDED Viewed

Git LFS Details

SHA256: ea0428092cbca5224b72a3665c140e96142b5df9c78b36b66f910f42093ecd4f
Pointer size: 131 Bytes
Size of remote file: 271 kB

assets/night_light.jpg ADDED Viewed

Git LFS Details

SHA256: 5103ce525e00f0f8ff3c83bcbf954ebda5deb869377a8ccd2b5b6362f7b0aa4a
Pointer size: 131 Bytes
Size of remote file: 213 kB

assets/penguin.png ADDED Viewed

Git LFS Details

SHA256: e22e87eec01455b342a849d868ea6cf893b8ae7d81da54eba2f620dbccf972ac
Pointer size: 131 Bytes
Size of remote file: 147 kB

assets/san_diego.jpg ADDED Viewed

Git LFS Details

SHA256: 491e93d1b0e99ae2223d85beb4cc98aa790c377b51a938721c3d0c62645a81e4
Pointer size: 131 Bytes
Size of remote file: 235 kB

assets/scream.jpg ADDED Viewed

Git LFS Details

SHA256: e154dbe35cb10c4f65c022e97ba9d6ccd7f6ebf3285ff56ebcd9c43b0246e309
Pointer size: 131 Bytes
Size of remote file: 246 kB

assets/space.jpg ADDED Viewed

Git LFS Details

SHA256: 7cb01b250297f088ecb1310d746a18b141ab26cd6f3f46e41c52285bb4c7e3d4
Pointer size: 131 Bytes
Size of remote file: 236 kB

assets/tiger.jpg ADDED Viewed

Git LFS Details

SHA256: a4b58a11be073fad21a218bcf6478a4da61532b8520176a6454542eb9368081d
Pointer size: 131 Bytes
Size of remote file: 272 kB

assets/train.jpg ADDED Viewed

Git LFS Details

SHA256: 50b45524dc627d0042ed789e4495250e71aae7f2935b9a0879d09cf73d8aff37
Pointer size: 131 Bytes
Size of remote file: 310 kB

assets/vermeer.jpg ADDED Viewed

Git LFS Details

SHA256: d884ddfe302572f6c4eb8607942cc59807b3dc8321f27ff9358b8e5a3657d015
Pointer size: 130 Bytes
Size of remote file: 65.5 kB

configs/model/clip.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+clip:
+  symbol: clip
+  args: {}
+clip_frozen:
+  super_cfg: clip
+  type: clip_frozen
+  args: {}
+clip_text_frozen:
+  super_cfg: clip
+  type: clip_text_frozen
+  args: {}
+clip_vision_frozen:
+  super_cfg: clip
+  type: clip_vision_frozen
+  args: {}
+############################
+# clip with focused encode #
+############################
+clip_frozen_encode_text:
+  super_cfg: clip
+  type: clip_frozen
+  args:
+    encode_type : encode_text
+clip_frozen_encode_vision:
+  super_cfg: clip
+  type: clip_frozen
+  args:
+    encode_type : encode_vision
+clip_frozen_encode_text_noproj:
+  super_cfg: clip
+  type: clip_frozen
+  args:
+    encode_type : encode_text_noproj
+#####################################
+# clip vision frozen justin version #
+#####################################
+clip_vision_frozen_justin:
+  super_cfg: clip
+  type: clip_vision_frozen_justin
+  args: {}

configs/model/openai_unet.yaml ADDED Viewed

	@@ -0,0 +1,72 @@

+openai_unet_sd:
+  type: openai_unet
+  args:
+    image_size: null # no use
+    in_channels: 4
+    out_channels: 4
+    model_channels: 320
+    attention_resolutions: [ 4, 2, 1 ]
+    num_res_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    # disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
+    num_heads: 8
+    use_spatial_transformer: True
+    transformer_depth: 1
+    context_dim: 768
+    use_checkpoint: True
+    legacy: False
+openai_unet_dual_context:
+  super_cfg: openai_unet_sd
+  type: openai_unet_dual_context
+########################
+# Code cleaned version #
+########################
+openai_unet_2d:
+  type: openai_unet_2d
+  args:
+    input_channels: 4
+    model_channels: 320
+    output_channels: 4
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+openai_unet_0d:
+  type: openai_unet_0d
+  args:
+    input_channels: 768
+    model_channels: 320
+    output_channels: 768
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+openai_unet_0dmd:
+  type: openai_unet_0dmd
+  args:
+    input_channels: 768
+    model_channels: 320
+    output_channels: 768
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    second_dim: [ 4, 4, 4, 4 ]
+    with_attn: [true, true, true, false]
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+openai_unet_vd:
+  type: openai_unet_vd
+  args:
+    unet_image_cfg: MODEL(openai_unet_2d)
+    unet_test_cfg: MODEL(openai_unet_0dmd)

configs/model/optimus.yaml ADDED Viewed

	@@ -0,0 +1,102 @@

+optimus:
+  symbol: optimus
+  find_unused_parameters: false
+  args: {}
+optimus_bert_encoder:
+  super_cfg: optimus
+  type: optimus_bert_connector
+  # pth: pretrained/optimus_bert_encoder.pth
+  args:
+    config:
+      architectures:
+        - BertForMaskedLM
+      attention_probs_dropout_prob: 0.1
+      finetuning_task: null
+      hidden_act: gelu
+      hidden_dropout_prob: 0.1
+      hidden_size: 768
+      initializer_range: 0.02
+      intermediate_size: 3072
+      layer_norm_eps: 1.e-12
+      max_position_embeddings: 512
+      num_attention_heads: 12
+      num_hidden_layers: 12
+      num_labels: 2
+      output_attentions: false
+      output_hidden_states: false
+      pruned_heads: {}
+      torchscript: false
+      type_vocab_size: 2
+      vocab_size: 28996
+    latent_size: 768
+optimus_bert_tokenizer:
+  super_cfg: optimus
+  type: optimus_bert_tokenizer
+  args:
+    do_lower_case: false
+    max_len: 512
+    vocab_file: lib/model_zoo/optimus_models/vocab/bert-base-cased-vocab.txt
+optimus_gpt2_decoder:
+  super_cfg: optimus
+  type: optimus_gpt2_connector
+  # pth: pretrained/optimus_gpt2_decoder.pth
+  args:
+    config:
+      architectures:
+        - GPT2LMHeadModel
+      attn_pdrop: 0.1
+      embd_pdrop: 0.1
+      finetuning_task: null
+      hidden_size: 768
+      initializer_range: 0.02
+      latent_size: 768
+      layer_norm_epsilon: 1.e-05
+      max_position_embeddings: 1024
+      n_ctx: 1024
+      n_embd: 768
+      n_head: 12
+      n_layer: 12
+      n_positions: 1024
+      num_attention_heads: 12
+      num_hidden_layers: 12
+      num_labels: 1
+      output_attentions: false
+      output_hidden_states: false
+      pretrained_config_archive_map:
+        gpt2        : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json
+        gpt2-medium : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json
+        gpt2-large  : https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json
+      pruned_heads: {}
+      resid_pdrop: 0.1
+      summary_activation: null
+      summary_first_dropout: 0.1
+      summary_proj_to_labels: true
+      summary_type: cls_index
+      summary_use_proj: true
+      torchscript: false
+      vocab_size: 50260
+optimus_gpt2_tokenizer:
+  super_cfg: optimus
+  type: optimus_gpt2_tokenizer
+  args:
+    do_lower_case: false
+    max_len: 1024
+    vocab_file: lib/model_zoo/optimus_models/vocab/gpt2-vocab.json
+    merges_file: lib/model_zoo/optimus_models/vocab/gpt2-merges.txt
+optimus_vae:
+  super_cfg: optimus
+  type: optimus_vae
+  pth: pretrained/optimus-vae.pth
+  args:
+    encoder: MODEL(optimus_bert_encoder)
+    decoder: MODEL(optimus_gpt2_decoder)
+    tokenizer_encoder: MODEL(optimus_bert_tokenizer)
+    tokenizer_decoder: MODEL(optimus_gpt2_tokenizer)
+    args:
+      latent_size: 768

configs/model/sd.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+sd_base:
+  symbol: sd
+  find_unused_parameters: true
+sd_autoencoder:
+  type: autoencoderkl
+  args:
+    embed_dim: 4
+    monitor: val/rec_loss
+    ddconfig:
+      double_z: true
+      z_channels: 4
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [1, 2, 4, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+    lossconfig:
+      target: torch.nn.Identity
+  pth: pretrained/kl-f8.pth
+sd_t2i:
+  super_cfg: sd_base
+  type: sd_t2i
+  args:
+    first_stage_config: MODEL(sd_autoencoder)
+    cond_stage_config: MODEL(clip_text_frozen)
+    unet_config: MODEL(openai_unet_sd)
+    beta_linear_start: 0.00085
+    beta_linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    scale_factor: 0.18215
+    use_ema: true
+sd_t2i_noema:
+  # Inherit from sd_t2i: no entry named `sd` is defined in this file, so
+  # `super_cfg: sd` could not be resolved.
+  super_cfg: sd_t2i
+  args:
+    use_ema: false
+#####################
+# sd with full clip #
+#####################
+sd_t2i_fullclip_backward_compatible:
+  super_cfg: sd_t2i
+  args:
+    cond_stage_config: MODEL(clip_frozen_encode_text_noproj)
+sd_t2i_fullclip_backward_compatible_noema:
+  super_cfg: sd_t2i_noema
+  args:
+    cond_stage_config: MODEL(clip_frozen_encode_text_noproj)
+sd_t2i_fullclip:
+  super_cfg: sd_t2i
+  args:
+    cond_stage_config: MODEL(clip_frozen_encode_text)
+sd_variation:
+  super_cfg: sd_t2i
+  type: sd_variation
+  args:
+    cond_stage_config: MODEL(clip_vision_frozen_justin)

configs/model/vd.yaml ADDED Viewed

	@@ -0,0 +1,61 @@

+# vd_base:
+#   symbol: vd
+#   find_unused_parameters: true
+############
+# vd basic #
+############
+vd_basic:
+  super_cfg: sd_t2i
+  type: vd_basic
+  symbol: vd
+  find_unused_parameters: true
+  args:
+    cond_stage_config: MODEL(clip_frozen_encode_vision)
+vd_basic_noema:
+  super_cfg: vd_basic
+  args:
+    use_ema: false
+###################
+# vd dual-context #
+###################
+vd_dc:
+  super_cfg: sd_t2i_fullclip
+  type: vd_dc
+  symbol: vd
+  find_unused_parameters: true
+  args:
+    unet_config: MODEL(openai_unet_dual_context)
+vd_dc_noema:
+  super_cfg: vd_dc
+  args:
+    use_ema: false
+######
+# vd #
+######
+vd:
+  type: vd
+  symbol: vd
+  find_unused_parameters: true
+  args:
+    autokl_cfg: MODEL(sd_autoencoder)
+    optimus_cfg: MODEL(optimus_vae)
+    clip_cfg: MODEL(clip_frozen)
+    unet_config: MODEL(openai_unet_vd)
+    beta_linear_start: 0.00085
+    beta_linear_end: 0.012
+    timesteps: 1000
+    scale_factor: 0.18215
+    use_ema: true
+vd_noema:
+  super_cfg: vd
+  args:
+    use_ema: false

lib/__init__.py ADDED Viewed

File without changes

lib/cfg_helper.py ADDED Viewed

	@@ -0,0 +1,664 @@

+import os
+import os.path as osp
+import shutil
+import copy
+import time
+import pprint
+import numpy as np
+import torch
+import matplotlib
+import argparse
+import json
+import yaml
+from easydict import EasyDict as edict
+from .model_zoo import get_model
+############
+# cfg_bank #
+############
+def cfg_solvef(cmd, root):
+    if not isinstance(cmd, str):
+        return cmd
+    if cmd.find('SAME')==0:
+        zoom = root
+        p = cmd[len('SAME'):].strip('()').split('.')
+        p = [pi.strip() for pi in p]
+        for pi in p:
+            try:
+                pi = int(pi)
+            except:
+                pass
+            try:
+                zoom = zoom[pi]
+            except:
+                return cmd
+        return cfg_solvef(zoom, root)
+    if cmd.find('SEARCH')==0:
+        zoom = root
+        p = cmd[len('SEARCH'):].strip('()').split('.')
+        p = [pi.strip() for pi in p]
+        find = True
+        # Depth first search
+        for pi in p:
+            try:
+                pi = int(pi)
+            except:
+                pass
+            try:
+                zoom = zoom[pi]
+            except:
+                find = False
+                break
+        if find:
+            return cfg_solvef(zoom, root)
+        else:
+            if isinstance(root, dict):
+                for ri in root:
+                    rv = cfg_solvef(cmd, root[ri])
+                    if rv != cmd:
+                        return rv
+            if isinstance(root, list):
+                for ri in root:
+                    rv = cfg_solvef(cmd, ri)
+                    if rv != cmd:
+                        return rv
+            return cmd
+    if cmd.find('MODEL')==0:
+        goto = cmd[len('MODEL'):].strip('()')
+        return model_cfg_bank()(goto)
+    if cmd.find('DATASET')==0:
+        goto = cmd[len('DATASET'):].strip('()')
+        return dataset_cfg_bank()(goto)
+    return cmd
+def cfg_solve(cfg, cfg_root):
+    # The function solve cfg element such that
+    #   all sorrogate input are settled.
+    #   (i.e. SAME(***) )
+    if isinstance(cfg, list):
+        for i in range(len(cfg)):
+            if isinstance(cfg[i], (list, dict)):
+                cfg[i] = cfg_solve(cfg[i], cfg_root)
+            else:
+                cfg[i] = cfg_solvef(cfg[i], cfg_root)
+    if isinstance(cfg, dict):
+        for k in cfg:
+            if isinstance(cfg[k], (list, dict)):
+                cfg[k] = cfg_solve(cfg[k], cfg_root)
+            else:
+                cfg[k] = cfg_solvef(cfg[k], cfg_root)
+    return cfg
+class model_cfg_bank(object):
+    def __init__(self):
+        self.cfg_dir = osp.join('configs', 'model')
+        self.cfg_bank = edict()
+    def __call__(self, name):
+        if name not in self.cfg_bank:
+            cfg_path = self.get_yaml_path(name)
+            with open(cfg_path, 'r') as f:
+                cfg_new = yaml.load(
+                    f, Loader=yaml.FullLoader)
+            cfg_new = edict(cfg_new)
+            self.cfg_bank.update(cfg_new)
+        cfg = self.cfg_bank[name]
+        cfg.name = name
+        if 'super_cfg' not in cfg:
+            cfg = cfg_solve(cfg, cfg)
+            self.cfg_bank[name] = cfg
+            return copy.deepcopy(cfg)
+        super_cfg = self.__call__(cfg.super_cfg)
+        # unlike other field,
+        # args will not be replaced but update.
+        if 'args' in cfg:
+            if 'args' in  super_cfg:
+                super_cfg.args.update(cfg.args)
+            else:
+                super_cfg.args = cfg.args
+            cfg.pop('args')
+        super_cfg.update(cfg)
+        super_cfg.pop('super_cfg')
+        cfg = super_cfg
+        try:
+            delete_args = cfg.pop('delete_args')
+        except:
+            delete_args = []
+        for dargs in delete_args:
+            cfg.args.pop(dargs)
+        cfg = cfg_solve(cfg, cfg)
+        self.cfg_bank[name] = cfg
+        return copy.deepcopy(cfg)
+    def get_yaml_path(self, name):
+        if name.find('ldm')==0:
+            return osp.join(
+                self.cfg_dir, 'ldm.yaml')
+        elif name.find('comodgan')==0:
+            return osp.join(
+                self.cfg_dir, 'comodgan.yaml')
+        elif name.find('stylegan')==0:
+            return osp.join(
+                self.cfg_dir, 'stylegan.yaml')
+        elif name.find('absgan')==0:
+            return osp.join(
+                self.cfg_dir, 'absgan.yaml')
+        elif name.find('ashgan')==0:
+            return osp.join(
+                self.cfg_dir, 'ashgan.yaml')
+        elif name.find('sr3')==0:
+            return osp.join(
+                self.cfg_dir, 'sr3.yaml')
+        elif name.find('specdiffsr')==0:
+            return osp.join(
+                self.cfg_dir, 'specdiffsr.yaml')
+        elif name.find('openai_unet')==0:
+            return osp.join(
+                self.cfg_dir, 'openai_unet.yaml')
+        elif name.find('clip')==0:
+            return osp.join(
+                self.cfg_dir, 'clip.yaml')
+        elif name.find('sd')==0:
+            return osp.join(
+                self.cfg_dir, 'sd.yaml')
+        elif name.find('vd')==0:
+            return osp.join(
+                self.cfg_dir, 'vd.yaml')
+        elif name.find('optimus')==0:
+            return osp.join(
+                self.cfg_dir, 'optimus.yaml')
+        else:
+            raise ValueError
+class dataset_cfg_bank(object):
+    def __init__(self):
+        self.cfg_dir = osp.join('configs', 'dataset')
+        self.cfg_bank = edict()
+    def __call__(self, name):
+        if name not in self.cfg_bank:
+            cfg_path = self.get_yaml_path(name)
+            with open(cfg_path, 'r') as f:
+                cfg_new = yaml.load(
+                    f, Loader=yaml.FullLoader)
+            cfg_new = edict(cfg_new)
+            self.cfg_bank.update(cfg_new)
+        cfg = self.cfg_bank[name]
+        cfg.name = name
+        if cfg.get('super_cfg', None) is None:
+            cfg = cfg_solve(cfg, cfg)
+            self.cfg_bank[name] = cfg
+            return copy.deepcopy(cfg)
+        super_cfg = self.__call__(cfg.super_cfg)
+        super_cfg.update(cfg)
+        cfg = super_cfg
+        cfg.super_cfg = None
+        try:
+            delete = cfg.pop('delete')
+        except:
+            delete = []
+        for dargs in delete:
+            cfg.pop(dargs)
+        cfg = cfg_solve(cfg, cfg)
+        self.cfg_bank[name] = cfg
+        return copy.deepcopy(cfg)
+    def get_yaml_path(self, name):
+        if name.find('cityscapes')==0:
+            return osp.join(
+                self.cfg_dir, 'cityscapes.yaml')
+        elif name.find('div2k')==0:
+            return osp.join(
+                self.cfg_dir, 'div2k.yaml')
+        elif name.find('gandiv2k')==0:
+            return osp.join(
+                self.cfg_dir, 'gandiv2k.yaml')
+        elif name.find('srbenchmark')==0:
+            return osp.join(
+                self.cfg_dir, 'srbenchmark.yaml')
+        elif name.find('imagedir')==0:
+            return osp.join(
+                self.cfg_dir, 'imagedir.yaml')
+        elif name.find('places2')==0:
+            return osp.join(
+                self.cfg_dir, 'places2.yaml')
+        elif name.find('ffhq')==0:
+            return osp.join(
+                self.cfg_dir, 'ffhq.yaml')
+        elif name.find('imcpt')==0:
+            return osp.join(
+                self.cfg_dir, 'imcpt.yaml')
+        elif name.find('texture')==0:
+            return osp.join(
+                self.cfg_dir, 'texture.yaml')
+        elif name.find('openimages')==0:
+            return osp.join(
+                self.cfg_dir, 'openimages.yaml')
+        elif name.find('laion2b')==0:
+            return osp.join(
+                self.cfg_dir, 'laion2b.yaml')
+        elif name.find('laionart')==0:
+            return osp.join(
+                self.cfg_dir, 'laionart.yaml')
+        elif name.find('celeba')==0:
+            return osp.join(
+                self.cfg_dir, 'celeba.yaml')
+        elif name.find('coyo')==0:
+            return osp.join(
+                self.cfg_dir, 'coyo.yaml')
+        elif name.find('pafc')==0:
+            return osp.join(
+                self.cfg_dir, 'pafc.yaml')
+        elif name.find('coco')==0:
+            return osp.join(
+                self.cfg_dir, 'coco.yaml')
+        else:
+            raise ValueError
+class experiment_cfg_bank(object):
+    def __init__(self):
+        self.cfg_dir = osp.join('configs', 'experiment')
+        self.cfg_bank = edict()
+    def __call__(self, name):
+        if name not in self.cfg_bank:
+            cfg_path = self.get_yaml_path(name)
+            with open(cfg_path, 'r') as f:
+                cfg = yaml.load(
+                    f, Loader=yaml.FullLoader)
+            cfg = edict(cfg)
+        cfg = cfg_solve(cfg, cfg)
+        cfg = cfg_solve(cfg, cfg)
+        # twice for SEARCH
+        self.cfg_bank[name] = cfg
+        return copy.deepcopy(cfg)
+    def get_yaml_path(self, name):
+        return osp.join(
+            self.cfg_dir, name+'.yaml')
+def load_cfg_yaml(path):
+    if osp.isfile(path):
+        cfg_path = path
+    elif osp.isfile(osp.join('configs', 'experiment', path)):
+        cfg_path = osp.join('configs', 'experiment', path)
+    elif osp.isfile(osp.join('configs', 'experiment', path+'.yaml')):
+        cfg_path = osp.join('configs', 'experiment', path+'.yaml')
+    else:
+        assert False, 'No such config!'
+    with open(cfg_path, 'r') as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+        cfg = edict(cfg)
+    cfg = cfg_solve(cfg, cfg)
+    cfg = cfg_solve(cfg, cfg)
+    return cfg
+##############
+# cfg_helper #
+##############
+def get_experiment_id(ref=None):
+    if ref is None:
+        time.sleep(0.5)
+        return int(time.time()*100)
+    else:
+        try:
+            return int(ref)
+        except:
+            pass
+        _, ref = osp.split(ref)
+        ref = ref.split('_')[0]
+        try:
+            return int(ref)
+        except:
+            assert False, 'Invalid experiment ID!'
+def record_resume_cfg(path):
+    cnt = 0
+    while True:
+        if osp.exists(path+'.{:04d}'.format(cnt)):
+            cnt += 1
+            continue
+        shutil.copyfile(path, path+'.{:04d}'.format(cnt))
+        break
+def get_command_line_args():
+    """Parse the command line and build the experiment config from it.
+
+    When --resume_dir is given, only a minimal config is returned
+    (env.debug plus env.resume.dir/step/weight); the yaml named by --config
+    is not read in that case. Otherwise the yaml is loaded through
+    load_cfg_yaml and the env / train / eval sections are filled in from
+    the remaining options.
+
+    The run is an evaluation run when --eval is given, a training run
+    otherwise.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument('--config', type=str)
+    parser.add_argument('--gpu', nargs='+', type=int)
+    parser.add_argument('--node_rank', type=int, default=0)
+    parser.add_argument('--nodes', type=int, default=1)
+    parser.add_argument('--addr', type=str, default='127.0.0.1')
+    parser.add_argument('--port', type=int, default=11233)
+    parser.add_argument('--signature', nargs='+', type=str)
+    parser.add_argument('--seed', type=int)
+    parser.add_argument('--eval', type=str)
+    parser.add_argument('--eval_subdir', type=str)
+    parser.add_argument('--pretrained', type=str)
+    parser.add_argument('--resume_dir', type=str)
+    parser.add_argument('--resume_step', type=int)
+    parser.add_argument('--resume_weight', type=str)
+    args = parser.parse_args()
+    # Special handling the resume
+    if args.resume_dir is not None:
+        cfg = edict()
+        cfg.env = edict()
+        cfg.env.debug = args.debug
+        cfg.env.resume = edict()
+        cfg.env.resume.dir = args.resume_dir
+        cfg.env.resume.step = args.resume_step
+        cfg.env.resume.weight = args.resume_weight
+        return cfg
+    cfg = load_cfg_yaml(args.config)
+    cfg.env.debug = args.debug
+    # GPU 0 is used when --gpu is not given
+    cfg.env.gpu_device = [0] if args.gpu is None else list(args.gpu)
+    cfg.env.master_addr = args.addr
+    cfg.env.master_port = args.port
+    cfg.env.dist_url = 'tcp://{}:{}'.format(args.addr, args.port)
+    cfg.env.node_rank = args.node_rank
+    cfg.env.nodes = args.nodes
+    istrain = False if args.eval is not None else True
+    isdebug = cfg.env.debug
+    if istrain:
+        if isdebug:
+            # fixed id and signature for debug runs
+            cfg.env.experiment_id = 999999999999
+            cfg.train.signature = ['debug']
+        else:
+            cfg.env.experiment_id = get_experiment_id()
+            if args.signature is not None:
+                cfg.train.signature = args.signature
+    else:
+        # evaluation only: the train section is dropped from the config
+        if 'train' in cfg:
+            cfg.pop('train')
+        cfg.env.experiment_id = get_experiment_id(args.eval)
+        if args.signature is not None:
+            cfg.eval.signature = args.signature
+        # NOTE(review): this branch is only reached when args.eval is not
+        # None, so the condition below can never be true -- confirm intent.
+        if isdebug and (args.eval is None):
+            cfg.env.experiment_id = 999999999999
+            cfg.eval.signature = ['debug']
+        if args.eval_subdir is not None:
+            if isdebug:
+                cfg.eval.eval_subdir = 'debug'
+            else:
+                cfg.eval.eval_subdir = args.eval_subdir
+        if args.pretrained is not None:
+            cfg.eval.pretrained = args.pretrained
+          # The override pretrained over the setting in cfg.model
+    if args.seed is not None:
+        cfg.env.rnd_seed = args.seed
+    return cfg
+def cfg_initiates(cfg):
+    cfge = cfg.env
+    isdebug = cfge.debug
+    isresume = 'resume' in cfge
+    istrain = 'train' in cfg
+    haseval = 'eval' in cfg
+    cfgt = cfg.train if istrain else None
+    cfgv = cfg.eval if haseval else None
+    ###############################
+    # get some environment params #
+    ###############################
+    cfge.computer = os.uname()
+    cfge.torch_version = str(torch.__version__)
+    ##########
+    # resume #
+    ##########
+    if isresume:
+        resume_cfg_path = osp.join(cfge.resume.dir, 'config.yaml')
+        record_resume_cfg(resume_cfg_path)
+        with open(resume_cfg_path, 'r') as f:
+            cfg_resume = yaml.load(f, Loader=yaml.FullLoader)
+        cfg_resume = edict(cfg_resume)
+        cfg_resume.env.update(cfge)
+        cfg = cfg_resume
+        cfge = cfg.env
+        log_file = cfg.train.log_file
+        print('')
+        print('##########')
+        print('# resume #')
+        print('##########')
+        print('')
+        with open(log_file, 'a') as f:
+            print('', file=f)
+            print('##########', file=f)
+            print('# resume #', file=f)
+            print('##########', file=f)
+            print('', file=f)
+        pprint.pprint(cfg)
+        with open(log_file, 'a') as f:
+            pprint.pprint(cfg, f)
+    ####################
+    # node distributed #
+    ####################
+    if cfg.env.master_addr!='127.0.0.1':
+        os.environ['MASTER_ADDR'] = cfge.master_addr
+        os.environ['MASTER_PORT'] = '{}'.format(cfge.master_port)
+        if cfg.env.dist_backend=='nccl':
+            os.environ['NCCL_SOCKET_FAMILY'] = 'AF_INET'
+        if cfg.env.dist_backend=='gloo':
+            os.environ['GLOO_SOCKET_FAMILY'] = 'AF_INET'
+    #######################
+    # cuda visible device #
+    #######################
+    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
+        [str(gid) for gid in cfge.gpu_device])
+    #####################
+    # return resume cfg #
+    #####################
+    if isresume:
+        return cfg
+    #############################################
+    # some misc setting that not need in resume #
+    #############################################
+    cfgm = cfg.model
+    cfge.gpu_count = len(cfge.gpu_device)
+    ##########################################
+    # align batch size and num worker config #
+    ##########################################
+    gpu_n = cfge.gpu_count * cfge.nodes
+    def align_batch_size(bs, bs_per_gpu):
+        assert (bs is not None) or (bs_per_gpu is not None)
+        bs = bs_per_gpu * gpu_n if bs is None else bs
+        bs_per_gpu = bs // gpu_n if bs_per_gpu is None else bs_per_gpu
+        assert (bs == bs_per_gpu * gpu_n)
+        return bs, bs_per_gpu
+    if istrain:
+        cfgt.batch_size, cfgt.batch_size_per_gpu = \
+            align_batch_size(cfgt.batch_size, cfgt.batch_size_per_gpu)
+        cfgt.dataset_num_workers, cfgt.dataset_num_workers_per_gpu = \
+            align_batch_size(cfgt.dataset_num_workers, cfgt.dataset_num_workers_per_gpu)
+    if haseval:
+        cfgv.batch_size, cfgv.batch_size_per_gpu = \
+            align_batch_size(cfgv.batch_size, cfgv.batch_size_per_gpu)
+        cfgv.dataset_num_workers, cfgv.dataset_num_workers_per_gpu = \
+            align_batch_size(cfgv.dataset_num_workers, cfgv.dataset_num_workers_per_gpu)
+    ##################
+    # create log dir #
+    ##################
+    if istrain:
+        if not isdebug:
+            sig = cfgt.get('signature', [])
+            version = get_model().get_version(cfgm.type)
+            sig = sig + ['v{}'.format(version), 's{}'.format(cfge.rnd_seed)]
+        else:
+            sig = ['debug']
+        log_dir = [
+            cfge.log_root_dir,
+            '{}_{}'.format(cfgm.symbol, cfgt.dataset.symbol),
+            '_'.join([str(cfge.experiment_id)] + sig)
+        ]
+        log_dir = osp.join(*log_dir)
+        log_file = osp.join(log_dir, 'train.log')
+        if not osp.exists(log_file):
+            os.makedirs(osp.dirname(log_file))
+        cfgt.log_dir = log_dir
+        cfgt.log_file = log_file
+        if haseval:
+            cfgv.log_dir = log_dir
+            cfgv.log_file = log_file
+    else:
+        model_symbol = cfgm.symbol
+        if cfgv.get('dataset', None) is None:
+            dataset_symbol = 'nodataset'
+        else:
+            dataset_symbol = cfgv.dataset.symbol
+        log_dir = osp.join(cfge.log_root_dir, '{}_{}'.format(model_symbol, dataset_symbol))
+        exp_dir = search_experiment_folder(log_dir, cfge.experiment_id)
+        if exp_dir is None:
+            if not isdebug:
+                sig = cfgv.get('signature', []) + ['evalonly']
+            else:
+                sig = ['debug']
+            exp_dir = '_'.join([str(cfge.experiment_id)] + sig)
+        eval_subdir = cfgv.get('eval_subdir', None)
+        # override subdir in debug mode (if eval_subdir is set)
+        eval_subdir = 'debug' if (eval_subdir is not None) and isdebug else eval_subdir
+        if eval_subdir is not None:
+            log_dir = osp.join(log_dir, exp_dir, eval_subdir)
+        else:
+            log_dir = osp.join(log_dir, exp_dir)
+        disable_log_override = cfgv.get('disable_log_override', False)
+        if osp.isdir(log_dir):
+            if disable_log_override:
+                assert False, 'Override an exsited log_dir is disabled at [{}]'.format(log_dir)
+        else:
+            os.makedirs(log_dir)
+        log_file = osp.join(log_dir, 'eval.log')
+        cfgv.log_dir = log_dir
+        cfgv.log_file = log_file
+    ######################
+    # print and save cfg #
+    ######################
+    pprint.pprint(cfg)
+    with open(log_file, 'w') as f:
+        pprint.pprint(cfg, f)
+    with open(osp.join(log_dir, 'config.yaml'), 'w') as f:
+        yaml.dump(edict_2_dict(cfg), f)
+    #############
+    # save code #
+    #############
+    save_code = False
+    if istrain:
+        save_code = cfgt.get('save_code', False)
+    elif haseval:
+        save_code = cfgv.get('save_code', False)
+    if save_code:
+        codedir = osp.join(log_dir, 'code')
+        if osp.exists(codedir):
+            shutil.rmtree(codedir)
+        for d in ['configs', 'lib']:
+            fromcodedir = d
+            tocodedir = osp.join(codedir, d)
+            shutil.copytree(
+                fromcodedir, tocodedir,
+                ignore=shutil.ignore_patterns(
+                    '*__pycache__*', '*build*'))
+        for codei in os.listdir('.'):
+            if osp.splitext(codei)[1] == 'py':
+                shutil.copy(codei, codedir)
+    #######################
+    # set matplotlib mode #
+    #######################
+    if 'matplotlib_mode' in cfge:
+        try:
+            matplotlib.use(cfge.matplotlib_mode)
+        except:
+            print('Warning: matplotlib mode [{}] failed to be set!'.format(cfge.matplotlib_mode))
+    return cfg
+def edict_2_dict(x):
+    if isinstance(x, dict):
+        xnew = {}
+        for k in x:
+            xnew[k] = edict_2_dict(x[k])
+        return xnew
+    elif isinstance(x, list):
+        xnew = []
+        for i in range(len(x)):
+            xnew.append( edict_2_dict(x[i]) )
+        return xnew
+    else:
+        return x
+def search_experiment_folder(root, exid):
+    target = None
+    for fi in os.listdir(root):
+        if not osp.isdir(osp.join(root, fi)):
+            continue
+        if int(fi.split('_')[0]) == exid:
+            if target is not None:
+                return None # duplicated
+            elif target is None:
+                target = fi
+    return target

lib/cfg_holder.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import copy
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+##############
+# cfg_holder #
+##############
+@singleton
+class cfg_unique_holder(object):
+    def __init__(self):
+        self.cfg = None
+        # this is use to track the main codes.
+        self.code = set()
+    def save_cfg(self, cfg):
+        self.cfg = copy.deepcopy(cfg)
+    def add_code(self, code):
+        """
+        A new main code is reached and
+            its name is added.
+        """
+        self.code.add(code)

lib/data_factory/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .common.ds_base import collate, get_dataset
+from .common.ds_loader import get_loader
+from .common.ds_transform import get_transform
+from .common.ds_estimator import get_estimator
+from .common.ds_formatter import get_formatter
+from .common.ds_sampler import get_sampler

lib/data_factory/common/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .ds_base import ds_base, collate, register as regdataset
+from .ds_loader import pre_loader_checkings, register as regloader
+from .ds_transform import TBase, have, register as regtrans
+from .ds_estimator import register as regestmat
+from .ds_formatter import register as regformat
+from .ds_sampler import register as regsampler

lib/data_factory/common/ds_base.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import os
+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import torch
+import torch.distributed as dist
+import torchvision
+import copy
+import itertools
+from ... import sync
+from ...cfg_holder import cfg_unique_holder as cfguh
+from ...log_service import print_log
+import torch.distributed as dist
+from multiprocessing import shared_memory
+import pickle
+import hashlib
+import random
+class ds_base(torch.utils.data.Dataset):
+    def __init__(self,
+                 cfg,
+                 loader = None,
+                 estimator = None,
+                 transforms = None,
+                 formatter = None):
+        self.cfg = cfg
+        self.load_info = None
+        self.init_load_info()
+        self.loader = loader
+        self.transforms = transforms
+        self.formatter = formatter
+        if self.load_info is not None:
+            load_info_order_by = getattr(self.cfg, 'load_info_order_by', 'default')
+            if load_info_order_by == 'default':
+                self.load_info = sorted(self.load_info, key=lambda x:x['unique_id'])
+            else:
+                try:
+                    load_info_order_by, reverse = load_info_order_by.split('|')
+                    reverse = reverse == 'reverse'
+                except:
+                    reverse = False
+                self.load_info = sorted(
+                    self.load_info, key=lambda x:x[load_info_order_by], reverse=reverse)
+        load_info_add_idx = getattr(self.cfg, 'load_info_add_idx', True)
+        if (self.load_info is not None) and load_info_add_idx:
+            for idx, info in enumerate(self.load_info):
+                info['idx'] = idx
+        if estimator is not None:
+            self.load_info = estimator(self.load_info)
+        self.try_sample = getattr(self.cfg, 'try_sample', None)
+        if self.try_sample is not None:
+            try:
+                start, end = self.try_sample
+            except:
+                start, end = 0, self.try_sample
+            self.load_info = self.load_info[start:end]
+        self.repeat = getattr(self.cfg, 'repeat', 1)
+        pick = getattr(self.cfg, 'pick', None)
+        if pick is not None:
+            self.load_info = [i for i in self.load_info if i['filename'] in pick]
+        #########
+        # cache #
+        #########
+        self.cache_sm = getattr(self.cfg, 'cache_sm', False)
+        self.cache_cnt = 0
+        if self.cache_sm:
+            self.cache_pct = getattr(self.cfg, 'cache_pct', 0)
+            cache_unique_id = sync.nodewise_sync().random_sync_id()
+            self.cache_unique_id = hashlib.sha256(pickle.dumps(cache_unique_id)).hexdigest()
+            self.__cache__(self.cache_pct)
+        #######
+        # log #
+        #######
+        if self.load_info is not None:
+            console_info = '{}: '.format(self.__class__.__name__)
+            console_info += 'total {} unique images, '.format(len(self.load_info))
+            console_info += 'total {} unique sample. Cached {}. Repeat {} times.'.format(
+                len(self.load_info), self.cache_cnt, self.repeat)
+        else:
+            console_info = '{}: load_info not ready.'.format(self.__class__.__name__)
+        print_log(console_info)
+    def init_load_info(self):
+        # implement by sub class
+        pass
+    def __len__(self):
+        return len(self.load_info)*self.repeat
+    def __cache__(self, pct):
+        if pct == 0:
+            self.cache_cnt = 0
+            return
+        self.cache_cnt = int(len(self.load_info)*pct)
+        if not self.cache_sm:
+            for i in range(self.cache_cnt):
+                self.load_info[i] = self.loader(self.load_info[i])
+            return
+        for i in range(self.cache_cnt):
+            shm_name = str(self.load_info[i]['unique_id']) + '_' + self.cache_unique_id
+            if i % self.local_world_size == self.local_rank:
+                data = pickle.dumps(self.loader(self.load_info[i]))
+                datan = len(data)
+                # self.print_smname_to_file(shm_name)
+                shm = shared_memory.SharedMemory(
+                    name=shm_name, create=True, size=datan)
+                shm.buf[0:datan] = data[0:datan]
+                shm.close()
+                self.load_info[i] = shm_name
+            else:
+                self.load_info[i] = shm_name
+        dist.barrier()
+    def __getitem__(self, idx):
+        idx = idx%len(self.load_info)
+        # element = copy.deepcopy(self.load_info[idx])
+        # 0730 try shared memory
+        element = copy.deepcopy(self.load_info[idx])
+        if isinstance(element, str):
+            shm = shared_memory.SharedMemory(name=element)
+            element = pickle.loads(shm.buf)
+            shm.close()
+        else:
+            element = copy.deepcopy(element)
+            element['load_info_ptr'] = self.load_info
+        if idx >= self.cache_cnt:
+            element = self.loader(element)
+        if self.transforms is not None:
+            element = self.transforms(element)
+        if self.formatter is not None:
+            return self.formatter(element)
+        else:
+            return element
+    # 0730 try shared memory
+    def __del__(self):
+        # Clean the shared memory
+        for infoi in self.load_info:
+            if isinstance(infoi, str) and (self.local_rank==0):
+                shm = shared_memory.SharedMemory(name=infoi)
+                shm.close()
+                shm.unlink()
+    def print_smname_to_file(self, smname):
+        try:
+            log_file = cfguh().cfg.train.log_file
+        except:
+            try:
+                log_file = cfguh().cfg.eval.log_file
+            except:
+                raise ValueError
+        # a trick to use the log_file path
+        sm_file = log_file.replace('.log', '.smname')
+        with open(sm_file, 'a') as f:
+            f.write(smname + '\n')
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+from .ds_loader import get_loader
+from .ds_transform import get_transform
+from .ds_estimator import get_estimator
+from .ds_formatter import get_formatter
+@singleton
+class get_dataset(object):
+    def __init__(self):
+        self.dataset = {}
+    def register(self, ds):
+        self.dataset[ds.__name__] = ds
+    def __call__(self, cfg):
+        if cfg is None:
+            return None
+        t = cfg.type
+        if t is None:
+            return None
+        elif t in ['laion2b', 'laion2b_dummy',
+                   'laion2b_webdataset',
+                   'laion2b_webdataset_sdofficial', ]:
+            from .. import ds_laion2b
+        elif t in ['coyo', 'coyo_dummy',
+                   'coyo_webdataset', ]:
+            from .. import ds_coyo_webdataset
+        elif t in ['laionart', 'laionart_dummy',
+                   'laionart_webdataset', ]:
+            from .. import ds_laionart
+        elif t in ['celeba']:
+            from .. import ds_celeba
+        elif t in ['div2k']:
+            from .. import ds_div2k
+        elif t in ['pafc']:
+            from .. import ds_pafc
+        elif t in ['coco_caption']:
+            from .. import ds_coco
+        else:
+            raise ValueError
+        loader    = get_loader()   (cfg.get('loader'   , None))
+        transform = get_transform()(cfg.get('transform', None))
+        estimator = get_estimator()(cfg.get('estimator', None))
+        formatter = get_formatter()(cfg.get('formatter', None))
+        return self.dataset[t](
+            cfg, loader, estimator,
+            transform, formatter)
+def register():
+    def wrapper(class_):
+        get_dataset().register(class_)
+        return class_
+    return wrapper
+# some other helpers
+class collate(object):
+    """
+        Modified from torch.utils.data._utils.collate
+        It handle list different from the default.
+            List collate just by append each other.
+    """
+    def __init__(self):
+        self.default_collate = \
+            torch.utils.data._utils.collate.default_collate
+    def __call__(self, batch):
+        """
+        Args:
+            batch: [data, data] -or- [(data1, data2, ...), (data1, data2, ...)]
+        This function will not be used as induction function
+        """
+        elem = batch[0]
+        if not (elem, (tuple, list)):
+            return self.default_collate(batch)
+        rv = []
+        # transposed
+        for i in zip(*batch):
+            if isinstance(i[0], list):
+                if len(i[0]) != 1:
+                    raise ValueError
+                try:
+                    i = [[self.default_collate(ii).squeeze(0)] for ii in i]
+                except:
+                    pass
+                rvi = list(itertools.chain.from_iterable(i))
+                rv.append(rvi) # list concat
+            else:
+                rv.append(self.default_collate(i))
+        return rv

lib/data_factory/common/ds_estimator.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import PIL
+import torch
+import torchvision
+import xml.etree.ElementTree as ET
+import json
+import copy
+import math
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+@singleton
+class get_estimator(object):
+    def __init__(self):
+        self.estimator = {}
+    def register(self, estimf):
+        self.estimator[estimf.__name__] = estimf
+    def __call__(self, cfg):
+        if cfg is None:
+            return None
+        t = cfg.type
+        return self.estimator[t](**cfg.args)
+def register():
+    def wrapper(class_):
+        get_estimator().register(class_)
+        return class_
+    return wrapper

lib/data_factory/common/ds_formatter.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import os
+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import torch
+from PIL import Image
+import copy
+import gc
+import itertools
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+@singleton
+class get_formatter(object):
+    def __init__(self):
+        self.formatter = {}
+    def register(self, formatf):
+        self.formatter[formatf.__name__] = formatf
+    def __call__(self, cfg):
+        if cfg is None:
+            return None
+        t = cfg.type
+        return self.formatter[t](**cfg.args)
+def register():
+    def wrapper(class_):
+        get_formatter().register(class_)
+        return class_
+    return wrapper

lib/data_factory/common/ds_loader.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import PIL
+import torch
+import torchvision
+import xml.etree.ElementTree as ET
+import json
+import copy
+from ...cfg_holder import cfg_unique_holder as cfguh
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+@singleton
+class get_loader(object):
+    def __init__(self):
+        self.loader = {}
+    def register(self, loadf):
+        self.loader[loadf.__name__] = loadf
+    def __call__(self, cfg):
+        if cfg is None:
+            return None
+        if isinstance(cfg, list):
+            loader = []
+            for ci in cfg:
+                t = ci.type
+                loader.append(self.loader[t](**ci.args))
+            return compose(loader)
+        t = cfg.type
+        return self.loader[t](**cfg.args)
+class compose(object):
+    def __init__(self, loaders):
+        self.loaders = loaders
+    def __call__(self, element):
+        for l in self.loaders:
+            element = l(element)
+        return element
+    def __getitem__(self, idx):
+        return self.loaders[idx]
+def register():
+    def wrapper(class_):
+        get_loader().register(class_)
+        return class_
+    return wrapper
+def pre_loader_checkings(ltype):
+    lpath = ltype+'_path'
+    # cache feature added on 20201021
+    lcache = ltype+'_cache'
+    def wrapper(func):
+        def inner(self, element):
+            if lcache in element:
+                # cache feature added on 20201021
+                data = element[lcache]
+            else:
+                if ltype in element:
+                    raise ValueError
+                if lpath not in element:
+                    raise ValueError
+                if element[lpath] is None:
+                    data = None
+                else:
+                    data = func(self, element[lpath], element)
+            element[ltype] = data
+            if ltype == 'image':
+                if isinstance(data, np.ndarray):
+                    imsize = data.shape[-2:]
+                elif isinstance(data, PIL.Image.Image):
+                    imsize = data.size[::-1]
+                elif isinstance(data, torch.Tensor):
+                    imsize = [data.size(-2), data.size(-1)]
+                elif data is None:
+                    imsize = None
+                else:
+                    raise ValueError
+                element['imsize'] = imsize
+                element['imsize_current'] = copy.deepcopy(imsize)
+            return element
+        return inner
+    return wrapper

lib/data_factory/common/ds_sampler.py ADDED Viewed

	@@ -0,0 +1,273 @@

+from tokenize import group
+import torch
+import numpy as np
+import numpy.random as npr
+import torch.distributed as dist
+import math
+from ...log_service import print_log
+from ... import sync
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+@singleton
+class get_sampler(object):
+    def __init__(self):
+        self.sampler = {}
+    def register(self, sampler):
+        self.sampler[sampler.__name__] = sampler
+    def __call__(self, dataset, cfg):
+        if cfg == 'default_train':
+            return GlobalDistributedSampler(dataset, shuffle=True, extend=False)
+        elif cfg == 'default_eval':
+            return GlobalDistributedSampler(dataset, shuffle=False, extend=True)
+        else:
+            t = cfg.type
+            return self.sampler[t](dataset=dataset, **cfg.args)
+def register():
+    def wrapper(class_):
+        get_sampler().register(class_)
+        return class_
+    return wrapper
+######################
+# DistributedSampler #
+######################
+@register()
+class GlobalDistributedSampler(torch.utils.data.Sampler):
+    """
+    This is a distributed sampler that sync accross gpus and nodes.
+    """
+    def __init__(self,
+                 dataset,
+                 shuffle=True,
+                 extend=False,):
+        """
+        Arguments:
+            dataset: Dataset used for sampling.
+            shuffle: If true, sampler will shuffle the indices
+            extend: If true, sampler will extend the indices that can be even distributed by ranks
+                otherwise sampler will truncate the indices to make it even.
+        """
+        self.ddp = sync.is_ddp()
+        self.rank = sync.get_rank('global')
+        self.world_size = sync.get_world_size('global')
+        self.dataset = dataset
+        self.shuffle = shuffle
+        self.extend = extend
+        num_samples = len(dataset) // self.world_size
+        if extend and (len(dataset)%self.world_size != 0):
+            num_samples+=1
+        self.num_samples = num_samples
+        self.total_size = num_samples * self.world_size
+    def __iter__(self):
+        indices = self.get_sync_order()
+        if self.extend:
+            # extend using the front indices
+            indices = indices+indices[0:self.total_size-len(indices)]
+        else:
+            # truncate
+            indices = indices[0:self.total_size]
+        # subsample
+        indices = indices[self.rank : len(indices) : self.world_size]
+        return iter(indices)
+    def __len__(self):
+        return self.num_samples
+    def get_sync_order(self):
+        if self.shuffle:
+            indices = torch.randperm(len(self.dataset)).to(self.rank)
+            if self.ddp:
+                dist.broadcast(indices, src=0)
+            indices = indices.to('cpu').tolist()
+        else:
+            indices = list(range(len(self.dataset)))
+        print_log('Sampler : {}'.format(str(indices[0:5])) )
+        return indices
+@register()
+class LocalDistributedSampler(GlobalDistributedSampler):
+    """
+    This is a distributed sampler that sync across gpus within the nodes.
+        But not sync across nodes.
+    """
+    def __init__(self,
+                 dataset,
+                 shuffle=True,
+                 extend=False,):
+        super().__init__(dataset, shuffle, extend)
+        self.rank = sync.get_rank('local')
+        self.world_size = sync.get_world_size('local')
+    def get_sync_order(self):
+        if self.shuffle:
+            if self.rank == 0:
+                indices = list(npr.permutation(len(self.dataset)))
+                sync.nodewise_sync().broadcast_r0(indices)
+            else:
+                indices = sync.nodewise_sync().broadcast_r0(None)
+        else:
+            indices = list(range(len(self.dataset)))
+        print_log('Sampler : {}'.format(str(indices[0:5])) )
+        return indices
+############################
+# random sample with group #
+############################
+# Deprecated
+@register()
+class GroupSampler(torch.utils.data.Sampler):
+    """
+    This is a new DistributedSampler that sample all index according to group.
+    i.e.
+    if group_size=3, num_replicas=2, train mode:
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+            ==> (group) [0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]
+            ==> (distribute) process0: [3, 4, 5], (leftover [6, 7, 8, 9, 10])
+                             process1: [0, 1, 2]
+            ==> (group leftover) process0: [3, 4, 5], (leftover [6, 7], [8, 9], 10)
+                                 process1: [0, 1, 2]
+            ==> (distribute) process0: [3, 4, 5], [6, 7] (remove 10)
+                             process1: [0, 1, 2], [8, 9]
+        it will avoid_batchsize=1:
+        0, 1, 2, 3, 4, 5, 6, 7, 8,
+            ==> (group) [0, 1, 2], [3, 4, 5], [6, 7, 8]
+            ==> (distribute) process0: [3, 4, 5], (leftover [6, 7, 8])
+                             process1: [0, 1, 2]
+            ==> (group leftover) process0: [3, 4, 5], (leftover [6], [7], [8])
+                                 process1: [0, 1, 2]
+            ==> (distribute) process0: [3, 4, 5], (remove 6, 7, 8) (because distribute make batchsize 1)
+                             process1: [0, 1, 2]
+    if group_size=3, num_replicas=2, eval mode:
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+            ==> (extend) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10
+            ==> (group) [0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 10]
+            ==> (distribute) process0: [0, 1, 2], [6, 7, 8],
+                             process1: [3, 4, 5], [9, 10, 10]
+    """
+    def __init__(self,
+                 dataset,
+                 group_size,
+                 num_replicas=None,
+                 rank=None,
+                 mode='train',):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise ValueError
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise ValueError
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.len_dataset = len(dataset)
+        self.group_size = group_size
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.mode = mode
+        len_dataset = self.len_dataset
+        if (len_dataset % num_replicas != 0) and (mode == 'train'):
+            # drop the non_aligned
+            aligned_indices = np.arange(len_dataset)[:-(len_dataset % num_replicas)]
+            aligned_len_dataset = aligned_indices.shape[0]
+        elif (len_dataset % num_replicas != 0) and (mode == 'eval'):
+            extend = np.array([len_dataset-1 for _ in range(num_replicas - len_dataset % num_replicas)])
+            aligned_indices = np.concatenate([range(len_dataset), extend])
+            aligned_len_dataset = aligned_indices.shape[0]
+        else:
+            aligned_indices = np.arange(len_dataset)
+            aligned_len_dataset = len_dataset
+        num_even_distributed_groups = aligned_len_dataset // (group_size * num_replicas)
+        num_even = num_even_distributed_groups * group_size * num_replicas
+        self.regular_groups = aligned_indices[0:num_even].reshape(-1, group_size)
+        self.leftover_groups = aligned_indices[num_even:].reshape(num_replicas, -1)
+        if self.leftover_groups.size == 0:
+            self.leftover_groups = None
+        elif (self.leftover_groups.shape[-1]==1) and (mode == 'train'):
+            # avoid bs=1
+            self.leftover_groups = None
+        # a urly way to modify dataset.load_info according to the grouping
+        for groupi in self.regular_groups:
+            for idx in groupi:
+                idx_lowerbd = groupi[0]
+                idx_upperbd = groupi[-1]
+                idx_reference = (idx_lowerbd+idx_upperbd)//2
+                dataset.load_info[idx]['ref_size'] = dataset.load_info[idx_reference]['image_size']
+        if self.leftover_groups is not None:
+            for groupi in self.leftover_groups:
+                for idx in groupi:
+                    idx_lowerbd = groupi[0]
+                    idx_upperbd = groupi[-1]
+                    idx_reference = (idx_lowerbd+idx_upperbd)//2
+                    dataset.load_info[idx]['ref_size'] = dataset.load_info[idx_reference]['image_size']
+    def concat(self, nparrays, axis=0):
+        # a helper for save concaternation
+        nparrays = [i for i in nparrays if i.size > 0]
+        return np.concatenate(nparrays, axis=axis)
+    def __iter__(self):
+        indices = self.get_sync_order()
+        return iter(indices)
+    def __len__(self):
+        return self.num_samples
+    def get_sync_order(self):
+        # g = torch.Generator()
+        # g.manual_seed(self.epoch)
+        mode         = self.mode
+        rank         = self.rank
+        num_replicas = self.num_replicas
+        group_size   = self.group_size
+        num_groups = len(self.regular_groups)
+        if mode == 'train':
+            g_indices = torch.randperm(num_groups).to(rank)
+            dist.broadcast(g_indices, src=0)
+            g_indices = g_indices.to('cpu').tolist()
+            num_groups_per_rank = num_groups // num_replicas
+            groups = self.regular_groups[g_indices][num_groups_per_rank*rank : num_groups_per_rank*(rank+1)]
+            indices = groups.flatten()
+            if self.leftover_groups is not None:
+                leftg_indices = torch.randperm(len(self.leftover_groups)).to(rank)
+                dist.broadcast(leftg_indices, src=0)
+                leftg_indices = leftg_indices.to('cpu').tolist()
+                last = self.leftover_groups[leftg_indices][rank]
+                indices = np.concatenate([indices, last], axis=0)
+        elif mode == 'eval':
+            groups = self.regular_groups.reshape(-1, num_replicas, group_size)[:, rank, :]
+            indices = groups.flatten()
+            if self.leftover_groups is not None:
+                last = self.leftover_groups[rank]
+                indices = np.concatenate([indices, last], axis=0)
+        else:
+            raise ValueError
+        print_log('Sampler RANK {} : {}'.format(rank, str(indices[0:group_size+1])))
+        return indices

lib/data_factory/common/ds_transform.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import os.path as osp
+import numpy as np
+import numpy.random as npr
+import PIL
+import torch
+import torchvision
+import xml.etree.ElementTree as ET
+import json
+import copy
+import math
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+@singleton
+class get_transform(object):
+    def __init__(self):
+        self.transform = {}
+    def register(self, transf):
+        self.transform[transf.__name__] = transf
+    def __call__(self, cfg):
+        if cfg is None:
+            return None
+        if isinstance(cfg, list):
+            loader = []
+            for ci in cfg:
+                t = ci.type
+                loader.append(self.transform[t](**ci.args))
+            return compose(loader)
+        t = cfg.type
+        return self.transform[t](**cfg.args)
+def register():
+    def wrapper(class_):
+        get_transform().register(class_)
+        return class_
+    return wrapper
+def have(must=[], may=[]):
+    """
+    The nextgen decorator that have two list of
+        input tells what category the transform
+        will operate on.
+    Args:
+        must: [] of str,
+            the names of the items that must be included
+            inside the element.
+            If element[name] exist: do the transform
+            If element[name] is None: raise Exception.
+            If element[name] not exist: raise Exception.
+        may: [] of str,
+            the names of the items that may be contained
+            inside the element for transform.
+            If element[name] exist: do the transform
+            If element[name] is None: ignore it.
+            If element[name] not exist: ignore it.
+    """
+    def route(self, item, e, d):
+        """
+        Route the element to a proper function
+            for calculation.
+        Args:
+            self: object,
+                the transform functor.
+            item: str,
+                the item name of the data.
+            e: {},
+                the element
+            d: nparray, tensor or PIL.Image,
+                the data to transform.
+        """
+        if isinstance(d, np.ndarray):
+            dtype = 'nparray'
+        elif isinstance(d, torch.Tensor):
+            dtype = 'tensor'
+        elif isinstance(d, PIL.Image.Image):
+            dtype = 'pilimage'
+        else:
+            raise ValueError
+        # find function by order
+        f = None
+        for attrname in [
+                'exec_{}_{}'.format(item, dtype),
+                'exec_{}'.format(item),
+                'exec_{}'.format(dtype),
+                'exec']:
+            f = getattr(self, attrname, None)
+            if f is not None:
+                break
+        d, e = f(d, e)
+        e[item] = d
+        return e
+    def wrapper(func):
+        def inner(self, e):
+            e['imsize_previous'] = e['imsize_current']
+            imsize_tag_cnt = 0
+            imsize_tag = 'imsize_before_' + self.__class__.__name__
+            while True:
+                if imsize_tag_cnt != 0:
+                    tag = imsize_tag + str(imsize_tag_cnt)
+                else:
+                    tag = imsize_tag
+                if not tag in e:
+                    e[tag] = e['imsize_current']
+                    break
+                imsize_tag_cnt += 1
+            e = func(self, e)
+            # must transform list
+            for item in must:
+                try:
+                    d = e[item]
+                except:
+                    raise ValueError
+                if d is None:
+                    raise ValueError
+                e = route(self, item, e, d)
+            # may transform list
+            for item in may:
+                try:
+                    d = e[item]
+                except:
+                    d = None
+                if d is not None:
+                    e = route(self, item, e, d)
+            return e
+        return inner
+    return wrapper
+class compose(object):
+    def __init__(self, transforms):
+        self.transforms = transforms
+    def __call__(self, element):
+        for t in self.transforms:
+            element = t(element)
+        return element
+class TBase(object):
+    def __init__(self):
+        pass
+    def exec(self, data, element):
+        raise ValueError
+    def rand(self,
+             uid,
+             tag,
+             rand_f,
+             *args,
+             **kwargs):
+        """
+        Args:
+            uid: string element['unique_id']
+            tag: string tells the tag uses when tracking the random number.
+                Or the tag to restore the tracked random number.
+            rand_f: the random function use to generate random number.
+            **kwargs: the argument for the given random function.
+        """
+        # if rnduh().hdata is not None:
+        #     return rnduh().get_history(uid, self.__class__.__name__, tag)
+        # if rnduh().record_path is None:
+        #     return rand_f(*args, **kwargs)
+        # the special mode to create the random file.
+        d = rand_f(*args, **kwargs)
+        # rnduh().record(uid, self.__class__.__name__, tag, d)
+        return d

lib/evaluator/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+from .eva_base import get_evaluator

lib/evaluator/eva_base.py ADDED Viewed

	@@ -0,0 +1,292 @@

+import torch
+import torch.distributed as dist
+import os
+import os.path as osp
+import numpy as np
+import copy
+import json
+from ..log_service import print_log
+def singleton(class_):
+    instances = {}
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+@singleton
+class get_evaluator(object):
+    def __init__(self):
+        self.evaluator = {}
+    def register(self, evaf, name):
+        self.evaluator[name] = evaf
+    def __call__(self, pipeline_cfg=None):
+        if pipeline_cfg is None:
+            from . import eva_null
+            return self.evaluator['null']()
+        if not isinstance(pipeline_cfg, list):
+            t = pipeline_cfg.type
+            if t == 'miou':
+                from . import eva_miou
+            if t == 'psnr':
+                from . import eva_psnr
+            if t == 'ssim':
+                from . import eva_ssim
+            if t == 'lpips':
+                from . import eva_lpips
+            if t == 'fid':
+                from . import eva_fid
+            return self.evaluator[t](**pipeline_cfg.args)
+        evaluator = []
+        for ci in pipeline_cfg:
+            t = ci.type
+            if t == 'miou':
+                from . import eva_miou
+            if t == 'psnr':
+                from . import eva_psnr
+            if t == 'ssim':
+                from . import eva_ssim
+            if t == 'lpips':
+                from . import eva_lpips
+            if t == 'fid':
+                from . import eva_fid
+            evaluator.append(
+                self.evaluator[t](**ci.args))
+        if len(evaluator) == 0:
+            return None
+        else:
+            return compose(evaluator)
+def register(name):
+    def wrapper(class_):
+        get_evaluator().register(class_, name)
+        return class_
+    return wrapper
+class base_evaluator(object):
+    def __init__(self,
+                 **args):
+        '''
+        Args:
+            sample_n, int,
+                the total number of sample. used in
+                distributed sync
+        '''
+        if not dist.is_available():
+            raise ValueError
+        self.world_size = dist.get_world_size()
+        self.rank = dist.get_rank()
+        self.sample_n = None
+        self.final = {}
+    def sync(self, data):
+        """
+        Args:
+            data: any,
+                the data needs to be broadcasted
+        """
+        if data is None:
+            return None
+        if isinstance(data, tuple):
+            data = list(data)
+        if isinstance(data, list):
+            data_list = []
+            for datai in data:
+                data_list.append(self.sync(datai))
+            data = [[*i] for i in zip(*data_list)]
+            return data
+        data = [
+            self.sync_(data, ranki)
+                for ranki in range(self.world_size)
+        ]
+        return data
+    def sync_(self, data, rank):
+        t = type(data)
+        is_broadcast = rank == self.rank
+        if t is np.ndarray:
+            dtrans = data
+            dt = data.dtype
+            if dt in [
+                    int,
+                    np.bool,
+                    np.uint8,
+                    np.int8,
+                    np.int16,
+                    np.int32,
+                    np.int64,]:
+                dtt = torch.int64
+            elif dt in [
+                    float,
+                    np.float16,
+                    np.float32,
+                    np.float64,]:
+                dtt = torch.float64
+        elif t is str:
+            dtrans = np.array(
+                [ord(c) for c in data],
+                dtype = np.int64
+            )
+            dt = np.int64
+            dtt = torch.int64
+        else:
+            raise ValueError
+        if is_broadcast:
+            n = len(dtrans.shape)
+            n = torch.tensor(n).long()
+            n = n.to(self.rank)
+            dist.broadcast(n, src=rank)
+            n = list(dtrans.shape)
+            n = torch.tensor(n).long()
+            n = n.to(self.rank)
+            dist.broadcast(n, src=rank)
+            n = torch.tensor(dtrans, dtype=dtt)
+            n = n.to(self.rank)
+            dist.broadcast(n, src=rank)
+            return data
+        n = torch.tensor(0).long()
+        n = n.to(self.rank)
+        dist.broadcast(n, src=rank)
+        n = n.item()
+        n = torch.zeros(n).long()
+        n = n.to(self.rank)
+        dist.broadcast(n, src=rank)
+        n = list(n.to('cpu').numpy())
+        n = torch.zeros(n, dtype=dtt)
+        n = n.to(self.rank)
+        dist.broadcast(n, src=rank)
+        n = n.to('cpu').numpy().astype(dt)
+        if t is np.ndarray:
+            return n
+        elif t is str:
+            n = ''.join([chr(c) for c in n])
+            return n
+    def zipzap_arrange(self, data):
+        '''
+        Order the data so it range like this:
+            input [[0, 2, 4, 6], [1, 3, 5, 7]] -> output [0, 1, 2, 3, 4, 5, ...]
+        '''
+        if isinstance(data[0], list):
+            data_new = []
+            maxlen = max([len(i) for i in data])
+            totlen = sum([len(i) for i in data])
+            cnt = 0
+            for idx in range(maxlen):
+                for datai in data:
+                    data_new += [datai[idx]]
+                    cnt += 1
+                    if cnt >= totlen:
+                        break
+            return data_new
+        elif isinstance(data[0], np.ndarray):
+            maxlen = max([i.shape[0] for i in data])
+            totlen = sum([i.shape[0] for i in data])
+            datai_shape = data[0].shape[1:]
+            data = [
+                np.concatenate(datai, np.zeros(maxlen-datai.shape[0], *datai_shape), axis=0)
+                if datai.shape[0] < maxlen else datai
+                for datai in data
+            ] # even the array
+            data = np.stack(data, axis=1).reshape(-1, *datai_shape)
+            data = data[:totlen]
+            return data
+        else:
+            raise NotImplementedError
+    def add_batch(self, **args):
+        raise NotImplementedError
+    def set_sample_n(self, sample_n):
+        self.sample_n = sample_n
+    def compute(self):
+        raise NotImplementedError
+    # Function needed in training to judge which
+    #   evaluated number is better
+    def isbetter(self, old, new):
+        return new>old
+    def one_line_summary(self):
+        print_log('Evaluator display')
+    def save(self, path):
+        if not osp.exists(path):
+            os.makedirs(path)
+        ofile = osp.join(path, 'result.json')
+        with open(ofile, 'w') as f:
+            json.dump(self.final, f, indent=4)
+    def clear_data(self):
+        raise NotImplementedError
+class compose(object):
+    def __init__(self, pipeline):
+        self.pipeline = pipeline
+        self.sample_n = None
+        self.final = {}
+    def add_batch(self, *args, **kwargs):
+        for pi in self.pipeline:
+            pi.add_batch(*args, **kwargs)
+    def set_sample_n(self, sample_n):
+        self.sample_n = sample_n
+        for pi in self.pipeline:
+            pi.set_sample_n(sample_n)
+    def compute(self):
+        rv = {}
+        for pi in self.pipeline:
+            rv[pi.symbol] = pi.compute()
+            self.final[pi.symbol] = pi.final
+        return rv
+    def isbetter(self, old, new):
+        check = 0
+        for pi in self.pipeline:
+            if pi.isbetter(old, new):
+                check+=1
+        if check/len(self.pipeline)>0.5:
+            return True
+        else:
+            return False
+    def one_line_summary(self):
+        for pi in self.pipeline:
+            pi.one_line_summary()
+    def save(self, path):
+        if not osp.exists(path):
+            os.makedirs(path)
+        ofile = osp.join(path, 'result.json')
+        with open(ofile, 'w') as f:
+            json.dump(self.final, f, indent=4)
+    def clear_data(self):
+        for pi in self.pipeline:
+            pi.clear_data()

lib/evaluator/eva_null.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch
+import numpy as np
+from .. import nputils
+from ..log_service import print_log
+from .eva_base import base_evaluator, register
+@register('null')
+class null_evaluator(base_evaluator):
+    def __init__(self, **dummy):
+        super().__init__()
+    def add_batch(self,
+                  **dummy):
+        pass
+    def compute(self):
+        return None
+    def one_line_summary(self):
+        print_log('Evaluator null')
+    def clear_data(self):
+        pass

lib/experiments/__init__.py ADDED Viewed

File without changes

lib/experiments/sd_default.py ADDED Viewed

	@@ -0,0 +1,441 @@

+import torch
+import torch.distributed as dist
+from torchvision import transforms as tvtrans
+import os
+import os.path as osp
+import time
+import timeit
+import copy
+import json
+import pickle
+import PIL.Image
+import numpy as np
+from datetime import datetime
+from easydict import EasyDict as edict
+from collections import OrderedDict
+from lib.cfg_holder import cfg_unique_holder as cfguh
+from lib.data_factory import get_dataset, get_sampler, collate
+from lib.model_zoo import \
+    get_model, get_optimizer, get_scheduler
+from lib.log_service import print_log
+from ..utils import train as train_base
+from ..utils import eval as eval_base
+from ..utils import train_stage as tsbase
+from ..utils import eval_stage as esbase
+from .. import sync
+###############
+# some helper #
+###############
+def atomic_save(cfg, net, opt, step, path):
+    if isinstance(net, (torch.nn.DataParallel,
+                        torch.nn.parallel.DistributedDataParallel)):
+        netm = net.module
+    else:
+        netm = net
+    sd = netm.state_dict()
+    slimmed_sd = [(ki, vi) for ki, vi in sd.items()
+        if ki.find('first_stage_model')!=0 and ki.find('cond_stage_model')!=0]
+    checkpoint = {
+        "config" : cfg,
+        "state_dict" : OrderedDict(slimmed_sd),
+        "step" : step}
+    if opt is not None:
+        checkpoint['optimizer_states'] = opt.state_dict()
+    import io
+    import fsspec
+    bytesbuffer = io.BytesIO()
+    torch.save(checkpoint, bytesbuffer)
+    with fsspec.open(path, "wb") as f:
+        f.write(bytesbuffer.getvalue())
+def load_state_dict(net, cfg):
+    pretrained_pth_full  = cfg.get('pretrained_pth_full' , None)
+    pretrained_ckpt_full = cfg.get('pretrained_ckpt_full', None)
+    pretrained_pth       = cfg.get('pretrained_pth'      , None)
+    pretrained_ckpt      = cfg.get('pretrained_ckpt'     , None)
+    pretrained_pth_dm    = cfg.get('pretrained_pth_dm'   , None)
+    pretrained_pth_ema   = cfg.get('pretrained_pth_ema'  , None)
+    strict_sd = cfg.get('strict_sd', False)
+    errmsg = "Overlapped model state_dict! This is undesired behavior!"
+    if pretrained_pth_full is not None or pretrained_ckpt_full is not None:
+        assert (pretrained_pth is None) and \
+               (pretrained_ckpt is None) and \
+               (pretrained_pth_dm is None) and \
+               (pretrained_pth_ema is None), errmsg
+        if pretrained_pth_full is not None:
+            target_file = pretrained_pth_full
+            sd = torch.load(target_file, map_location='cpu')
+            assert pretrained_ckpt is None, errmsg
+        else:
+            target_file = pretrained_ckpt_full
+            sd = torch.load(target_file, map_location='cpu')['state_dict']
+        print_log('Load full model from [{}] strict [{}].'.format(
+            target_file, strict_sd))
+        net.load_state_dict(sd, strict=strict_sd)
+    if pretrained_pth is not None or pretrained_ckpt is not None:
+        assert (pretrained_ckpt_full is None) and \
+               (pretrained_pth_full is None) and \
+               (pretrained_pth_dm is None) and \
+               (pretrained_pth_ema is None), errmsg
+        if pretrained_pth is not None:
+            target_file = pretrained_pth
+            sd = torch.load(target_file, map_location='cpu')
+            assert pretrained_ckpt is None, errmsg
+        else:
+            target_file = pretrained_ckpt
+            sd = torch.load(target_file, map_location='cpu')['state_dict']
+        print_log('Load model from [{}] strict [{}].'.format(
+            target_file, strict_sd))
+        sd_extra = [(ki, vi) for ki, vi in net.state_dict().items() \
+            if ki.find('first_stage_model')==0 or ki.find('cond_stage_model')==0]
+        sd.update(OrderedDict(sd_extra))
+        net.load_state_dict(sd, strict=strict_sd)
+    if pretrained_pth_dm is not None:
+        assert (pretrained_ckpt_full is None) and \
+               (pretrained_pth_full is None) and \
+               (pretrained_pth is None) and \
+               (pretrained_ckpt is None), errmsg
+        print_log('Load diffusion model from [{}] strict [{}].'.format(
+            pretrained_pth_dm, strict_sd))
+        sd = torch.load(pretrained_pth_dm, map_location='cpu')
+        net.model.diffusion_model.load_state_dict(sd, strict=strict_sd)
+    if pretrained_pth_ema is not None:
+        assert (pretrained_ckpt_full is None) and \
+               (pretrained_pth_full is None) and \
+               (pretrained_pth is None) and \
+               (pretrained_ckpt is None), errmsg
+        print_log('Load unet ema model from [{}] strict [{}].'.format(
+            pretrained_pth_ema, strict_sd))
+        sd = torch.load(pretrained_pth_ema, map_location='cpu')
+        net.model_ema.load_state_dict(sd, strict=strict_sd)
+def auto_merge_imlist(imlist, max=64):
+    imlist = imlist[0:max]
+    h, w = imlist[0].shape[0:2]
+    num_images = len(imlist)
+    num_row = int(np.sqrt(num_images))
+    num_col = num_images//num_row + 1 if num_images%num_row!=0 else num_images//num_row
+    canvas = np.zeros([num_row*h, num_col*w, 3], dtype=np.uint8)
+    for idx, im in enumerate(imlist):
+        hi = (idx // num_col) * h
+        wi = (idx % num_col) * w
+        canvas[hi:hi+h, wi:wi+w, :] = im
+    return canvas
+def latent2im(net, latent):
+    single_input = len(latent.shape) == 3
+    if single_input:
+        latent = latent[None]
+    im = net.decode_image(latent.to(net.device))
+    im = torch.clamp((im+1.0)/2.0, min=0.0, max=1.0)
+    im = [tvtrans.ToPILImage()(i) for i in im]
+    if single_input:
+        im = im[0]
+    return im
+def im2latent(net, im):
+    single_input = not isinstance(im, list)
+    if single_input:
+        im = [im]
+    im = torch.stack([tvtrans.ToTensor()(i) for i in im], dim=0)
+    im = (im*2-1).to(net.device)
+    z = net.encode_image(im)
+    if single_input:
+        z = z[0]
+    return z
+class color_adjust(object):
+    def __init__(self, ref_from, ref_to):
+        x0, m0, std0 = self.get_data_and_stat(ref_from)
+        x1, m1, std1 = self.get_data_and_stat(ref_to)
+        self.ref_from_stat = (m0, std0)
+        self.ref_to_stat   = (m1, std1)
+        self.ref_from = self.preprocess(x0).reshape(-1, 3)
+        self.ref_to = x1.reshape(-1, 3)
+    def get_data_and_stat(self, x):
+        if isinstance(x, str):
+            x = np.array(PIL.Image.open(x))
+        elif isinstance(x, PIL.Image.Image):
+            x = np.array(x)
+        elif isinstance(x, torch.Tensor):
+            x = torch.clamp(x, min=0.0, max=1.0)
+            x = np.array(tvtrans.ToPILImage()(x))
+        elif isinstance(x, np.ndarray):
+            pass
+        else:
+            raise ValueError
+        x = x.astype(float)
+        m = np.reshape(x, (-1, 3)).mean(0)
+        s = np.reshape(x, (-1, 3)).std(0)
+        return x, m, s
+    def preprocess(self, x):
+        m0, s0 = self.ref_from_stat
+        m1, s1 = self.ref_to_stat
+        y = ((x-m0)/s0)*s1 + m1
+        return y
+    def __call__(self, xin, keep=0, simple=False):
+        xin, _, _ = self.get_data_and_stat(xin)
+        x = self.preprocess(xin)
+        if simple:
+            y = (x*(1-keep) + xin*keep)
+            y = np.clip(y, 0, 255).astype(np.uint8)
+            return y
+        h, w = x.shape[:2]
+        x = x.reshape(-1, 3)
+        y = []
+        for chi in range(3):
+            yi = self.pdf_transfer_1d(self.ref_from[:, chi], self.ref_to[:, chi], x[:, chi])
+            y.append(yi)
+        y = np.stack(y, axis=1)
+        y = y.reshape(h, w, 3)
+        y = (y.astype(float)*(1-keep) + xin.astype(float)*keep)
+        y = np.clip(y, 0, 255).astype(np.uint8)
+        return y
+    def pdf_transfer_1d(self, arr_fo, arr_to, arr_in, n=600):
+        arr = np.concatenate((arr_fo, arr_to))
+        min_v = arr.min() - 1e-6
+        max_v = arr.max() + 1e-6
+        min_vto = arr_to.min() - 1e-6
+        max_vto = arr_to.max() + 1e-6
+        xs = np.array(
+            [min_v + (max_v - min_v) * i / n for i in range(n + 1)])
+        hist_fo, _ = np.histogram(arr_fo, xs)
+        hist_to, _ = np.histogram(arr_to, xs)
+        xs = xs[:-1]
+        # compute probability distribution
+        cum_fo = np.cumsum(hist_fo)
+        cum_to = np.cumsum(hist_to)
+        d_fo = cum_fo / cum_fo[-1]
+        d_to = cum_to / cum_to[-1]
+        # transfer
+        t_d = np.interp(d_fo, d_to, xs)
+        t_d[d_fo <= d_to[ 0]] = min_vto
+        t_d[d_fo >= d_to[-1]] = max_vto
+        arr_out = np.interp(arr_in, xs, t_d)
+        return arr_out
+########
+# main #
+########
+class eval(eval_base):
+    def prepare_model(self):
+        cfg = cfguh().cfg
+        net = get_model()(cfg.model)
+        if cfg.env.cuda:
+            net.to(self.local_rank)
+            load_state_dict(net, cfg.eval) #<--- added
+            net = torch.nn.parallel.DistributedDataParallel(
+                net, device_ids=[self.local_rank],
+                find_unused_parameters=True)
+        net.eval()
+        return {'net' : net,}
+class eval_stage(esbase):
+    """
+    This is eval stage that can check comprehensive results
+    """
+    def __init__(self):
+        from ..model_zoo.ddim import DDIMSampler
+        self.sampler = DDIMSampler
+    def get_net(self, paras):
+        return paras['net']
+    def get_image_path(self):
+        if 'train' in cfguh().cfg:
+            log_dir = cfguh().cfg.train.log_dir
+        else:
+            log_dir = cfguh().cfg.eval.log_dir
+        return os.path.join(log_dir, "udemo")
+    @torch.no_grad()
+    def sample(self, net, sampler, prompt, output_dim, scale, n_samples, ddim_steps, ddim_eta):
+        h, w = output_dim
+        uc = None
+        if scale != 1.0:
+            uc = net.get_learned_conditioning(n_samples * [""])
+        c = net.get_learned_conditioning(n_samples * [prompt])
+        shape = [4, h//8, w//8]
+        rv = sampler.sample(
+            S=ddim_steps,
+            conditioning=c,
+            batch_size=n_samples,
+            shape=shape,
+            verbose=False,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=uc,
+            eta=ddim_eta)
+        return rv
+    def save_images(self, pil_list, name, path, suffix=''):
+        canvas = auto_merge_imlist([np.array(i) for i in pil_list])
+        image_name = '{}{}.png'.format(name, suffix)
+        PIL.Image.fromarray(canvas).save(osp.join(path, image_name))
+    def __call__(self, **paras):
+        cfg = cfguh().cfg
+        cfgv = cfg.eval
+        net = paras['net']
+        eval_cnt = paras.get('eval_cnt', None)
+        fix_seed = cfgv.get('fix_seed', False)
+        LRANK = sync.get_rank('local')
+        LWSIZE = sync.get_world_size('local')
+        image_path = self.get_image_path()
+        self.create_dir(image_path)
+        eval_cnt = paras.get('eval_cnt', None)
+        suffix='' if eval_cnt is None else '_itern'+str(eval_cnt)
+        if isinstance(net, (torch.nn.DataParallel,
+                            torch.nn.parallel.DistributedDataParallel)):
+            netm = net.module
+        else:
+            netm = net
+        with_ema = getattr(netm, 'model_ema', None) is not None
+        sampler = self.sampler(netm)
+        setattr(netm, 'device', LRANK) # Trick
+        replicate = cfgv.get('replicate', 1)
+        conditioning = cfgv.conditioning * replicate
+        conditioning_local = conditioning[LRANK : len(conditioning) : LWSIZE]
+        seed_increment = [i for i in range(len(conditioning))][LRANK : len(conditioning) : LWSIZE]
+        for prompti, seedi in zip(conditioning_local, seed_increment):
+            if prompti == 'SKIP':
+                continue
+            draw_filename = prompti.strip().replace(' ', '-')
+            if fix_seed:
+                np.random.seed(cfg.env.rnd_seed + seedi)
+                torch.manual_seed(cfg.env.rnd_seed + seedi + 100)
+                suffixi = suffix + "_seed{}".format(cfg.env.rnd_seed + seedi + 100)
+            else:
+                suffixi = suffix
+            if with_ema:
+                with netm.ema_scope():
+                    x, _ = self.sample(netm, sampler, prompti, **cfgv.sample)
+            else:
+                x, _ = self.sample(netm, sampler, prompti, **cfgv.sample)
+            demo_image = latent2im(netm, x)
+            self.save_images(demo_image, draw_filename, image_path, suffix=suffixi)
+        if eval_cnt is not None:
+            print_log('Demo printed for {}'.format(eval_cnt))
+        return {}
+##################
+# eval variation #
+##################
+class eval_stage_variation(eval_stage):
+    @torch.no_grad()
+    def sample(self, net, sampler, visual_hint, output_dim, scale, n_samples, ddim_steps, ddim_eta):
+        h, w = output_dim
+        vh = tvtrans.ToTensor()(PIL.Image.open(visual_hint))[None].to(net.device)
+        c = net.get_learned_conditioning(vh)
+        c = c.repeat(n_samples, 1, 1)
+        uc = None
+        if scale != 1.0:
+            dummy = torch.zeros_like(vh)
+            uc = net.get_learned_conditioning(dummy)
+            uc = uc.repeat(n_samples, 1, 1)
+        shape = [4, h//8, w//8]
+        rv = sampler.sample(
+            S=ddim_steps,
+            conditioning=c,
+            batch_size=n_samples,
+            shape=shape,
+            verbose=False,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=uc,
+            eta=ddim_eta)
+        return rv
+    def __call__(self, **paras):
+        cfg = cfguh().cfg
+        cfgv = cfg.eval
+        net = paras['net']
+        eval_cnt = paras.get('eval_cnt', None)
+        fix_seed = cfgv.get('fix_seed', False)
+        LRANK = sync.get_rank('local')
+        LWSIZE = sync.get_world_size('local')
+        image_path = self.get_image_path()
+        self.create_dir(image_path)
+        eval_cnt = paras.get('eval_cnt', None)
+        suffix='' if eval_cnt is None else '_'+str(eval_cnt)
+        if isinstance(net, (torch.nn.DataParallel,
+                            torch.nn.parallel.DistributedDataParallel)):
+            netm = net.module
+        else:
+            netm = net
+        with_ema = getattr(netm, 'model_ema', None) is not None
+        sampler = self.sampler(netm)
+        setattr(netm, 'device', LRANK) # Trick
+        color_adj = cfguh().cfg.eval.get('color_adj', False)
+        color_adj_keep_ratio = cfguh().cfg.eval.get('color_adj_keep_ratio', 0.5)
+        color_adj_simple = cfguh().cfg.eval.get('color_adj_simple', True)
+        replicate = cfgv.get('replicate', 1)
+        conditioning = cfgv.conditioning * replicate
+        conditioning_local = conditioning[LRANK : len(conditioning) : LWSIZE]
+        seed_increment = [i for i in range(len(conditioning))][LRANK : len(conditioning) : LWSIZE]
+        for ci, seedi in zip(conditioning_local, seed_increment):
+            if ci == 'SKIP':
+                continue
+            draw_filename = osp.splitext(osp.basename(ci))[0]
+            if fix_seed:
+                np.random.seed(cfg.env.rnd_seed + seedi)
+                torch.manual_seed(cfg.env.rnd_seed + seedi + 100)
+                suffixi = suffix + "_seed{}".format(cfg.env.rnd_seed + seedi + 100)
+            else:
+                suffixi = suffix
+            if with_ema:
+                with netm.ema_scope():
+                    x, _ = self.sample(netm, sampler, ci, **cfgv.sample)
+            else:
+                x, _ = self.sample(netm, sampler, ci, **cfgv.sample)
+            demo_image = latent2im(netm, x)
+            if color_adj:
+                x_adj = []
+                for demoi in demo_image:
+                    color_adj_f = color_adjust(ref_from=demoi, ref_to=ci)
+                    xi_adj = color_adj_f(demoi, keep=color_adj_keep_ratio, simple=color_adj_simple)
+                    x_adj.append(xi_adj)
+                demo_image = x_adj
+            self.save_images(demo_image, draw_filename, image_path, suffix=suffixi)
+        if eval_cnt is not None:
+            print_log('Demo printed for {}'.format(eval_cnt))
+        return {}

lib/log_service.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import timeit
+import numpy as np
+import os
+import os.path as osp
+import shutil
+import copy
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+from .cfg_holder import cfg_unique_holder as cfguh
+from . import sync
+print_console_local_rank0_only = True
+def print_log(*console_info):
+    local_rank = sync.get_rank('local')
+    if print_console_local_rank0_only and (local_rank!=0):
+        return
+    console_info = [str(i) for i in console_info]
+    console_info = ' '.join(console_info)
+    print(console_info)
+    if local_rank!=0:
+        return
+    log_file = None
+    try:
+        log_file = cfguh().cfg.train.log_file
+    except:
+        try:
+            log_file = cfguh().cfg.eval.log_file
+        except:
+            return
+    if log_file is not None:
+        with open(log_file, 'a') as f:
+            f.write(console_info + '\n')
+class distributed_log_manager(object):
+    def __init__(self):
+        self.sum = {}
+        self.cnt = {}
+        self.time_check = timeit.default_timer()
+        cfgt = cfguh().cfg.train
+        use_tensorboard = getattr(cfgt, 'log_tensorboard', False)
+        self.ddp = sync.is_ddp()
+        self.rank = sync.get_rank('local')
+        self.world_size = sync.get_world_size('local')
+        self.tb = None
+        if use_tensorboard and (self.rank==0):
+            import tensorboardX
+            monitoring_dir = osp.join(cfguh().cfg.train.log_dir, 'tensorboard')
+            self.tb = tensorboardX.SummaryWriter(osp.join(monitoring_dir))
+    def accumulate(self, n, **data):
+        if n < 0:
+            raise ValueError
+        for itemn, di in data.items():
+            if itemn in self.sum:
+                self.sum[itemn] += di * n
+                self.cnt[itemn] += n
+            else:
+                self.sum[itemn] = di * n
+                self.cnt[itemn] = n
+    def get_mean_value_dict(self):
+        value_gather = [
+            self.sum[itemn]/self.cnt[itemn] \
+                for itemn in sorted(self.sum.keys()) ]
+        value_gather_tensor = torch.FloatTensor(value_gather).to(self.rank)
+        if self.ddp:
+            dist.all_reduce(value_gather_tensor, op=dist.ReduceOp.SUM)
+            value_gather_tensor /= self.world_size
+        mean = {}
+        for idx, itemn in enumerate(sorted(self.sum.keys())):
+            mean[itemn] = value_gather_tensor[idx].item()
+        return mean
+    def tensorboard_log(self, step, data, mode='train', **extra):
+        if self.tb is None:
+            return
+        if mode == 'train':
+            self.tb.add_scalar('other/epochn', extra['epochn'], step)
+            if 'lr' in extra:
+                self.tb.add_scalar('other/lr', extra['lr'], step)
+            for itemn, di in data.items():
+                if itemn.find('loss') == 0:
+                    self.tb.add_scalar('loss/'+itemn,  di, step)
+                elif itemn == 'Loss':
+                    self.tb.add_scalar('Loss',  di, step)
+                else:
+                    self.tb.add_scalar('other/'+itemn, di, step)
+        elif mode == 'eval':
+            if isinstance(data, dict):
+                for itemn, di in data.items():
+                    self.tb.add_scalar('eval/'+itemn, di, step)
+            else:
+                self.tb.add_scalar('eval', data, step)
+        return
+    def train_summary(self, itern, epochn, samplen, lr, tbstep=None):
+        console_info = [
+            'Iter:{}'.format(itern),
+            'Epoch:{}'.format(epochn),
+            'Sample:{}'.format(samplen),]
+        if lr is not None:
+            console_info += ['LR:{:.4E}'.format(lr)]
+        mean = self.get_mean_value_dict()
+        tbstep = itern if tbstep is None else tbstep
+        self.tensorboard_log(
+            tbstep, mean, mode='train',
+            itern=itern, epochn=epochn, lr=lr)
+        loss = mean.pop('Loss')
+        mean_info = ['Loss:{:.4f}'.format(loss)] + [
+            '{}:{:.4f}'.format(itemn, mean[itemn]) \
+                for itemn in sorted(mean.keys()) \
+                    if itemn.find('loss') == 0
+        ]
+        console_info += mean_info
+        console_info.append('Time:{:.2f}s'.format(
+            timeit.default_timer() - self.time_check))
+        return ' , '.join(console_info)
+    def clear(self):
+        self.sum = {}
+        self.cnt = {}
+        self.time_check = timeit.default_timer()
+    def tensorboard_close(self):
+        if self.tb is not None:
+            self.tb.close()
+# ----- also include some small utils -----
+def torch_to_numpy(*argv):
+    if len(argv) > 1:
+        data = list(argv)
+    else:
+        data = argv[0]
+    if isinstance(data, torch.Tensor):
+        return data.to('cpu').detach().numpy()
+    elif isinstance(data, (list, tuple)):
+        out = []
+        for di in data:
+            out.append(torch_to_numpy(di))
+        return out
+    elif isinstance(data, dict):
+        out = {}
+        for ni, di in data.items():
+            out[ni] = torch_to_numpy(di)
+        return out
+    else:
+        return data

lib/model_zoo/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .common.get_model import get_model
+from .common.get_optimizer import get_optimizer
+from .common.get_scheduler import get_scheduler
+from .common.utils import get_unit

lib/model_zoo/attention.py ADDED Viewed

	@@ -0,0 +1,435 @@

+from inspect import isfunction
+import math
+import torch
+import torch.nn.functional as F
+from torch import nn, einsum
+from einops import rearrange, repeat
+from .diffusion_utils import checkpoint
+def exists(val):
+    return val is not None
+def uniq(arr):
+    return{el: True for el in arr}.keys()
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+def max_neg_value(t):
+    return -torch.finfo(t.dtype).max
+def init_(tensor):
+    dim = tensor.shape[-1]
+    std = 1 / math.sqrt(dim)
+    tensor.uniform_(-std, std)
+    return tensor
+# feedforward
+class GEGLU(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+    def forward(self, x):
+        x, gate = self.proj(x).chunk(2, dim=-1)
+        return x * F.gelu(gate)
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        project_in = nn.Sequential(
+            nn.Linear(dim, inner_dim),
+            nn.GELU()
+        ) if not glu else GEGLU(dim, inner_dim)
+        self.net = nn.Sequential(
+            project_in,
+            nn.Dropout(dropout),
+            nn.Linear(inner_dim, dim_out)
+        )
+    def forward(self, x):
+        return self.net(x)
+def zero_module(module):
+    """
+    Zero out the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def Normalize(in_channels):
+    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+class LinearAttention(nn.Module):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+    def forward(self, x):
+        b, c, h, w = x.shape
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
+        k = k.softmax(dim=-1)
+        context = torch.einsum('bhdn,bhen->bhde', k, v)
+        out = torch.einsum('bhde,bhdn->bhen', context, q)
+        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
+        return self.to_out(out)
+class SpatialSelfAttention(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.k = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.v = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0)
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b,c,h,w = q.shape
+        q = rearrange(q, 'b c h w -> b (h w) c')
+        k = rearrange(k, 'b c h w -> b c (h w)')
+        w_ = torch.einsum('bij,bjk->bik', q, k)
+        w_ = w_ * (int(c)**(-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+        # attend to values
+        v = rearrange(v, 'b c h w -> b c (h w)')
+        w_ = rearrange(w_, 'b i j -> b j i')
+        h_ = torch.einsum('bij,bjk->bik', v, w_)
+        h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
+        h_ = self.proj_out(h_)
+        return x+h_
+class CrossAttention(nn.Module):
+    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+        self.scale = dim_head ** -0.5
+        self.heads = heads
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x, context=None, mask=None):
+        h = self.heads
+        q = self.to_q(x)
+        context = default(context, x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
+        if exists(mask):
+            mask = rearrange(mask, 'b ... -> b (...)')
+            max_neg_value = -torch.finfo(sim.dtype).max
+            mask = repeat(mask, 'b j -> (b h) () j', h=h)
+            sim.masked_fill_(~mask, max_neg_value)
+        # attention, what we cannot get enough of
+        attn = sim.softmax(dim=-1)
+        out = einsum('b i j, b j d -> b i d', attn, v)
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+        return self.to_out(out)
+class BasicTransformerBlock(nn.Module):
+    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
+                 disable_self_attn=False):
+        super().__init__()
+        self.disable_self_attn = disable_self_attn
+        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
+                                    context_dim=context_dim if self.disable_self_attn else None)  # is a self-attention if not self.disable_self_attn
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
+                                    heads=n_heads, dim_head=d_head, dropout=dropout)  # is self-attn if context is none
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+    def forward(self, x, context=None):
+        return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
+    def _forward(self, x, context=None):
+        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x
+        x = self.attn2(self.norm2(x), context=context) + x
+        x = self.ff(self.norm3(x)) + x
+        return x
+class SpatialTransformer(nn.Module):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    """
+    def __init__(self, in_channels, n_heads, d_head,
+                 depth=1, dropout=0., context_dim=None,
+                 disable_self_attn=False):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)
+        self.proj_in = nn.Conv2d(in_channels,
+                                 inner_dim,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.transformer_blocks = nn.ModuleList(
+            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim,
+                                   disable_self_attn=disable_self_attn)
+                for d in range(depth)]
+        )
+        self.proj_out = zero_module(nn.Conv2d(inner_dim,
+                                              in_channels,
+                                              kernel_size=1,
+                                              stride=1,
+                                              padding=0))
+    def forward(self, x, context=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, c, h, w = x.shape
+        x_in = x
+        x = self.norm(x)
+        x = self.proj_in(x)
+        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
+        for block in self.transformer_blocks:
+            x = block(x, context=context)
+        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+        x = self.proj_out(x)
+        return x + x_in
+##########################
+# transformer no context #
+##########################
+class BasicTransformerBlockNoContext(nn.Module):
+    def __init__(self, dim, n_heads, d_head, dropout=0., gated_ff=True, checkpoint=True):
+        super().__init__()
+        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head,
+                                    dropout=dropout, context_dim=None)
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head,
+                                    dropout=dropout, context_dim=None)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+    def forward(self, x):
+        return checkpoint(self._forward, (x,), self.parameters(), self.checkpoint)
+    def _forward(self, x):
+        x = self.attn1(self.norm1(x)) + x
+        x = self.attn2(self.norm2(x)) + x
+        x = self.ff(self.norm3(x)) + x
+        return x
+class SpatialTransformerNoContext(nn.Module):
+    """
+    Transformer block for image-like data.
+    First, project the input (aka embedding)
+    and reshape to b, t, d.
+    Then apply standard transformer action.
+    Finally, reshape to image
+    """
+    def __init__(self, in_channels, n_heads, d_head,
+                 depth=1, dropout=0.,):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = Normalize(in_channels)
+        self.proj_in = nn.Conv2d(in_channels,
+                                 inner_dim,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.transformer_blocks = nn.ModuleList(
+            [BasicTransformerBlockNoContext(inner_dim, n_heads, d_head, dropout=dropout)
+                for d in range(depth)]
+        )
+        self.proj_out = zero_module(nn.Conv2d(inner_dim,
+                                              in_channels,
+                                              kernel_size=1,
+                                              stride=1,
+                                              padding=0))
+    def forward(self, x):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, c, h, w = x.shape
+        x_in = x
+        x = self.norm(x)
+        x = self.proj_in(x)
+        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
+        for block in self.transformer_blocks:
+            x = block(x)
+        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+        x = self.proj_out(x)
+        return x + x_in
+#######################################
+# Spatial Transformer with Two Branch #
+#######################################
+class DualSpatialTransformer(nn.Module):
+    def __init__(self, in_channels, n_heads, d_head,
+                 depth=1, dropout=0., context_dim=None,
+                 disable_self_attn=False):
+        super().__init__()
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        # First crossattn
+        self.norm_0 = Normalize(in_channels)
+        self.proj_in_0 = nn.Conv2d(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        self.transformer_blocks_0 = nn.ModuleList(
+            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim,
+                                   disable_self_attn=disable_self_attn)
+                for d in range(depth)]
+        )
+        self.proj_out_0 = zero_module(nn.Conv2d(
+            inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
+        # Second crossattn
+        self.norm_1 = Normalize(in_channels)
+        self.proj_in_1 = nn.Conv2d(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        self.transformer_blocks_1 = nn.ModuleList(
+            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim,
+                                   disable_self_attn=disable_self_attn)
+                for d in range(depth)]
+        )
+        self.proj_out_1 = zero_module(nn.Conv2d(
+            inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
+    def forward(self, x, context=None, which=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        b, c, h, w = x.shape
+        x_in = x
+        if which==0:
+            norm, proj_in, blocks, proj_out = \
+                self.norm_0, self.proj_in_0, self.transformer_blocks_0, self.proj_out_0
+        elif which==1:
+            norm, proj_in, blocks, proj_out = \
+                self.norm_1, self.proj_in_1, self.transformer_blocks_1, self.proj_out_1
+        else:
+            # assert False, 'DualSpatialTransformer forward with a invalid which branch!'
+            # import numpy.random as npr
+            # rwhich = 0 if npr.rand() < which else 1
+            # context = context[rwhich]
+            # if rwhich==0:
+            #     norm, proj_in, blocks, proj_out = \
+            #         self.norm_0, self.proj_in_0, self.transformer_blocks_0, self.proj_out_0
+            # elif rwhich==1:
+            #     norm, proj_in, blocks, proj_out = \
+            #         self.norm_1, self.proj_in_1, self.transformer_blocks_1, self.proj_out_1
+            # import numpy.random as npr
+            # rwhich = 0 if npr.rand() < 0.33 else 1
+            # if rwhich==0:
+            #     context = context[rwhich]
+            #     norm, proj_in, blocks, proj_out = \
+            #         self.norm_0, self.proj_in_0, self.transformer_blocks_0, self.proj_out_0
+            # else:
+            norm, proj_in, blocks, proj_out = \
+                self.norm_0, self.proj_in_0, self.transformer_blocks_0, self.proj_out_0
+            x0 = norm(x)
+            x0 = proj_in(x0)
+            x0 = rearrange(x0, 'b c h w -> b (h w) c').contiguous()
+            for block in blocks:
+                x0 = block(x0, context=context[0])
+            x0 = rearrange(x0, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+            x0 = proj_out(x0)
+            norm, proj_in, blocks, proj_out = \
+                self.norm_1, self.proj_in_1, self.transformer_blocks_1, self.proj_out_1
+            x1 = norm(x)
+            x1 = proj_in(x1)
+            x1 = rearrange(x1, 'b c h w -> b (h w) c').contiguous()
+            for block in blocks:
+                x1 = block(x1, context=context[1])
+            x1 = rearrange(x1, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+            x1 = proj_out(x1)
+            return x0*which + x1*(1-which) + x_in
+        x = norm(x)
+        x = proj_in(x)
+        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
+        for block in blocks:
+            x = block(x, context=context)
+        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+        x = proj_out(x)
+        return x + x_in

lib/model_zoo/autoencoder.py ADDED Viewed

	@@ -0,0 +1,428 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from contextlib import contextmanager
+from lib.model_zoo.common.get_model import get_model, register
+from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
+from .diffusion_modules import Encoder, Decoder
+from .distributions import DiagonalGaussianDistribution
+class VQModel(nn.Module):
+    """
+    Vector-quantised autoencoder: Encoder -> 1x1 conv -> VectorQuantizer ->
+    1x1 conv -> Decoder.
+
+    NOTE(review): several methods use names that this file's import block does
+    not provide (`instantiate_from_config`, `LitEma`, `np`, `version`, `pl`,
+    `LambdaLR`) and attributes that `nn.Module` does not define
+    (`self.log`, `self.log_dict`, `self.global_step`, `self.learning_rate`,
+    `self.device`). This looks like code carried over from a PyTorch Lightning
+    module - confirm before relying on construction or the training/validation
+    paths.
+    """
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 n_embed,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 batch_resize_range=None,
+                 scheduler_config=None,
+                 lr_g_factor=1.0,
+                 remap=None,
+                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
+                 use_ema=False
+                 ):
+        # NOTE(review): `ignore_keys=[]` is a mutable default; it is only read
+        # (passed on to init_from_ckpt) here, never mutated.
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.n_embed = n_embed
+        self.image_key = image_key
+        # Encoder and decoder are built from the same config dict.
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        # NOTE(review): `instantiate_from_config` is not imported in this file.
+        self.loss = instantiate_from_config(lossconfig)
+        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
+                                        remap=remap,
+                                        sane_index_shape=sane_index_shape)
+        # 1x1 convolutions mapping between encoder/decoder channels and the
+        # codebook embedding width.
+        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            # random projection used by to_rgb()
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        self.batch_resize_range = batch_resize_range
+        if self.batch_resize_range is not None:
+            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
+        self.use_ema = use_ema
+        if self.use_ema:
+            # NOTE(review): `LitEma` is not imported in this file.
+            self.model_ema = LitEma(self)
+            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        self.scheduler_config = scheduler_config
+        self.lr_g_factor = lr_g_factor
+    @contextmanager
+    def ema_scope(self, context=None):
+        # Context manager: when EMA is enabled, swap the EMA weights in for
+        # the duration of the `with` body and restore the stored weights
+        # afterwards (also on error). `context` is only a label for the prints.
+        if self.use_ema:
+            self.model_ema.store(self.parameters())
+            self.model_ema.copy_to(self)
+            if context is not None:
+                print(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:
+            if self.use_ema:
+                self.model_ema.restore(self.parameters())
+                if context is not None:
+                    print(f"{context}: Restored training weights")
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        # Load the "state_dict" entry of the checkpoint at `path` (on CPU),
+        # drop every key that starts with one of `ignore_keys`, and load the
+        # rest non-strictly.
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print("Deleting key {} from state_dict.".format(k))
+                    # NOTE(review): no `break` after the delete - a key matching
+                    # two prefixes would hit `del` twice and raise KeyError.
+                    del sd[k]
+        missing, unexpected = self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        # NOTE(review): unexpected keys are only printed when there are
+        # missing keys as well.
+        if len(missing) > 0:
+            print(f"Missing Keys: {missing}")
+            print(f"Unexpected Keys: {unexpected}")
+    def on_train_batch_end(self, *args, **kwargs):
+        # Update the EMA copy from the current weights, when EMA is enabled.
+        if self.use_ema:
+            self.model_ema(self)
+    def encode(self, x):
+        # Returns the quantizer's triple: (quantised latents, embedding loss, info).
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        quant, emb_loss, info = self.quantize(h)
+        return quant, emb_loss, info
+    def encode_to_prequant(self, x):
+        # Same as encode() but stops before the quantizer.
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    def decode(self, quant):
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+    def decode_code(self, code_b):
+        # Look the codes up in the quantizer's codebook, then decode.
+        quant_b = self.quantize.embed_code(code_b)
+        dec = self.decode(quant_b)
+        return dec
+    def forward(self, input, return_pred_indices=False):
+        # Returns (reconstruction, quantisation loss), plus the third element
+        # of the quantizer's info tuple when `return_pred_indices` is set.
+        quant, diff, (_,_,ind) = self.encode(input)
+        dec = self.decode(quant)
+        if return_pred_indices:
+            return dec, diff, ind
+        return dec, diff
+    def get_input(self, batch, k):
+        # Fetch batch[k], add a trailing axis to 3-D input, and permute the
+        # last axis to position 1 (channels-last -> channels-first, presumably).
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        if self.batch_resize_range is not None:
+            lower_size = self.batch_resize_range[0]
+            upper_size = self.batch_resize_range[1]
+            # NOTE(review): `self.global_step` is not defined by nn.Module and
+            # `np` is not imported in this file.
+            if self.global_step <= 4:
+                # do the first few batches with max size to avoid later oom
+                new_resize = upper_size
+            else:
+                # random size in steps of 16 between the two bounds
+                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
+            if new_resize != x.shape[2]:
+                x = F.interpolate(x, size=new_resize, mode="bicubic")
+            x = x.detach()
+        return x
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        # optimizer_idx 0 -> autoencoder loss, 1 -> discriminator loss;
+        # any other value falls through and returns None.
+        # https://github.com/pytorch/pytorch/issues/37142
+        # try not to fool the heuristics
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        if optimizer_idx == 0:
+            # autoencode
+            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train",
+                                            predicted_indices=ind)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return aeloss
+        if optimizer_idx == 1:
+            # discriminator
+            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        # Validate once with the current weights and once inside ema_scope();
+        # only the first result is returned, `log_dict_ema` is unused.
+        log_dict = self._validation_step(batch, batch_idx)
+        with self.ema_scope():
+            log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
+        return log_dict
+    def _validation_step(self, batch, batch_idx, suffix=""):
+        # Computes both losses under the split name "val"+suffix and logs them.
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
+                                        self.global_step,
+                                        last_layer=self.get_last_layer(),
+                                        split="val"+suffix,
+                                        predicted_indices=ind
+                                        )
+        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
+                                            self.global_step,
+                                            last_layer=self.get_last_layer(),
+                                            split="val"+suffix,
+                                            predicted_indices=ind
+                                            )
+        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
+        self.log(f"val{suffix}/rec_loss", rec_loss,
+                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        self.log(f"val{suffix}/aeloss", aeloss,
+                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        # NOTE(review): `version` and `pl` are not imported in this file.
+        # rec_loss was logged explicitly above, so it is removed from the dict
+        # before the dict itself is logged.
+        if version.parse(pl.__version__) >= version.parse('1.4.0'):
+            del log_dict_ae[f"val{suffix}/rec_loss"]
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        # NOTE(review): this returns the bound method `self.log_dict`, not a dict.
+        return self.log_dict
+    def configure_optimizers(self):
+        # Two Adam optimizers: one over encoder/decoder/quantizer/1x1 convs
+        # (lr scaled by lr_g_factor) and one over the loss' discriminator.
+        # NOTE(review): `self.learning_rate` is never set in this class.
+        lr_d = self.learning_rate
+        lr_g = self.lr_g_factor*self.learning_rate
+        print("lr_d", lr_d)
+        print("lr_g", lr_g)
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quantize.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr_g, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr_d, betas=(0.5, 0.9))
+        if self.scheduler_config is not None:
+            # NOTE(review): `LambdaLR` is not imported in this file.
+            scheduler = instantiate_from_config(self.scheduler_config)
+            print("Setting up LambdaLR scheduler...")
+            # one per-step scheduler for each optimizer, sharing the same schedule
+            scheduler = [
+                {
+                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                },
+                {
+                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                },
+            ]
+            return [opt_ae, opt_disc], scheduler
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        # Weight of the decoder's output convolution; passed to the loss as `last_layer`.
+        return self.decoder.conv_out.weight
+    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
+        # Build a dict with "inputs", "reconstructions" and, when `plot_ema`
+        # is set, "reconstructions_ema". With `only_inputs` just "inputs".
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        if only_inputs:
+            log["inputs"] = x
+            return log
+        xrec, _ = self(x)
+        if x.shape[1] > 3:
+            # colorize with random projection
+            assert xrec.shape[1] > 3
+            x = self.to_rgb(x)
+            xrec = self.to_rgb(xrec)
+        log["inputs"] = x
+        log["reconstructions"] = xrec
+        if plot_ema:
+            with self.ema_scope():
+                xrec_ema, _ = self(x)
+                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
+                log["reconstructions_ema"] = xrec_ema
+        return log
+    def to_rgb(self, x):
+        # Project a many-channel input to 3 channels with the (lazily created)
+        # random `colorize` kernel and rescale the result to [-1, 1].
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        # min-max normalisation; divides by zero when x is constant
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+class VQModelInterface(VQModel):
+    def __init__(self, embed_dim, *args, **kwargs):
+        super().__init__(embed_dim=embed_dim, *args, **kwargs)
+        self.embed_dim = embed_dim
+    def encode(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    def decode(self, h, force_not_quantize=False):
+        # also go through quantization layer
+        if not force_not_quantize:
+            quant, emb_loss, info = self.quantize(h)
+        else:
+            quant = h
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+@register('autoencoderkl')
+class AutoencoderKL(nn.Module):
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,):
+        super().__init__()
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+    def encode(self, x):
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+    def decode(self, z):
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+    def forward(self, input, sample_posterior=True):
+        posterior = self.encode(input)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        return dec, posterior
+    def get_input(self, batch, k):
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        return x
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        if optimizer_idx == 0:
+            # train encoder+decoder+logvar
+            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            return aeloss
+        if optimizer_idx == 1:
+            # train the discriminator
+            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                                last_layer=self.get_last_layer(), split="train")
+            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                        last_layer=self.get_last_layer(), split="val")
+        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                            last_layer=self.get_last_layer(), split="val")
+        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        return self.log_dict
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr, betas=(0.5, 0.9))
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        return self.decoder.conv_out.weight
+    @torch.no_grad()
+    def log_images(self, batch, only_inputs=False, **kwargs):
+        """Collect the tensors to visualise for one batch.
+        Always returns the inputs under ``"inputs"``; unless ``only_inputs``
+        is set, also returns ``"samples"`` (decoded noise shaped like a
+        posterior sample) and ``"reconstructions"``.
+        """
+        x = self.get_input(batch, self.image_key).to(self.device)
+        if only_inputs:
+            return {"inputs": x}
+        xrec, posterior = self(x)
+        if x.shape[1] > 3:
+            # colorize with random projection
+            assert xrec.shape[1] > 3
+            x = self.to_rgb(x)
+            xrec = self.to_rgb(xrec)
+        log = {}
+        log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+        log["reconstructions"] = xrec
+        log["inputs"] = x
+        return log
+    def to_rgb(self, x):
+        """Project ``x`` to 3 channels with a fixed random 1x1 convolution.
+        The projection weights are drawn once and stored in the ``colorize``
+        buffer; the result is min-max rescaled to the range [-1, 1].
+        """
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            projection = torch.randn(3, x.shape[1], 1, 1).to(x)
+            self.register_buffer("colorize", projection)
+        projected = F.conv2d(x, weight=self.colorize)
+        lowest, highest = projected.min(), projected.max()
+        return 2.*(projected-lowest)/(highest-lowest) - 1.
+class IdentityFirstStage(nn.Module):
+    """First-stage stand-in whose encode/decode/forward return the input unchanged."""
+    def __init__(self, *args, vq_interface=False, **kwargs):
+        super().__init__()
+        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
+    def encode(self, x, *args, **kwargs):
+        """Return ``x`` unchanged."""
+        return x
+    def decode(self, x, *args, **kwargs):
+        """Return ``x`` unchanged."""
+        return x
+    def quantize(self, x, *args, **kwargs):
+        """Return ``x``; with ``vq_interface`` set, wrap it in a 3-tuple with placeholder ``None`` entries."""
+        if not self.vq_interface:
+            return x
+        return x, None, [None, None, None]
+    def forward(self, x, *args, **kwargs):
+        """Return ``x`` unchanged."""
+        return x

lib/model_zoo/bert.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import torch
+import torch.nn as nn
+from functools import partial
+# from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a requirement? --> test
+class AbstractEncoder(nn.Module):
+    """Base class for encoders; subclasses must override ``encode``."""
+    def __init__(self):
+        super().__init__()
+    def encode(self, *args, **kwargs):
+        """Not implemented here; always raises ``NotImplementedError``."""
+        raise NotImplementedError()
+class ClassEmbedder(nn.Module):
+    """Look up a learned embedding for the class labels stored in a batch dict."""
+    def __init__(self, embed_dim, n_classes=1000, key='class'):
+        super().__init__()
+        self.key = key
+        self.embedding = nn.Embedding(n_classes, embed_dim)
+    def forward(self, batch, key=None):
+        """Embed ``batch[key]`` (``self.key`` when ``key`` is None)."""
+        lookup_key = self.key if key is None else key
+        # insert a singleton axis after the first one; this is for use in crossattn
+        labels = batch[lookup_key][:, None]
+        return self.embedding(labels)
+class TransformerEmbedder(AbstractEncoder):
+    """Some transformer encoder layers"""
+    # NOTE(review): ``TransformerWrapper`` and ``Encoder`` are not imported in
+    # the visible part of this module (the import above is commented out), so
+    # constructing this class presumably raises NameError -- confirm.
+    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77):
+        super().__init__()
+        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
+                                              attn_layers=Encoder(dim=n_embed, depth=n_layer))
+    def forward(self, tokens):
+        # Ask the wrapped transformer for embeddings via ``return_embeddings=True``.
+        z = self.transformer(tokens, return_embeddings=True)
+        return z
+    def encode(self, x):
+        # ``encode`` is an alias for calling the module.
+        return self(x)
+class BERTTokenizer(AbstractEncoder):
+    """Wraps the pretrained huggingface BERT tokenizer. Vocab size: 30522 (?)"""
+    def __init__(self, device="cuda", vq_interface=True, max_length=77):
+        super().__init__()
+        from transformers import BertTokenizerFast  # TODO: add to requirements
+        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        self.vq_interface = vq_interface
+        self.max_length = max_length
+    def forward(self, text):
+        """Tokenize ``text``, padded/truncated to ``max_length``, and return the ``input_ids`` tensor."""
+        encoded = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        return encoded["input_ids"]
+    @torch.no_grad()
+    def encode(self, text):
+        """Return the token ids, wrapped as ``(None, None, [None, None, tokens])`` when ``vq_interface`` is set."""
+        token_ids = self(text)
+        if self.vq_interface:
+            return None, None, [None, None, token_ids]
+        return token_ids
+    def decode(self, text):
+        """Return ``text`` unchanged."""
+        return text
+class BERTEmbedder(AbstractEncoder):
+    """Uses the BERT tokenizer model and adds some transformer encoder layers.
+    NOTE(review): ``TransformerWrapper`` and ``Encoder`` are not imported in
+    the visible part of this module, so ``__init__`` presumably raises
+    NameError until that import is restored -- confirm.
+    """
+    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
+                 ckpt_path=None, ignore_keys=(), device="cuda", use_tokenizer=True, embedding_dropout=0.0):
+        super().__init__()
+        self.use_tknz_fn = use_tokenizer
+        if self.use_tknz_fn:
+            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
+        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
+                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
+                                              emb_dropout=embedding_dropout)
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+    def init_from_ckpt(self, path, ignore_keys=()):
+        """Load a state dict from ``path``, dropping keys that start with any ignored prefix.
+        Loading is non-strict; the number of missing and unexpected keys is printed.
+        """
+        sd = torch.load(path, map_location="cpu")
+        # str.startswith accepts a tuple, so each key is tested (and deleted)
+        # exactly once even when several prefixes match it; deleting once per
+        # matching prefix raised KeyError on the second match.
+        prefixes = tuple(ignore_keys)
+        for k in list(sd.keys()):
+            if k.startswith(prefixes):
+                print("Deleting key {} from state_dict.".format(k))
+                del sd[k]
+        missing, unexpected = self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+    def forward(self, text):
+        """Tokenize ``text`` when a tokenizer is configured, then return the transformer embeddings."""
+        tokens = self.tknz_fn(text) if self.use_tknz_fn else text
+        device = self.transformer.token_emb.weight.device # a trick to get device
+        tokens = tokens.to(device)
+        return self.transformer(tokens, return_embeddings=True)
+    def encode(self, text):
+        # output of length 77
+        return self(text)
+class SpatialRescaler(nn.Module):
+    """Rescale a tensor ``n_stages`` times by ``multiplier``, then optionally remap channels with a 1x1 conv."""
+    def __init__(self,
+                 n_stages=1,
+                 method='bilinear',
+                 multiplier=0.5,
+                 in_channels=3,
+                 out_channels=None,
+                 bias=False):
+        super().__init__()
+        self.n_stages = n_stages
+        assert self.n_stages >= 0
+        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
+        self.multiplier = multiplier
+        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
+        self.remap_output = out_channels is not None
+        if not self.remap_output:
+            return
+        print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
+        self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
+    def forward(self,x):
+        """Apply the interpolation stages, then the channel mapper if one was configured."""
+        remaining = self.n_stages
+        while remaining > 0:
+            x = self.interpolator(x, scale_factor=self.multiplier)
+            remaining -= 1
+        return self.channel_mapper(x) if self.remap_output else x
+    def encode(self, x):
+        """Alias for calling the module."""
+        return self(x)

lib/model_zoo/clip.py ADDED Viewed

	@@ -0,0 +1,226 @@

+import torch
+import torch.nn as nn
+import numpy as np
+from functools import partial
+from lib.model_zoo.common.get_model import register
+# Version tag passed to ``register`` by every model class registered in this module.
+version = '0'
+symbol = 'clip'
+class AbstractEncoder(nn.Module):
+    """Base class for encoders; subclasses must override ``encode``."""
+    def __init__(self):
+        super().__init__()
+    def encode(self, *args, **kwargs):
+        """Not implemented here; always raises ``NotImplementedError``."""
+        raise NotImplementedError()
+from transformers import CLIPTokenizer, CLIPTextModel
+def disabled_train(self, mode=True):
+    """Replacement for ``model.train`` that ignores ``mode`` and returns the
+    model untouched, so train/eval mode can no longer be switched."""
+    model = self
+    return model
+@register('clip_text_frozen', version)
+class FrozenCLIPTextEmbedder(AbstractEncoder):
+    """Frozen huggingface CLIP text transformer used as a text encoder."""
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):  # clip-vit-base-patch32
+        super().__init__()
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length   # TODO: typical value?
+        self.freeze()
+    def freeze(self):
+        """Switch the transformer to eval mode and stop gradients for every parameter."""
+        self.transformer = self.transformer.eval()
+        # (overriding self.train with disabled_train is left switched off)
+        for frozen_param in self.parameters():
+            frozen_param.requires_grad = False
+    def forward(self, text):
+        """Tokenize ``text`` and return the text model's last hidden state."""
+        encoded = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        token_ids = encoded["input_ids"].to(self.device)
+        return self.transformer(input_ids=token_ids).last_hidden_state
+    def encode(self, text):
+        """Alias for calling the module."""
+        return self(text)
+from transformers import CLIPProcessor, CLIPVisionModel
+@register('clip_vision_frozen', version)
+class FrozenCLIPVisionEmbedder(AbstractEncoder):
+    """Frozen huggingface CLIP vision transformer used as an image encoder."""
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):  # clip-vit-base-patch32
+        super().__init__()
+        self.processor = CLIPProcessor.from_pretrained(version)
+        self.transformer = CLIPVisionModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length   # TODO: typical value?
+        self.freeze()
+    def freeze(self):
+        """Switch the transformer to eval mode and stop gradients for every parameter."""
+        self.transformer = self.transformer.eval()
+        # (overriding self.train with disabled_train is left switched off)
+        for frozen_param in self.parameters():
+            frozen_param.requires_grad = False
+    def forward(self, images):
+        """Run the processor on ``images`` and return the vision model's last hidden state."""
+        processed = self.processor(images=images, return_tensors="pt")
+        pixel_values = processed['pixel_values'].to(self.device)
+        return self.transformer(pixel_values=pixel_values).last_hidden_state
+    def encode(self, image):
+        """Alias for calling the module."""
+        return self(image)
+from transformers import CLIPModel
+@register('clip_frozen', version)
+class FrozenCLIP(AbstractEncoder):
+    """Frozen huggingface CLIP model exposing several text and vision encodings.
+    ``encode`` dispatches to the method whose name is stored in ``encode_type``.
+    """
+    def __init__(self,
+                 version="openai/clip-vit-large-patch14",
+                 max_length=77,
+                 encode_type='encode_text',):  # clip-vit-base-patch32
+        super().__init__()
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.processor = CLIPProcessor.from_pretrained(version)
+        self.model = CLIPModel.from_pretrained(version)
+        self.max_length = max_length  # TODO: typical value?
+        self.encode_type = encode_type
+        # Pseudo-inverse of the text projection, computed on first use by
+        # encode_vision_pinvtext.
+        self.pinv_text_projection = None
+        self.freeze()
+    def get_device(self):
+        # A trick to get device
+        return self.model.text_projection.weight.device
+    def freeze(self):
+        """Switch the model to eval mode and stop gradients for every parameter."""
+        self.model = self.model.eval()
+        #self.train = disabled_train
+        for param in self.parameters():
+            param.requires_grad = False
+    def _tokenize(self, text):
+        """Tokenize ``text`` (padded/truncated to ``max_length``) and move the ids to the model device."""
+        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
+                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
+        return batch_encoding["input_ids"].to(self.get_device())
+    def _preprocess(self, images):
+        """Run the CLIP processor on ``images`` and move the pixel values to the model device."""
+        inputs = self.processor(images=images, return_tensors="pt")
+        return inputs['pixel_values'].to(self.get_device())
+    def encode_text_pooled(self, text):
+        """Return ``get_text_features`` of the model for ``text``."""
+        return self.model.get_text_features(input_ids=self._tokenize(text))
+    def encode_vision_pooled(self, images):
+        """Return ``get_image_features`` of the model for ``images``."""
+        return self.model.get_image_features(pixel_values=self._preprocess(images))
+    def encode_text_noproj(self, text):
+        """Return the text model's last hidden state, without the text projection."""
+        outputs = self.model.text_model(input_ids=self._tokenize(text))
+        return outputs.last_hidden_state
+    def encode_vision_noproj(self, images):
+        """Return the vision model's last hidden state, without the visual projection."""
+        outputs = self.model.vision_model(pixel_values=self._preprocess(images))
+        return outputs.last_hidden_state
+    def encode_text_bug(self, text):
+        """Variant of ``encode_text`` that normalizes by the un-projected pooled
+        output *before* applying the text projection. Kept unchanged."""
+        outputs = self.model.text_model(input_ids=self._tokenize(text))
+        z = outputs.last_hidden_state
+        z_pooled = outputs.pooler_output
+        z = z / torch.norm(z_pooled.unsqueeze(1), dim=-1, keepdim=True)
+        return self.model.text_projection(z)
+    def encode_text(self, text):
+        """Project every hidden state and divide by the norm of the projected pooled output."""
+        outputs = self.model.text_model(input_ids=self._tokenize(text))
+        z = self.model.text_projection(outputs.last_hidden_state)
+        z_pooled = self.model.text_projection(outputs.pooler_output)
+        z = z / torch.norm(z_pooled.unsqueeze(1), dim=-1, keepdim=True)
+        return z
+    def encode_vision(self, images):
+        """Layer-norm and project every vision hidden state, then divide by the
+        norm of the projected token at index 0."""
+        z = self.encode_vision_noproj(images)
+        z = self.model.vision_model.post_layernorm(z)
+        z = self.model.visual_projection(z)
+        z_pooled = z[:, 0:1]
+        z = z / torch.norm(z_pooled, dim=-1, keepdim=True)
+        return z
+    def encode_vision_pinvtext(self, images):
+        """Map vision encodings through the pseudo-inverse of the text projection.
+        Each vector is rescaled to the constant norm below; the sequence is
+        then flipped along axis 1 and cut to its first 77 entries.
+        """
+        blank_text_encode_norm_avg = 28.9096
+        z = self.encode_vision(images)
+        pinv = self.pinv_text_projection
+        # The cache is a plain attribute, so it does not follow the module
+        # when it is moved; recompute it if the device no longer matches.
+        if pinv is None or pinv.device != z.device:
+            pinv = torch.linalg.pinv(self.model.text_projection.weight).T
+            self.pinv_text_projection = pinv
+        z = torch.matmul(z, pinv)
+        z = z / torch.norm(z, dim=-1, keepdim=True)
+        z = z*blank_text_encode_norm_avg
+        # The former blank-prompt text forward pass was never used in the
+        # result, so it is no longer computed.
+        return torch.flip(z, dims=(1,))[:, 0:77]
+    def encode(self, *args, **kwargs):
+        """Dispatch to the encoding method named by ``encode_type``."""
+        return getattr(self, self.encode_type)(*args, **kwargs)
+#############################
+# copied from justin's code #
+#############################
+@register('clip_vision_frozen_justin', version)
+class FrozenCLIPVisionEmbedder_Justin(AbstractEncoder):
+    """
+        Frozen CLIP image encoder loaded through the bundled ``clip_justin`` package.
+        """
+    def __init__(
+            self,
+            model='ViT-L/14',
+            jit=False,
+            device='cuda' if torch.cuda.is_available() else 'cpu',
+            antialias=False,
+        ):
+        super().__init__()
+        from . import clip_justin
+        loaded_model, _ = clip_justin.load(name=model, device=device, jit=jit)
+        self.model = loaded_model
+        self.device = device
+        self.antialias = antialias
+        # CLIP normalization statistics, kept as non-persistent buffers
+        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+        # I didn't call this originally, but seems like it was frozen anyway
+        self.freeze()
+    def freeze(self):
+        """Put the model in eval mode (also stored as ``self.transformer``) and stop all gradients."""
+        self.transformer = self.model.eval()
+        for frozen_param in self.parameters():
+            frozen_param.requires_grad = False
+    def preprocess(self, x):
+        """Resize to 224x224, map from [-1, 1] to [0, 1], and apply the CLIP normalization."""
+        import kornia
+        # Expects inputs in the range -1, 1
+        resized = kornia.geometry.resize(x, (224, 224),
+                                         interpolation='bicubic',align_corners=True,
+                                         antialias=self.antialias)
+        unit_range = (resized + 1.) / 2.
+        # renormalize according to clip
+        return kornia.enhance.normalize(unit_range, self.mean, self.std)
+    def forward(self, x):
+        # x is assumed to be in range [-1,1]
+        prepared = self.preprocess(x)
+        return self.model.encode_image(prepared).float()
+    def encode(self, im):
+        """Encode ``im`` and insert a singleton axis at position 1."""
+        return self(im).unsqueeze(1)

lib/model_zoo/clip_justin/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .clip import load

lib/model_zoo/clip_justin/clip.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import hashlib
+import os
+import urllib
+import warnings
+from typing import Any, Union, List
+from pkg_resources import packaging
+import torch
+from PIL import Image
+from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
+from tqdm import tqdm
+from .model import build_model
+# from .simple_tokenizer import SimpleTokenizer as _Tokenizer
+try:
+    from torchvision.transforms import InterpolationMode
+    BICUBIC = InterpolationMode.BICUBIC
+except ImportError:
+    # torchvision without InterpolationMode: fall back to the PIL constant.
+    BICUBIC = Image.BICUBIC
+if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
+    warnings.warn("PyTorch version 1.7.1 or higher is recommended")
+# ``tokenize`` is commented out in this copy of the module, so it must not be
+# exported: a name listed in __all__ but not defined makes
+# ``from <module> import *`` raise AttributeError.
+__all__ = ["available_models", "load"]
+# _tokenizer = _Tokenizer()
+# Checkpoint URL for each supported model name. ``_download`` takes the
+# second-to-last path segment of the URL as the expected SHA256 of the file.
+_MODELS = {
+    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
+    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
+    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
+    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
+    "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
+    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
+}
+def _sha256_of_file(path: str) -> str:
+    """Return the hex SHA256 digest of the file at ``path``, read in chunks."""
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        for chunk in iter(lambda: stream.read(1 << 20), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+def _download(url: str, root: str):
+    """Download ``url`` into directory ``root`` and return the local file path.
+    The second-to-last path segment of ``url`` is taken as the expected SHA256
+    of the file. An existing file with a matching checksum is reused without
+    any network access; a mismatching one is downloaded again with a warning.
+    Raises
+    ------
+    RuntimeError
+        If the target path exists but is not a regular file, or if the
+        downloaded file does not match the expected checksum.
+    """
+    # ``import urllib`` alone does not guarantee the ``request`` submodule is loaded.
+    import urllib.request
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, filename)
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+    if os.path.isfile(download_target):
+        if _sha256_of_file(download_target) == expected_sha256:
+            return download_target
+        warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        # The header may be absent; tqdm accepts total=None in that case.
+        content_length = source.info().get("Content-Length")
+        total = int(content_length) if content_length is not None else None
+        with tqdm(total=total, ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+                output.write(buffer)
+                loop.update(len(buffer))
+    if _sha256_of_file(download_target) != expected_sha256:
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
+    return download_target
+def _convert_image_to_rgb(image):
+    """Return ``image`` converted to the "RGB" mode."""
+    rgb_image = image.convert("RGB")
+    return rgb_image
+def _transform(n_px):
+    """Build the preprocessing pipeline: bicubic resize and center crop to
+    ``n_px``, RGB conversion, tensor conversion, and CLIP normalization."""
+    clip_mean = (0.48145466, 0.4578275, 0.40821073)
+    clip_std = (0.26862954, 0.26130258, 0.27577711)
+    steps = [
+        Resize(n_px, interpolation=BICUBIC),
+        CenterCrop(n_px),
+        _convert_image_to_rgb,
+        ToTensor(),
+        Normalize(clip_mean, clip_std),
+    ]
+    return Compose(steps)
+def available_models() -> List[str]:
+    """Returns the names of available CLIP models"""
+    names = list(_MODELS)
+    return names
+def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
+    """Load a CLIP model
+    Parameters
+    ----------
+    name : str
+        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
+    device : Union[str, torch.device]
+        The device to put the loaded model
+    jit : bool
+        Whether to load the optimized JIT model or more hackable non-JIT model (default).
+    download_root: str
+        path to download the model files; by default, it uses "~/.cache/clip"
+    Returns
+    -------
+    model : torch.nn.Module
+        The CLIP model
+    preprocess : Callable[[PIL.Image], torch.Tensor]
+        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
+    Raises
+    ------
+    RuntimeError
+        If `name` is neither a known model name nor an existing file
+    """
+    # Resolve `name` to a local checkpoint path, downloading known models on demand.
+    if name in _MODELS:
+        model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
+    elif os.path.isfile(name):
+        model_path = name
+    else:
+        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
+    with open(model_path, 'rb') as opened_file:
+        try:
+            # loading JIT archive
+            model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval()
+            state_dict = None
+        except RuntimeError:
+            # loading saved state dict
+            if jit:
+                warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
+                jit = False
+            state_dict = torch.load(opened_file, map_location="cpu")
+    if not jit:
+        # Non-JIT path: rebuild the model from the state dict (taken from the
+        # JIT archive when one was loaded) and move it to `device`.
+        model = build_model(state_dict or model.state_dict()).to(device)
+        if str(device) == "cpu":
+            model.float()
+        return model, _transform(model.visual.input_resolution)
+    # patch the device names
+    # A traced no-op function provides a constant node carrying the target device.
+    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
+    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
+    def patch_device(module):
+        # Overwrite every constant whose value starts with "cuda" with the target device node.
+        try:
+            graphs = [module.graph] if hasattr(module, "graph") else []
+        except RuntimeError:
+            graphs = []
+        if hasattr(module, "forward1"):
+            graphs.append(module.forward1.graph)
+        for graph in graphs:
+            for node in graph.findAllNodes("prim::Constant"):
+                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
+                    node.copyAttributes(device_node)
+    model.apply(patch_device)
+    patch_device(model.encode_image)
+    patch_device(model.encode_text)
+    # patch dtype to float32 on CPU
+    if str(device) == "cpu":
+        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
+        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
+        float_node = float_input.node()
+        def patch_float(module):
+            # Rewrite the dtype argument of each `aten::to` call whose constant value is 5.
+            try:
+                graphs = [module.graph] if hasattr(module, "graph") else []
+            except RuntimeError:
+                graphs = []
+            if hasattr(module, "forward1"):
+                graphs.append(module.forward1.graph)
+            for graph in graphs:
+                for node in graph.findAllNodes("aten::to"):
+                    inputs = list(node.inputs())
+                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
+                        if inputs[i].node()["value"] == 5:
+                            inputs[i].node().copyAttributes(float_node)
+        model.apply(patch_float)
+        patch_float(model.encode_image)
+        patch_float(model.encode_text)
+        model.float()
+    return model, _transform(model.input_resolution.item())
+# def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
+#     """
+#     Returns the tokenized representation of given input string(s)
+#     Parameters
+#     ----------
+#     texts : Union[str, List[str]]
+#         An input string or a list of input strings to tokenize
+#     context_length : int
+#         The context length to use; all CLIP models use 77 as the context length
+#     truncate: bool
+#         Whether to truncate the text in case its encoding is longer than the context length
+#     Returns
+#     -------
+#     A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
+#     We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
+#     """
+#     if isinstance(texts, str):
+#         texts = [texts]
+#     sot_token = _tokenizer.encoder["<|startoftext|>"]
+#     eot_token = _tokenizer.encoder["<|endoftext|>"]
+#     all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
+#     if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"):
+#         result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+#     else:
+#         result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)
+#     for i, tokens in enumerate(all_tokens):
+#         if len(tokens) > context_length:
+#             if truncate:
+#                 tokens = tokens[:context_length]
+#                 tokens[-1] = eot_token
+#             else:
+#                 raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
+#         result[i, :len(tokens)] = torch.tensor(tokens)
+#     return result

lib/model_zoo/clip_justin/model.py ADDED Viewed

	@@ -0,0 +1,436 @@

+from collections import OrderedDict
+from typing import Tuple, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+class Bottleneck(nn.Module):
+    """Residual bottleneck block: 1x1 conv, 3x3 conv, optional average pool, 1x1 conv."""
+    expansion = 4
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        out_planes = planes * self.expansion
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+        self.conv3 = nn.Conv2d(planes, out_planes, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+        needs_projection = stride > 1 or inplanes != planes * Bottleneck.expansion
+        if needs_projection:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            shortcut_layers = OrderedDict()
+            shortcut_layers["-1"] = nn.AvgPool2d(stride)
+            shortcut_layers["0"] = nn.Conv2d(inplanes, out_planes, 1, stride=1, bias=False)
+            shortcut_layers["1"] = nn.BatchNorm2d(out_planes)
+            self.downsample = nn.Sequential(shortcut_layers)
+    def forward(self, x: torch.Tensor):
+        """Run the three-conv main path and add the (optionally projected) shortcut."""
+        main = self.relu1(self.bn1(self.conv1(x)))
+        main = self.relu2(self.bn2(self.conv2(main)))
+        main = self.avgpool(main)
+        main = self.bn3(self.conv3(main))
+        shortcut = x if self.downsample is None else self.downsample(x)
+        main += shortcut
+        return self.relu3(main)
+class AttentionPool2d(nn.Module):
+    """Pool a feature map with multi-head attention queried by its mean token."""
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+    def forward(self, x):
+        """Flatten the spatial axes, prepend the mean token, and attend from it."""
+        tokens = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
+        mean_token = tokens.mean(dim=0, keepdim=True)
+        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
+        tokens = tokens + self.positional_embedding[:, None, :].to(tokens.dtype)  # (HW+1)NC
+        qkv_bias = torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
+        pooled, _ = F.multi_head_attention_forward(
+            query=tokens[:1], key=tokens, value=tokens,
+            embed_dim_to_check=tokens.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=qkv_bias,
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+        return pooled.squeeze(0)
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet variant that differs from torchvision's in the following ways:
+    - The stem has 3 convolutions instead of 1, followed by an average pool instead of a max pool.
+    - Strided convolutions are anti-aliased: an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+        half_width = width // 2
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(3, half_width, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(half_width)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(half_width, half_width, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(half_width)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv3 = nn.Conv2d(half_width, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(2)
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
+    def _make_layer(self, planes, blocks, stride=1):
+        """Stack ``blocks`` bottlenecks; only the first one receives ``stride``."""
+        stage = [Bottleneck(self._inplanes, planes, stride)]
+        self._inplanes = planes * Bottleneck.expansion
+        stage.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
+        return nn.Sequential(*stage)
+    def forward(self, x):
+        """Run the stem, the four residual stages, and the attention pool."""
+        x = x.type(self.conv1.weight.dtype)
+        stem_layers = (
+            (self.conv1, self.bn1, self.relu1),
+            (self.conv2, self.bn2, self.relu2),
+            (self.conv3, self.bn3, self.relu3),
+        )
+        for conv, norm, act in stem_layers:
+            x = act(norm(conv(x)))
+        x = self.avgpool(x)
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            x = stage(x)
+        return self.attnpool(x)
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(OrderedDict([
+            ("c_fc", nn.Linear(d_model, d_model * 4)),
+            ("gelu", QuickGELU()),
+            ("c_proj", nn.Linear(d_model * 4, d_model))
+        ]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class Transformer(nn.Module):
+    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+class VisionTransformer(nn.Module):
+    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
+        scale = width ** -0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+        self.transformer = Transformer(width, layers, heads)
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_post(x[:, 0, :])
+        if self.proj is not None:
+            x = x @ self.proj
+        return x
+class CLIP(nn.Module):
+    def __init__(self,
+                 embed_dim: int,
+                 # vision
+                 image_resolution: int,
+                 vision_layers: Union[Tuple[int, int, int, int], int],
+                 vision_width: int,
+                 vision_patch_size: int,
+                 # text
+                 context_length: int,
+                 vocab_size: int,
+                 transformer_width: int,
+                 transformer_heads: int,
+                 transformer_layers: int
+                 ):
+        super().__init__()
+        self.context_length = context_length
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width
+            )
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim
+            )
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask()
+        )
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.initialize_parameters()
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
+        attn_std = self.transformer.width ** -0.5
+        fc_std = (2 * self.transformer.width) ** -0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+        if self.text_projection is not None:
+            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+    def encode_text(self, text):
+        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+        x = x + self.positional_embedding.type(self.dtype)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+        return x
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            l.weight.data = l.weight.data.half()
+            if l.bias is not None:
+                l.bias.data = l.bias.data.half()
+        if isinstance(l, nn.MultiheadAttention):
+            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+                tensor = getattr(l, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+        for name in ["text_projection", "proj"]:
+            if hasattr(l, name):
+                attr = getattr(l, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+    model.apply(_convert_weights_to_fp16)
+def build_model(state_dict: dict):
+    vit = "visual.proj" in state_dict
+    if vit:
+        vision_width = state_dict["visual.conv1.weight"].shape[0]
+        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
+        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
+        vision_layers = tuple(counts)
+        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
+        vision_patch_size = None
+        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
+        image_resolution = output_width * 32
+    embed_dim = state_dict["text_projection"].shape[1]
+    context_length = state_dict["positional_embedding"].shape[0]
+    vocab_size = state_dict["token_embedding.weight"].shape[0]
+    transformer_width = state_dict["ln_final.weight"].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
+    model = CLIP(
+        embed_dim,
+        image_resolution, vision_layers, vision_width, vision_patch_size,
+        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
+    )
+    for key in ["input_resolution", "context_length", "vocab_size"]:
+        if key in state_dict:
+            del state_dict[key]
+    convert_weights(model)
+    model.load_state_dict(state_dict)
+    return model.eval()