
yichenchenchen committed ea88892 (verified) · 1 parent: 352e41f

Upload 25 files

src/builder.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.registry import Registry
2
+ __all__ = ['BUILDER']
3
+
4
+ BUILDER = Registry('builder')
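Downstream configs refer to classes through this registry. A minimal usage sketch follows (assuming the module is importable as `src.builder`; `ToyTokenizer` is a made-up class, not part of this commit):

# Hedged sketch: how the BUILDER registry above is typically used.
from src.builder import BUILDER

@BUILDER.register_module()
class ToyTokenizer:
    def __init__(self, vocab_size=100):
        self.vocab_size = vocab_size

# mmengine resolves the 'type' key to the registered class and instantiates it.
tok = BUILDER.build(dict(type='ToyTokenizer', vocab_size=32))
assert tok.vocab_size == 32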
src/datasets/collate_functions.py ADDED
@@ -0,0 +1,90 @@
1
+ import torch
2
+ from xtuner.utils import IGNORE_INDEX
3
+ from typing import Dict, Sequence
4
+ from torch.nn.utils.rnn import pad_sequence
5
+ from functools import partial
6
+ from dataclasses import dataclass
7
+
8
+
9
+ def collate_func_gen(instances: Sequence[Dict],
10
+ pad_index: int = 151645):
11
+ pixel_values_src, pixel_values, input_ids, input_lengths = [], [], [], []
12
+
13
+ for example in instances:
14
+ # pull out any image tensors
15
+ if 'pixel_values_src' in example:
16
+ pixel_values_src.append(example.pop('pixel_values_src'))
17
+ if 'pixel_values' in example:
18
+ pixel_values.append(example.pop('pixel_values'))
19
+
20
+ input_lengths.append(len(example['input_ids']))
21
+ input_ids.append(example.pop('input_ids'))
22
+
23
+ input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_index)
24
+ attention_mask = torch.zeros_like(input_ids).bool()
25
+ for i in range(len(input_ids)):
26
+ attention_mask[i, :input_lengths[i]] = True
27
+
28
+ data_dict = {
29
+ 'input_ids': input_ids,
30
+ 'attention_mask': attention_mask,
31
+ }
32
+
33
+ if pixel_values:
34
+ data_dict['pixel_values'] = torch.stack(pixel_values)
35
+ if pixel_values_src:
36
+ data_dict['pixel_values_src'] = torch.stack(pixel_values_src)
37
+
38
+ return {'data': data_dict, 'data_samples': None}
39
+
40
+
41
+ def collate_func_und(instances, pad_index=151645):
42
+ input_ids_list, labels_list, pixel_values_list = [], [], []
43
+
44
+ for sample in instances:
45
+ input_ids_list.append(torch.LongTensor(sample['input_ids']))
46
+ labels_list.append(torch.LongTensor(sample['labels']))
47
+
48
+ if 'pixel_values' in sample:
49
+ pixel_values_list.append(sample['pixel_values'])
50
+
51
+ ori_length = [len(input_ids_) for input_ids_ in input_ids_list]
52
+ # right padding
53
+ if len(instances) > 1:
54
+ input_ids = pad_sequence(
55
+ input_ids_list, batch_first=True, padding_value=pad_index)
56
+ labels = pad_sequence(
57
+ labels_list, batch_first=True, padding_value=IGNORE_INDEX)
58
+ else:
59
+ input_ids = torch.stack(input_ids_list)
60
+ labels = torch.stack(labels_list)
61
+
62
+ attention_mask = torch.zeros_like(input_ids).bool()
63
+ for i, length in enumerate(ori_length):
64
+ attention_mask[i, :length] = True # right padding
65
+
66
+ data_dict = {
67
+ 'input_ids': input_ids,
68
+ 'attention_mask': attention_mask,
69
+ 'labels': labels,
70
+ 'pixel_values': torch.stack(pixel_values_list) if len(pixel_values_list) > 0 else None
71
+ }
72
+
73
+ return {'data': data_dict, 'data_samples': None}
74
+
75
+
76
+ class CollateConcat(object):
77
+ def __init__(self, collate_fns, keys):
78
+ self.keys = keys
79
+ self.collate_fns = {}
80
+ for key, collate_fn in zip(keys, collate_fns):
81
+ func = collate_fn.pop('type')
82
+ self.collate_fns[key] = partial(func, **collate_fn)
83
+
84
+ def __call__(self, data_samples):
85
+ data_samples = [data_sample for data_sample in data_samples if len(data_sample) > 0]
86
+ data_dict = {}
87
+ key = data_samples[0]['type']
88
+ data_dict[key] = self.collate_fns[key](data_samples)['data']
89
+
90
+ return {'data': data_dict, 'data_samples': None}
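The collate functions above right-pad `input_ids` and stack any image tensors. A small usage sketch with dummy values (assuming this file is importable as `src.datasets.collate_functions`):

# Hedged sketch: what collate_func_gen returns for a toy batch.
import torch
from src.datasets.collate_functions import collate_func_gen

instances = [
    {'input_ids': torch.tensor([1, 2, 3]), 'pixel_values': torch.zeros(3, 32, 32)},
    {'input_ids': torch.tensor([4, 5]), 'pixel_values': torch.zeros(3, 32, 32)},
]
out = collate_func_gen(instances, pad_index=151645)
data = out['data']
print(data['input_ids'].shape)      # torch.Size([2, 3]); second row padded with 151645
print(data['attention_mask'][1])    # tensor([ True,  True, False])
print(data['pixel_values'].shape)   # torch.Size([2, 3, 32, 32])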
src/datasets/samplers/multi_source_sampler.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import itertools
3
+ from typing import Iterator, List, Optional, Sized, Union
4
+ import torch
5
+ from mmengine.dist import get_dist_info, sync_random_seed
6
+ from torch.utils.data import Sampler
7
+
8
+
9
+ class FixedBatchMultiSourceSampler(Sampler):
10
+ r"""Multi-Source Infinite Sampler.
11
+
12
+ According to the sampling ratio, sample data from different
13
+ datasets to form batches.
14
+
15
+ Args:
16
+ repeat (tuple): repeat factor
17
+ dataset (Sized): The dataset.
18
+ batch_size (int): Size of mini-batch.
19
+ shuffle (bool): Whether shuffle the dataset or not. Defaults to True.
20
+ seed (int, optional): Random seed. If None, set a random seed.
21
+ Defaults to None.
22
+ """
23
+
24
+ def __init__(self,
25
+ repeat,
26
+ dataset: Sized,
27
+ batch_size: int,
28
+ shuffle: bool = True,
29
+ seed: Optional[int] = None) -> None:
30
+
31
+ assert hasattr(dataset, 'cumulative_sizes'),\
32
+ f'The dataset must be a ConcatDataset, but got {dataset}'
33
+ assert isinstance(batch_size, int) and batch_size > 0, \
34
+ 'batch_size must be a positive integer value, ' \
35
+ f'but got batch_size={batch_size}'
36
+ assert len(repeat) == len(dataset.cumulative_sizes), \
37
+ 'The length of repeat must be equal to ' \
38
+ f'the number of datasets, but got repeat={repeat}'
39
+
40
+ rank, world_size = get_dist_info()
41
+ self.rank = rank
42
+ self.world_size = world_size
43
+
44
+ self.dataset = dataset
45
+ self.repeat = repeat
46
+ self.cumulative_sizes = [0] + dataset.cumulative_sizes
47
+ self.batch_size = batch_size
48
+
49
+ self.seed = sync_random_seed() if seed is None else seed
50
+ self.shuffle = shuffle
51
+ self.source2inds = {
52
+ source: self._indices_of_rank(len(ds))
53
+ for source, ds in enumerate(dataset.datasets)
54
+ }
55
+
56
+ def _infinite_indices(self, sample_size: int) -> Iterator[int]:
57
+ """Infinitely yield a sequence of indices."""
58
+ g = torch.Generator()
59
+ g.manual_seed(self.seed)
60
+ while True:
61
+ if self.shuffle:
62
+ yield from torch.randperm(sample_size, generator=g).tolist()
63
+ else:
64
+ yield from torch.arange(sample_size).tolist()
65
+
66
+ def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
67
+ """Slice the infinite indices by rank."""
68
+ yield from itertools.islice(
69
+ self._infinite_indices(sample_size), self.rank, None,
70
+ self.world_size)
71
+
72
+ def __len__(self) -> int:
73
+ return len(self.dataset)
74
+
75
+ def set_epoch(self, epoch: int) -> None:
76
+ """Not supported in `epoch-based runner."""
77
+ pass
78
+
79
+ def __iter__(self) -> Iterator[int]:
80
+ while True:
81
+ for source, repeat in enumerate(self.repeat):
82
+ for _ in range(repeat):
83
+ batch_buffer_per_source = []
84
+ while len(batch_buffer_per_source) < self.batch_size:
85
+ idx = next(self.source2inds[source])
86
+ idx += self.cumulative_sizes[source]
87
+ batch_buffer_per_source.append(idx)
88
+
89
+ yield from batch_buffer_per_source
90
+
91
+
92
+ class MultiSourceSampler(Sampler):
93
+ def __init__(self,
94
+ repeats,
95
+ dataset: Sized,
96
+ batch_sizes: list[int],
97
+ shuffle: bool = True,
98
+ seed: Optional[int] = None) -> None:
99
+
100
+ assert hasattr(dataset, 'cumulative_sizes'),\
101
+ f'The dataset must be a ConcatDataset, but got {dataset}'
102
+
103
+ assert isinstance(batch_sizes, list), \
104
+ f'batch_sizes must be a list, but got batch_sizes={batch_sizes}'
105
+ assert len(batch_sizes) == len(dataset.cumulative_sizes), \
106
+ 'The length of batch_sizes must be equal to ' \
107
+ f'the number of datasets, but got batch_sizes={batch_sizes}'
108
+
109
+ rank, world_size = get_dist_info()
110
+ self.rank = rank
111
+ self.world_size = world_size
112
+
113
+ self.dataset = dataset
114
+ self.cumulative_sizes = [0] + dataset.cumulative_sizes
115
+ self.batch_sizes = batch_sizes
116
+
117
+
118
+ self.seed = sync_random_seed() if seed is None else seed
119
+ self.shuffle = shuffle
120
+ self.source2inds = {
121
+ source: self._indices_of_rank(len(ds))
122
+ for source, ds in enumerate(dataset.datasets)
123
+ }
124
+
125
+ self.repeats = repeats
126
+ assert len(self.repeats) == len(self.batch_sizes)
127
+
128
+ def _infinite_indices(self, sample_size: int) -> Iterator[int]:
129
+ """Infinitely yield a sequence of indices."""
130
+ g = torch.Generator()
131
+ g.manual_seed(self.seed)
132
+ while True:
133
+ if self.shuffle:
134
+ yield from torch.randperm(sample_size, generator=g).tolist()
135
+ else:
136
+ yield from torch.arange(sample_size).tolist()
137
+
138
+ def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
139
+ """Slice the infinite indices by rank."""
140
+ yield from itertools.islice(
141
+ self._infinite_indices(sample_size), self.rank, None,
142
+ self.world_size)
143
+
144
+
145
+ def __len__(self) -> int:
146
+ return len(self.dataset)
147
+
148
+ def set_epoch(self, epoch: int) -> None:
149
+ """Not supported in `epoch-based runner."""
150
+ pass
151
+
152
+ def __iter__(self) -> Iterator[int]:
153
+ while True:
154
+ for source, (batch_size, repeat) in enumerate(zip(self.batch_sizes, self.repeats)):
155
+ for _ in range(repeat):
156
+ batch_buffer_per_source = []
157
+ while len(batch_buffer_per_source) < batch_size:
158
+ idx = next(self.source2inds[source])
159
+ idx += self.cumulative_sizes[source]
160
+ batch_buffer_per_source.append(idx)
161
+
162
+ yield from batch_buffer_per_source
163
+
164
+ @property
165
+ def batch_size(self):
166
+ batch_size_sum = sum([batch_size * repeat for batch_size, repeat in zip(self.batch_sizes, self.repeats)])
167
+ batch_size_ave = batch_size_sum // sum(self.repeats)
168
+
169
+ return batch_size_ave
170
+
171
+
172
+ class MultiSourceBatchSampler(Sampler[list[int]]):
173
+ def __init__(
174
+ self,
175
+ sampler: Union[FixedBatchMultiSourceSampler, MultiSourceSampler],
176
+ batch_sizes: list[int],
177
+ repeats: list[int],
178
+ **kwargs
179
+ ) -> None:
180
+ self.sampler = sampler
181
+ self.batch_sizes = batch_sizes
182
+ self.repeats = repeats
183
+
184
+ def __iter__(self) -> Iterator[list[int]]:
185
+ # Implemented based on the benchmarking in https://github.com/pytorch/pytorch/pull/76951
186
+ sampler_iter = iter(self.sampler)
187
+
188
+ while True:
189
+ for source, (batch_size, repeat) in enumerate(zip(self.batch_sizes, self.repeats)):
190
+ for _ in range(repeat):
191
+ batch = [*itertools.islice(sampler_iter, batch_size)]
192
+ yield batch
193
+
194
+ @property
195
+ def batch_size(self):
196
+ batch_size_sum = sum([batch_size * repeat for batch_size, repeat in zip(self.batch_sizes, self.repeats)])
197
+ batch_size_ave = batch_size_sum // sum(self.repeats)
198
+
199
+ return batch_size_ave
200
+
201
+ def __len__(self) -> int:
202
+ return len(self.sampler) // self.batch_size
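To see the interleaving pattern concretely, the sketch below drives `FixedBatchMultiSourceSampler` over a two-source `ConcatDataset` and prints the first few index batches; the sampler is infinite, so only a slice is taken. The import path is an assumption based on this repo layout.

# Hedged sketch: inspect the index stream produced by FixedBatchMultiSourceSampler.
import itertools
from torch.utils.data import ConcatDataset, Dataset
from src.datasets.samplers.multi_source_sampler import FixedBatchMultiSourceSampler

class Toy(Dataset):
    def __init__(self, n):
        self.n = n
    def __len__(self):
        return self.n
    def __getitem__(self, i):
        return i

# Source 0 holds 10 samples, source 1 holds 100; repeat=(1, 2) yields one batch
# from source 0 followed by two batches from source 1, forever.
concat = ConcatDataset([Toy(10), Toy(100)])
sampler = FixedBatchMultiSourceSampler(repeat=(1, 2), dataset=concat,
                                       batch_size=4, shuffle=False, seed=0)
first = list(itertools.islice(iter(sampler), 12))
print(first[:4])   # [0, 1, 2, 3]        -> source 0
print(first[4:])   # [10, 11, ..., 17]   -> source 1, offset by cumulative size 10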
src/datasets/text2image/__init__.py ADDED
File without changes
src/datasets/text2image/text2image.py ADDED
@@ -0,0 +1,649 @@
1
+ from torch.utils.data import Dataset
2
+ from PIL import Image
3
+ import os
4
+ import json
5
+ import random
6
+ import torch
7
+ import numpy as np
8
+ from einops import rearrange
9
+ from xtuner.registry import BUILDER
10
+ from mmengine.registry import DATASETS
11
+ from src.datasets.utils import crop2square
12
+ from glob import glob
13
+ from typing import List, Dict, Any, Optional
14
+ import mmap
15
+ import struct
16
+ from src.datasets.utils import crop2square, encode_fn
17
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
18
+
19
+
20
+ @BUILDER.register_module()
21
+ class Text2ImageDataset(Dataset):
22
+ def __init__(self,
23
+ data_path,
24
+ local_folder,
25
+ image_size,
26
+ unconditional=0.1,
27
+ tokenizer=None,
28
+ prompt_template=None,
29
+ max_length=1024,
30
+ crop_image=True,
31
+ cap_source='caption',
32
+ ):
33
+ super().__init__()
34
+ self.data_path = data_path
35
+ self._load_data(data_path)
36
+ self.unconditional = unconditional
37
+ self.local_folder = local_folder
38
+ self.cap_source = cap_source
39
+ self.image_size = image_size
40
+ self.tokenizer = BUILDER.build(tokenizer)
41
+
42
+ self.prompt_template = prompt_template
43
+ self.max_length = max_length
44
+ self.crop_image = crop_image
45
+ self.metainfo = {'task': 'unified'}
46
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
47
+
48
+
49
+
50
+ def _load_data(self, data_path):
51
+ with open(data_path, 'r') as f:
52
+ self.data_list = json.load(f)
53
+
54
+ print(f"Load {len(self.data_list)} data samples from {data_path}", flush=True)
55
+
56
+ def full_init(self):
57
+ """Dummy full_init to be compatible with MMEngine ConcatDataset."""
58
+ return
59
+
60
+
61
+ def __len__(self):
62
+ return len(self.data_list)
63
+
64
+ def _read_image(self, image_file):
65
+ image = Image.open(os.path.join(self.local_folder, image_file))
66
+ assert image.width > 8 and image.height > 8, f"Image: {image.size}"
67
+ assert image.width / image.height > 0.1, f"Image: {image.size}"
68
+ assert image.width / image.height < 10, f"Image: {image.size}"
69
+ return image
70
+
71
+ def _process_text(self, text):
72
+ if random.uniform(0, 1) < self.unconditional:
73
+ prompt = "Generate an image."
74
+ else:
75
+ prompt = f"Generate an image: {text.strip()}"
76
+ prompt = self.prompt_template['INSTRUCTION'].format(input=prompt)
77
+ input_ids = self.tokenizer.encode(prompt, add_special_tokens=True, return_tensors='pt')[0]
78
+
79
+ return dict(input_ids=input_ids[:self.max_length])
80
+
81
+ def _process_image(self, image):
82
+ data = dict()
83
+
84
+ if self.crop_image:
85
+ image = crop2square(image)
86
+ else:
87
+ target_size = max(image.size)
88
+ image = image.resize(size=(target_size, target_size))
89
+
90
+ image = image.resize(size=(self.image_size, self.image_size))
91
+ pixel_values = torch.from_numpy(np.array(image)).float()
92
+ pixel_values = pixel_values / 255
93
+ pixel_values = 2 * pixel_values - 1
94
+ pixel_values = rearrange(pixel_values, 'h w c -> c h w')
95
+
96
+ data.update(pixel_values=pixel_values)
97
+
98
+ return data
99
+
100
+ def _retry(self):
101
+ return self.__getitem__(random.choice(range(self.__len__())))
102
+
103
+ def __getitem__(self, idx):
104
+ try:
105
+ data_sample = self.data_list[idx]
106
+ image = self._read_image(data_sample['image']).convert('RGB')
107
+
108
+ caption = data_sample[self.cap_source]
109
+ data = self._process_image(image)
110
+ data.update(self._process_text(caption))
111
+ data.update(type='text2image')
112
+
113
+ return data
114
+
115
+ except Exception as e:
116
+ print(f"Error when reading {self.data_path}:{self.data_list[idx]}: {e}", flush=True)
117
+ return self._retry()
118
+
119
+ @DATASETS.register_module()
120
+ @BUILDER.register_module()
121
+ class LargeText2ImageDataset(Text2ImageDataset):
122
+ # self.data_list only contains paths of images and captions
123
+
124
+ def __init__(self, cap_folder=None, *args, **kwargs):
125
+ super().__init__(*args, **kwargs)
126
+ self.cap_folder = self.local_folder if cap_folder is None else cap_folder
127
+
128
+ def _load_data(self, data_path): # image path and annotation path are saved in a json file
129
+ if data_path.endswith(".json"):
130
+ with open(data_path, 'r') as f:
131
+ self.data_list = json.load(f)
132
+ else:
133
+ self.data_list = []
134
+ json_files = glob(f'{data_path}/*.json')
135
+ for json_file in json_files:
136
+ with open(json_file, 'r') as f:
137
+ self.data_list += json.load(f)
138
+
139
+ print(f"Load {len(self.data_list)} data samples from {data_path}", flush=True)
140
+
141
+ def __getitem__(self, idx):
142
+ try:
143
+ data_sample = self.data_list[idx]
144
+ image = self._read_image(data_sample['image']).convert('RGB')
145
+ with open(f"{self.cap_folder}/{data_sample['annotation']}", 'r') as f:
146
+ caption = json.load(f)[self.cap_source]
147
+ data = self._process_image(image)
148
+ data.update(self._process_text(caption))
149
+ data.update(type='text2image')
150
+ return data
151
+
152
+ except Exception as e:
153
+ print(f"Error when reading {self.data_path}:{data_sample}: {e}", flush=True)
154
+ return self._retry()
155
+
156
+
157
+ @DATASETS.register_module()
158
+ @BUILDER.register_module()
159
+ class MMapT2IDataset(Dataset):
160
+ """
161
+ Map-style Text2Image Dataset with mmap-based random access.
162
+ The mmap is opened once in __init__; __getitem__ reads any requested line in O(1).
163
+ """
164
+ def __init__(
165
+ self,
166
+ jsonl_path: str,
167
+ idx_path: str,
168
+ image_size: int,
169
+ tokenizer: Optional[Dict] = None,
170
+ template_map_fn: Optional[Dict] = None,
171
+ cap_source: str = "prompt",
172
+ max_length: int = 2048,
173
+ image_length: int = 512,
174
+ unconditional: float = 0.01,
175
+ crop_image: bool = False,
176
+ ):
177
+ super().__init__()
178
+
179
+ # ---------- basic parameters ----------
180
+ self.jsonl_path = jsonl_path
181
+ self.image_size = image_size
182
+ self.cap_source = cap_source
183
+ self.max_length = max_length
184
+ self.unconditional = unconditional
185
+ self.crop_image = crop_image
186
+
187
+ # ---------- tokenizer / template ----------
188
+ self.tokenizer = BUILDER.build(tokenizer)
189
+ self.template_map_fn = template_map_fn
190
+
191
+ # ---------- mmap loading ----------
192
+ self._open_mmap(jsonl_path, idx_path)
193
+ self.metainfo = {'task' :'unified'}
194
+ # ===== mmap & index =====
195
+ def _open_mmap(self, jsonl_path: str, idx_path: str):
196
+ # memory-map the jsonl file
197
+ self._jsonl_fp = open(jsonl_path, "r+b")
198
+ self._mm = mmap.mmap(self._jsonl_fp.fileno(), 0, access=mmap.ACCESS_READ)
199
+
200
+ # read the offset index
201
+ with open(idx_path, "rb") as f:
202
+ nlines = struct.unpack("<Q", f.read(8))[0]
203
+ self._offsets = np.frombuffer(f.read(8 * nlines), dtype=np.uint64)
204
+ print(f"[MMapT2IDataset] {jsonl_path}: {nlines} lines indexed")
205
+
206
+ def __len__(self) -> int:
207
+ return self._offsets.size
208
+
209
+ def full_init(self):
210
+ """Dummy full_init to be compatible with MMEngine ConcatDataset."""
211
+ return
212
+ def _read_line(self, idx: int) -> str:
213
+ off = int(self._offsets[idx])
214
+ self._mm.seek(off)
215
+ return self._mm.readline().decode("utf-8")
216
+
217
+ # ===== core processing =====
218
+ def _load_image(self, path: str) -> torch.Tensor:
219
+ img = Image.open(path).convert("RGB")
220
+
221
+ # preprocess: crop to square / pad
222
+ if self.crop_image:
223
+ img = crop2square(img)
224
+ else:
225
+ target_size = max(img.size)
226
+ img = img.resize((target_size, target_size))
227
+
228
+ img = img.resize((self.image_size, self.image_size))
229
+ arr = np.asarray(img, dtype=np.uint8) # HWC uint8
230
+ px = torch.as_tensor(arr).float() / 255.0 # 0-1
231
+ px = 2 * px - 1 # -1 ~ 1
232
+ return rearrange(px, "h w c -> c h w") # CHW
233
+
234
+ def _build_prompt(self, caption: str) -> torch.Tensor:
235
+ if random.random() < self.unconditional:
236
+ caption = "Generate an image."
237
+ else:
238
+ caption = f"Generate an image: {caption.strip()}"
239
+
240
+ instr = self.template_map_fn["INSTRUCTION"].format(input=caption)
241
+ ids = self.tokenizer.encode(
242
+ instr, add_special_tokens=True, return_tensors="pt"
243
+ )[0][: self.max_length]
244
+ return ids
245
+
246
+ def __getitem__(self, idx: int) -> Dict[str, Any]:
247
+ # 1) fetch the jsonl line
248
+ sample = json.loads(self._read_line(idx))
249
+
250
+ # 2) load & process the image
251
+ pixel_values = self._load_image(sample["image"])
252
+
253
+ # 3) process the text
254
+ caption = sample.get(self.cap_source, "")
255
+ input_ids = self._build_prompt(caption)
256
+
257
+ # 4) pack the sample
258
+ data = dict(
259
+ pixel_values=pixel_values,
260
+ input_ids=input_ids,
261
+ type="text2image",
262
+ image_file=sample["image"],
263
+ idx=idx,
264
+ )
265
+ return data
266
+
267
+
268
+ @DATASETS.register_module()
269
+ @BUILDER.register_module()
270
+ class ReconstructDataset(Dataset):
271
+ def __init__(self,
272
+ data_path: str,
273
+ image_size: int,
274
+ tokenizer=None,
275
+ prompt_template=None,
276
+ cap_source: str = "prompt",
277
+ max_length: int = 8192,
278
+ crop_image: bool = True,
279
+ img_prefix: str = ""):
280
+ super().__init__()
281
+ self.image_size = image_size
282
+ self.tokenizer = BUILDER.build(tokenizer)
283
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
284
+ self.prompt_template = prompt_template
285
+ self.cap_source = cap_source
286
+ self.max_length = max_length
287
+ self.crop_image = crop_image
288
+ self.img_prefix = img_prefix
289
+ self._load_data(data_path)
290
+
291
+ m = n = self.image_size // 16
292
+ self.image_token_repeat = m * n + 64
293
+ self.metainfo = {'task': 'unified'}
294
+
295
+ def full_init(self):
296
+ """Dummy full_init to be compatible with MMEngine ConcatDataset."""
297
+ return
298
+
299
+ def _load_data(self, path):
300
+ with open(path) as f:
301
+ self.data_list = [json.loads(l) for l in f]
302
+ print(f"[I2ICaptionReconstructDataset] Loaded {len(self.data_list)} samples from {path}")
303
+
304
+ def _add_prefix(self, rel):
305
+ return os.path.join(self.img_prefix, rel.lstrip("/")) if self.img_prefix else rel
306
+
307
+ def _read_image(self, path):
308
+ img = Image.open(path).convert("RGB")
309
+ assert img.width > 8 and img.height > 8 and 0.1 < img.width / img.height < 10
310
+ return img
311
+
312
+ # ---------- preprocess ----------
313
+ def _process_image(self, img):
314
+ img = crop2square(img) if self.crop_image else img.resize((max(img.size),)*2)
315
+ img = img.resize((self.image_size, self.image_size))
316
+ px = torch.from_numpy(np.array(img)).float() / 255.
317
+ px = 2 * px - 1
318
+ return rearrange(px, "h w c -> c h w")
319
+
320
+ def _encode_prompt(self, text):
321
+ # for bad_token in ["[IMAGE]", "<image_placeholder>", "<image_plaeholder>"]:
322
+ # text = text.replace(bad_token, "")
323
+ text = "Repeat this image."
324
+ prompt_in = f"<image>\n{text.strip()}"
325
+ prompt = self.prompt_template["INSTRUCTION"].format(input=prompt_in)
326
+ prompt = prompt.replace("<image>", "<image>" * self.image_token_repeat)
327
+ input_ids = self.tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")[0]
328
+ mask = (input_ids != self.tokenizer.pad_token_id).long()
329
+ return input_ids[:self.max_length], mask[:self.max_length]
330
+
331
+ def __len__(self):
332
+ return len(self.data_list)
333
+
334
+ def _retry(self):
335
+ return self.__getitem__(random.randrange(len(self)))
336
+
337
+ def __getitem__(self, idx):
338
+ try:
339
+ sample = self.data_list[idx]
340
+ src_img = self._read_image(self._add_prefix(sample["image"]))
341
+ tgt_img = src_img
342
+ caption = sample[self.cap_source]
343
+
344
+ px_src = self._process_image(src_img)
345
+ px_tgt = self._process_image(tgt_img)
346
+ input_ids, mask = self._encode_prompt(caption)
347
+
348
+ return {
349
+ "pixel_values_src": px_src,
350
+ "pixel_values": px_tgt,
351
+ "input_ids": input_ids,
352
+ "attention_mask": mask,
353
+ "type": "image_edit"
354
+ }
355
+ except Exception as e:
356
+ print(f"[I2ICaptionReconstructDataset] Error @ {idx}: {e}")
357
+ return self._retry()
358
+
359
+ @DATASETS.register_module()
360
+ @BUILDER.register_module()
361
+ class UncondReconstructDataset(Dataset):
362
+ def __init__(self,
363
+ data_path: str,
364
+ image_size: int,
365
+ tokenizer=None,
366
+ prompt_template=None,
367
+ cap_source: str = "prompt",
368
+ max_length: int = 8192,
369
+ crop_image: bool = True,
370
+ img_prefix: str = ""):
371
+ super().__init__()
372
+ self.image_size = image_size
373
+ self.tokenizer = BUILDER.build(tokenizer)
374
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
375
+ self.prompt_template = prompt_template
376
+ self.max_length = max_length
377
+ self.crop_image = crop_image
378
+ self.img_prefix = img_prefix
379
+ self.cap_source = cap_source
380
+
381
+
382
+ self._load_data(data_path)
383
+
384
+ # number of expanded image tokens
385
+ m = n = self.image_size // 16
386
+ self.image_token_repeat = m * n + 64
387
+ self.metainfo = {'task': 'unified'}
388
+
389
+ def _load_data(self, path):
390
+ with open(path) as f:
391
+ self.data_list = [json.loads(l) for l in f]
392
+ print(f"[I2IUncondReconstructDataset] Loaded {len(self.data_list)} samples from {path}")
393
+
394
+ def _add_prefix(self, rel_path):
395
+ return os.path.join(self.img_prefix, rel_path.lstrip("/")) if self.img_prefix else rel_path
396
+
397
+ def full_init(self):
398
+ """Dummy full_init to be compatible with MMEngine ConcatDataset."""
399
+ return
400
+ def _read_image(self, path):
401
+ image = Image.open(path).convert("RGB")
402
+ assert image.width > 8 and image.height > 8 and 0.1 < image.width / image.height < 10
403
+ return image
404
+
405
+
406
+ # ---------- preprocess ----------
407
+ def _process_image(self, img):
408
+ img = crop2square(img) if self.crop_image else img.resize((max(img.size),)*2)
409
+ img = img.resize((self.image_size, self.image_size))
410
+ px = torch.from_numpy(np.array(img)).float() / 255.
411
+ px = 2 * px - 1
412
+ return rearrange(px, "h w c -> c h w")
413
+
414
+ def __len__(self):
415
+ return len(self.data_list)
416
+
417
+ def _retry(self, max_tries=5):
418
+ for _ in range(max_tries):
419
+ try:
420
+ return self.__getitem__(random.randrange(len(self)))
421
+ except Exception:
422
+ continue
423
+ raise RuntimeError("Exceeded max retries in I2IUncondReconstructDataset")
424
+
425
+ def __getitem__(self, idx):
426
+ try:
427
+ sample = self.data_list[idx]
428
+ path = self._add_prefix(sample["image"])
429
+ img = self._read_image(path)
430
+ px = self._process_image(img)
431
+
432
+ # ==== fill in empty text ====
433
+ input_ids = torch.zeros(0, dtype=torch.long)
434
+ attention_mask = torch.zeros(0, dtype=torch.long)
435
+
436
+ return {
437
+ "pixel_values_src": px,
438
+ "pixel_values": px.clone(),
439
+ "type": "image_edit",
440
+ "input_ids": input_ids,
441
+ "attention_mask": attention_mask,
442
+ # the reconstruction task keeps input_ids / attention_mask empty
443
+ }
444
+ except Exception as e:
445
+ print(f"[I2IUncondReconstructDataset] Error @ {idx}: {e}")
446
+ return self._retry()
447
+
448
+
449
+
450
+ @DATASETS.register_module()
451
+ @BUILDER.register_module()
452
+ class Text2ImageJSONLDataset(Dataset):
453
+ def __init__(self,
454
+ data_path,
455
+ image_size,
456
+ tokenizer=None,
457
+ prompt_template=None,
458
+ cap_source='prompt',
459
+ max_length=1024,
460
+ unconditional=0.1,
461
+ crop_image=True,
462
+ ):
463
+ super().__init__()
464
+ self.data_path = data_path
465
+ self._load_data(data_path)
466
+ self.image_size = image_size
467
+ self.tokenizer = BUILDER.build(tokenizer)
468
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
469
+ self.prompt_template = prompt_template
470
+ self.cap_source = cap_source
471
+ self.max_length = max_length
472
+ self.unconditional = unconditional
473
+ self.crop_image = crop_image
474
+ self.metainfo = {'task': 'unified'}
475
+
476
+ def _load_data(self, data_path):
477
+ self.data_list = []
478
+ with open(data_path, 'r') as f:
479
+ for line in f:
480
+ self.data_list.append(json.loads(line.strip()))
481
+ print(f"Loaded {len(self.data_list)} samples from {data_path}")
482
+
483
+ def full_init(self):
484
+ """Dummy full_init for MMEngine ConcatDataset compatibility."""
485
+ pass
486
+ def __len__(self):
487
+ return len(self.data_list)
488
+
489
+ def _read_image(self, image_file):
490
+ image = Image.open(image_file).convert('RGB')
491
+ assert image.width > 8 and image.height > 8
492
+ assert 0.1 < image.width / image.height < 10
493
+ return image
494
+
495
+ def _process_image(self, image):
496
+ if self.crop_image:
497
+ image = crop2square(image)
498
+ else:
499
+ target_size = max(image.size)
500
+ image = image.resize((target_size, target_size))
501
+
502
+ image = image.resize((self.image_size, self.image_size))
503
+ pixel_values = torch.from_numpy(np.array(image)).float() / 255.0
504
+ pixel_values = 2 * pixel_values - 1 # [-1, 1]
505
+ pixel_values = rearrange(pixel_values, 'h w c -> c h w')
506
+ return dict(pixel_values=pixel_values)
507
+
508
+ def _process_text(self, text):
509
+ if random.uniform(0, 1) < self.unconditional:
510
+ text = "Generate an image."
511
+ else:
512
+ text = f"Generate an image: {text.strip()}"
513
+ prompt = self.prompt_template['INSTRUCTION'].format(input=text)
514
+ input_ids = self.tokenizer.encode(prompt, add_special_tokens=True, return_tensors='pt')[0]
515
+ return dict(input_ids=input_ids[:self.max_length])
516
+
517
+ def _retry(self):
518
+ return self.__getitem__(random.randint(0, len(self.data_list) - 1))
519
+
520
+ def __getitem__(self, idx):
521
+ try:
522
+ sample = self.data_list[idx]
523
+ image = self._read_image(sample['image'])
524
+ caption = sample[self.cap_source]
525
+ data = self._process_image(image)
526
+ data.update(self._process_text(caption))
527
+ data.update(type='text2image')
528
+ return data
529
+ except Exception as e:
530
+ print(f"[JSONLDataset] Error reading sample #{idx}: {e}")
531
+ return self._retry()
532
+
533
+
534
+
535
+ # Plain text-to-image has no placeholder issue; the editing datasets below must handle the <image> placeholder.
536
+ @DATASETS.register_module()
537
+ @BUILDER.register_module()
538
+ class ImageEditJSONLDataset(Dataset):
539
+ """
540
+ Dataset for <src, tgt, prompt> image editing, now decoupled from tokenization logic.
541
+ """
542
+ def __init__(self,
543
+ data_path: str,
544
+ image_size: int,
545
+ tokenizer=None,
546
+ prompt_template=None,
547
+ max_length: int = 8192,
548
+ cap_source: str = "prompt",
549
+ unconditional: float = 0,
550
+ crop_image: bool = False,
551
+ img_prefix: str = ""):
552
+ super().__init__()
553
+ self.data_path = data_path
554
+ self.image_size = image_size
555
+ self.tokenizer = BUILDER.build(tokenizer)
556
+ self.prompt_template = prompt_template
557
+ self.max_length = max_length
558
+ self.cap_source = cap_source
559
+ self.unconditional = unconditional
560
+ self.crop_image = crop_image
561
+ self.img_prefix = img_prefix
562
+ self._load_data(data_path)
563
+ # Calculate image token repetition length, consistent with inference.
564
+ m = n = self.image_size // 16
565
+ self.image_token_repeat = m * n + 64
566
+ self.metainfo = {'task': 'unified'}
567
+
568
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
569
+ self.image_token_idx = self.tokenizer.convert_tokens_to_ids("<image>")
570
+ print(f"Registered <image> token at index {self.image_token_idx}")
571
+
572
+ def _load_data(self, path):
573
+ with open(path) as f:
574
+ self.data_list = [json.loads(l) for l in f]
575
+ print(f"[ImageEditJSONLDataset] Loaded {len(self.data_list)} samples from {path}")
576
+
577
+ def full_init(self):
578
+ """Dummy full_init for MMEngine ConcatDataset compatibility."""
579
+ pass
580
+
581
+ def _add_prefix(self, rel_path):
582
+ return os.path.join(self.img_prefix, rel_path.lstrip("/")) if self.img_prefix else rel_path
583
+
584
+ def _read_image(self, path):
585
+ path = path.replace("datasets_vlm02", "datasets_vlm")
586
+ img = Image.open(path).convert("RGB")
587
+ assert img.width > 8 and img.height > 8 and 0.1 < img.width / img.height < 10
588
+ return img
589
+
590
+ def _process_image(self, img):
591
+ img = crop2square(img) if self.crop_image else img.resize((max(img.size),) * 2)
592
+ img = img.resize((self.image_size, self.image_size))
593
+ px = torch.from_numpy(np.array(img)).float() / 255.
594
+ px = 2 * px - 1
595
+ return rearrange(px, "h w c -> c h w")
596
+
597
+ # --- REFACTORED: This method now only prepares the raw prompt text ---
598
+ def _prepare_prompt_text(self, raw_text: str):
599
+ """Cleans text and handles unconditional generation."""
600
+
601
+ txt = raw_text
+ for bad_token in ["[IMAGE]", "<image_placeholder>", "<image_plaeholder>", "<image>"]:
+ txt = txt.replace(bad_token, "")
603
+ txt = txt.strip()
604
+
605
+ if random.random() < self.unconditional:
606
+ txt = "Edit this image."
607
+ return txt
608
+
609
+ def _retry(self):
610
+ return self.__getitem__(random.randrange(len(self)))
611
+
612
+ def __len__(self):
613
+ return len(self.data_list)
614
+
615
+ def __getitem__(self, idx):
616
+ try:
617
+ sample = self.data_list[idx]
618
+ src_path, tgt_path = map(self._add_prefix, [sample["images"][0], sample["image"]])
619
+ src_img, tgt_img = map(self._read_image, [src_path, tgt_path])
620
+
621
+ px_src, px_tgt = map(self._process_image, [src_img, tgt_img])
622
+
623
+ # --- MODIFIED: Call the unified encode_fn ---
624
+ # 1. Prepare the raw prompt string
625
+ prompt_text = self._prepare_prompt_text(sample[self.cap_source])
626
+
627
+ # 2. Delegate all encoding and formatting to encode_fn
628
+ encoded_text = encode_fn(
629
+ example=prompt_text,
630
+ tokenizer=self.tokenizer,
631
+ prompt_template=self.prompt_template,
632
+ max_length=self.max_length,
633
+ image_length=self.image_token_repeat,
634
+ image_token_idx=self.image_token_idx
635
+ )
636
+
637
+ return {
638
+ "pixel_values_src": px_src,
639
+ "pixel_values": px_tgt,
640
+ "input_ids": torch.tensor(encoded_text["input_ids"], dtype=torch.long),
641
+ "attention_mask": torch.tensor(encoded_text["attention_mask"], dtype=torch.long),
642
+ "type": "image_edit",
643
+ }
644
+ except Exception as e:
645
+ print(f"[ImageEditJSONLDataset] Error @ {idx}: {e} from {self.data_path}")
646
+ return self._retry()
647
+
648
+
649
+
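`MMapT2IDataset` above expects a sidecar index file: an unsigned 64-bit little-endian line count followed by one uint64 byte offset per jsonl line, which is what `_open_mmap` reads back with `struct.unpack('<Q', ...)` and `np.frombuffer`. A hedged one-off builder script (file names are placeholders) could look like this:

# Hedged sketch: build the offset index that MMapT2IDataset._open_mmap reads.
import struct
import sys

def build_index(jsonl_path: str, idx_path: str) -> None:
    offsets = []
    with open(jsonl_path, "rb") as f:
        while True:
            pos = f.tell()          # byte offset of the line about to be read
            line = f.readline()
            if not line:
                break
            offsets.append(pos)
    with open(idx_path, "wb") as f:
        f.write(struct.pack("<Q", len(offsets)))             # line count, uint64 LE
        f.write(struct.pack(f"<{len(offsets)}Q", *offsets))  # one uint64 offset per line

if __name__ == "__main__":
    build_index(sys.argv[1], sys.argv[2])  # e.g. data.jsonl data.idx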
src/datasets/understanding/caption_datasets.py ADDED
@@ -0,0 +1,342 @@
1
+ from torch.utils.data import Dataset
2
+ from PIL import Image
3
+ import os
4
+ import io
5
+ import json
6
+ import random
7
+ import torch
8
+ import numpy as np
9
+ from einops import rearrange
10
+ try:
11
+ from aoss_client.client import Client
12
+ except:
13
+ try:
14
+ from petrel_client.client import Client
15
+ except:
16
+ Client = None
17
+ from glob import glob
18
+ from xtuner.registry import BUILDER
19
+ from xtuner.dataset.utils import expand2square
20
+ from src.datasets.utils import crop2square, encode_fn
21
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
22
+ from src.datasets.understanding.caption_prompts import dense_prompts, short_prompts
23
+ from typing import List, Dict, Any, Optional,Callable,Tuple
24
+
25
+
26
+ @BUILDER.register_module()
27
+ class CaptionDataset(Dataset):
28
+ def __init__(self,
29
+ data_path,
30
+ local_folder,
31
+ image_size,
32
+ ceph_folder=None,
33
+ ceph_config=None,
34
+ tokenizer=None,
35
+ template_map_fn=None,
36
+ max_length=2048,
37
+ min_image_size=80,
38
+ image_length=256,
39
+ pad_image=True,
40
+ brief=False,
41
+ cap_folder=None,
42
+ cap_source='caption',
43
+ ):
44
+ super().__init__()
45
+ self.data_path = data_path
46
+ self._load_data(data_path)
47
+ self.local_folder = local_folder
48
+ self.cap_folder = local_folder if cap_folder is None else cap_folder
49
+ self.cap_source = cap_source
50
+
51
+ self.image_size = image_size
52
+
53
+ self.tokenizer = BUILDER.build(tokenizer)
54
+ self.prompt_template = template_map_fn['template']
55
+ self.template_map_fn = BUILDER.build(template_map_fn)
56
+ self.max_length = max_length
57
+ self.image_length = image_length
58
+ self.pad_image = pad_image
59
+ self.min_image_size = min_image_size
60
+
61
+ self.FILE_CLIENT = None
62
+ self.ceph_folder = ceph_folder
63
+ self.ceph_config = ceph_config
64
+ self.use_ceph = ((Client is not None) and (ceph_folder is not None)
65
+ and (ceph_config is not None) and os.path.exists(ceph_config))
66
+
67
+ self.brief = brief
68
+ self.caption_prompts = short_prompts if self.brief else dense_prompts
69
+
70
+ def _load_data(self, data_path: str): # image path and annotation path are saved in a json file
71
+ if data_path.endswith('.json'):
72
+ with open(data_path, 'r') as f:
73
+ self.data_list = json.load(f)
74
+ else:
75
+ json_files = glob(f"{data_path}/*.json")
76
+ data_list = []
77
+ for json_file in json_files:
78
+ with open(json_file, 'r') as f:
79
+ data_list += json.load(f)
80
+
81
+ self.data_list = data_list
82
+
83
+ print(f"Load {len(self.data_list)} data samples from {data_path}", flush=True)
84
+
85
+ def __len__(self):
86
+ return len(self.data_list)
87
+
88
+ def _read_ceph(self, ceph_path):
89
+ if self.FILE_CLIENT is None:
90
+ self.FILE_CLIENT = Client(self.ceph_config)
91
+ data_bytes = self.FILE_CLIENT.get(ceph_path)
92
+
93
+ return io.BytesIO(data_bytes)
94
+
95
+ def _read_image(self, image_file):
96
+ if self.use_ceph:
97
+ image = Image.open(
98
+ self._read_ceph(
99
+ os.path.join(self.ceph_folder, image_file)
100
+ )
101
+ )
102
+ else:
103
+ image = Image.open(
104
+ os.path.join(self.local_folder, image_file)
105
+ )
106
+ assert image.width > self.min_image_size and image.height > self.min_image_size, f"Image: {image.size}"
107
+ assert image.width / image.height > 0.1, f"Image: {image.size}"
108
+ assert image.width / image.height < 10, f"Image: {image.size}"
109
+ return image.convert('RGB')
110
+
111
+ def _read_json(self, annotation_file):
112
+ if self.use_ceph:
113
+ annotation = json.load(
114
+ self._read_ceph(
115
+ os.path.join(self.ceph_folder, annotation_file)
116
+ )
117
+ )
118
+ else:
119
+ with open(os.path.join(self.local_folder, annotation_file), 'r') as f:
120
+ annotation = json.load(f)
121
+
122
+ return annotation
123
+
124
+ def _process_image(self, image):
125
+ data = dict()
126
+ if self.pad_image:
127
+ image = expand2square(image, (127, 127, 127))
128
+ else:
129
+ image = crop2square(image)
130
+
131
+ image = image.resize(size=(self.image_size, self.image_size))
132
+ pixel_values = torch.from_numpy(np.array(image)).float()
133
+ pixel_values = pixel_values / 255
134
+ pixel_values = 2 * pixel_values - 1
135
+ pixel_values = rearrange(pixel_values, 'h w c -> c h w')
136
+
137
+ data.update(pixel_values=pixel_values)
138
+ return data
139
+
140
+ def _process_text(self, text):
141
+ assert DEFAULT_IMAGE_TOKEN not in text, text
142
+ data_dict = dict(conversation=[{'input': f"{DEFAULT_IMAGE_TOKEN}\n{random.choice(self.caption_prompts)}",
143
+ 'output': text.strip()}])
144
+ data_dict.update(self.template_map_fn(data_dict))
145
+ data_dict.update(encode_fn(data_dict, self.tokenizer, self.max_length,
146
+ self.image_length, True, True))
147
+
148
+ assert (torch.tensor(data_dict['input_ids']).long() == IMAGE_TOKEN_INDEX).sum() == self.image_length, \
149
+ "Error in image format"
150
+
151
+ data_dict['type'] = 'image2text'
152
+ return data_dict
153
+
154
+ def _retry(self):
155
+ return self.__getitem__(random.choice(range(self.__len__())))
156
+
157
+ def __getitem__(self, idx):
158
+ try:
159
+ data_sample = self.data_list[idx]
160
+ image = self._read_image(data_sample['image']).convert('RGB')
161
+ data = self._process_image(image)
162
+ del image
163
+ with open(f"{self.cap_folder}/{data_sample['annotation']}", 'r') as f:
164
+ caption = json.load(f)[self.cap_source]
165
+ data.update(self._process_text(caption))
166
+
167
+ data.update(image_dir=self.local_folder, image_file=data_sample['image'])
168
+
169
+ return data
170
+
171
+ except Exception as e:
172
+ print(f"Error when reading {self.data_path}:{data_sample['image']}: {e}", flush=True)
173
+ return self._retry()
174
+
175
+
176
+ @BUILDER.register_module()
177
+ class VqaDataset(Dataset):
178
+ """Generic VQA / multimodal conversation dataset with robust IO & validation."""
179
+ # ---------- initialization ----------
180
+ def __init__(
181
+ self,
182
+ data_path: str,
183
+ tokenizer,  # required, listed first
+ template_map_fn: Callable,  # required, listed first
185
+ img_prefix: Optional[str] = None,
186
+ image_size: int = 512,
187
+ max_length: int = 2048,
188
+ image_length: int = 1089,
189
+ pad_image: bool = True,
190
+ min_image_size: int = 80,
191
+ image_token_patterns: Tuple[str, ...] = ('<image>', '[image]', '<img>'),
192
+ max_retry: int = 5,
193
+ ):
194
+ super().__init__()
195
+
196
+ self.img_prefix = img_prefix.rstrip("/") if img_prefix else None
197
+ self.image_size = image_size
198
+ self.max_length = max_length
199
+ self.image_length = image_length
200
+ self.pad_image = pad_image
201
+ self.min_image_size = min_image_size
202
+ self.image_token_patterns = list(image_token_patterns)
203
+ self.max_retry = max_retry
204
+
205
+ # build tokenizer and template
+ self.tokenizer = BUILDER.build(tokenizer)
+ self.template_map_fn = BUILDER.build(template_map_fn) if template_map_fn else None
+ # register <image> so self.image_token_idx (used in _format_conversation) is defined
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
+ self.image_token_idx = self.tokenizer.convert_tokens_to_ids("<image>")
208
+
209
+ # read jsonl file(s) from a path or directory
210
+ self.data_list = self._load_jsonl_list(data_path)
211
+ print(f"Loaded {len(self.data_list)} samples from {data_path}")
212
+
213
+ # ---------- data loading helpers ----------
214
+ @staticmethod
215
+ def _load_jsonl_list(path: str) -> List[Dict[str, Any]]:
216
+ data: List[Dict[str, Any]] = []
217
+ if path.endswith(".jsonl"):
218
+ files = [path]
219
+ else:
220
+ files = sorted(glob(os.path.join(path, "**/*.jsonl"), recursive=True))
221
+
222
+ for file in files:
223
+ with open(file, "r") as f:
224
+ for line in f:
225
+ line = line.strip()
226
+ if line:
227
+ data.append(json.loads(line))
228
+ return data
229
+
230
+ # ---------- basic interface ----------
231
+ def __len__(self) -> int:
232
+ return len(self.data_list)
233
+
234
+ # ---------- image processing ----------
235
+ def _get_image_path(self, img_file: str) -> str:
236
+ """保持绝对路径不变,否则加前缀"""
237
+ return img_file if os.path.isabs(img_file) else os.path.join(self.img_prefix, img_file)
238
+
239
+ def _read_image(self, img_file: str) -> Image.Image:
240
+ img_path = self._get_image_path(img_file)
241
+ try:
242
+ image = Image.open(img_path).convert("RGB")
243
+ except Exception as e:
244
+ raise FileNotFoundError(f"Cannot open image: {img_path} ({e})")
245
+
246
+ w, h = image.size
247
+ if w < self.min_image_size or h < self.min_image_size:
248
+ raise ValueError(f"Image too small: {img_path} ({w}x{h})")
249
+ ratio = w / h
250
+ if not (0.1 < ratio < 10):
251
+ raise ValueError(f"Odd aspect ratio ({ratio:.3f}) for {img_path}")
252
+
253
+ # pad / crop
254
+ image = expand2square(image, (127, 127, 127)) if self.pad_image else crop2square(image)
255
+ image = image.resize((self.image_size, self.image_size), resample=Image.BICUBIC)
256
+
257
+ px = torch.from_numpy(np.asarray(image)).float() / 255.0
258
+ px = 2 * px - 1.0
259
+ px = rearrange(px, "h w c -> c h w") # CHW
260
+ return px
261
+
262
+ # ---------- conversation handling ----------
263
+ def _replace_image_tokens(self, txt: str) -> str:
264
+ for pat in self.image_token_patterns:
265
+ if pat in txt:
266
+ txt = txt.replace(pat, str(self.image_token_idx))
267
+ return txt
268
+
269
+ def _format_conversation(self, turns: List[Dict[str, str]]) -> Dict[str, Any]:
270
+ """
271
+ Merge human/gpt turns into {'input': ..., 'output': ...} pairs.
+ A human turn and the gpt turn that follows form one pair; incomplete or empty pairs are skipped.
273
+ """
274
+ pairs = []
275
+
276
+ for i in range(0, len(turns), 2):  # one pair per two turns: human then gpt
277
+ if i + 1 < len(turns):  # make sure the gpt turn exists
278
+ human_turn = turns[i]
279
+ gpt_turn = turns[i + 1]
280
+
281
+ human_content = human_turn.get("value", "").strip()
282
+ gpt_content = gpt_turn.get("value", "").strip()
283
+
284
+ if not human_content.lstrip().startswith("<image>"):
285
+ human_content = f"<image>\n{human_content}"
286
+
287
+ if not human_content or not gpt_content:  # skip the pair if either side is empty
288
+ continue
289
+
290
+ # only the human turn carries the image token
+ # human_content = self._replace_image_tokens(human_content)  # replace with image_token_idx
292
+
293
+ pairs.append({"input": human_content, "output": gpt_content})
294
+
295
+ data_dict = {"conversation": pairs}
296
+ data_dict_ori = data_dict
297
+ if self.template_map_fn:
298
+ data_dict = self.template_map_fn(data_dict)
299
+
300
+ # encode the conversation
301
+ data_dict = encode_fn(
302
+ data_dict,
303
+ self.tokenizer,
304
+ self.max_length,
305
+ self.image_length,
306
+ input_ids_with_output=True,
307
+ with_image_token=True,
308
+ # also pass image_token_idx through
309
+ image_token_idx=self.image_token_idx
310
+ )
311
+
312
+ # sanity check: make sure the image token actually appears
313
+ img_tokens = (torch.tensor(data_dict["input_ids"]) == self.image_token_idx).sum().item()
314
+
315
+ # log the lengths with an f-string for readability
+ print(f"[check] input_ids length: {len(data_dict['input_ids'])}, image token count: {img_tokens}\n")
+ # print(f"[check] input_ids: {data_dict.get('input_ids', 'not set')}\n")
318
+ if img_tokens != 1088:
319
+ print(f"[异常对话]:{data_dict_ori}")
320
+
321
+ data_dict["type"] = "image2text" # 设置数据类型为 image2text
322
+ return data_dict
323
+
324
+
325
+ # ---------- main interface ----------
326
+ def __getitem__(self, idx: int) -> Dict[str, Any]:
327
+ for attempt in range(self.max_retry):
328
+ try:
329
+ sample = self.data_list[idx]
330
+ img_tensor = self._read_image(sample["image"])
331
+ text_data = self._format_conversation(sample.get("conversations", []))
332
+ return {
333
+ **text_data,
334
+ "pixel_values": img_tensor,
335
+ "image_file": sample["image"],
336
+ }
337
+ except Exception as e:
338
+ print(f"[Retry {attempt+1}/{self.max_retry}] idx={idx} error: {e}")
339
+ idx = random.randint(0, len(self) - 1)
340
+
341
+ # give up after repeated failures
342
+ raise RuntimeError(f"Failed to fetch valid sample after {self.max_retry} retries.")
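Judging from `__getitem__` and `_format_conversation` above, each jsonl record consumed by `VqaDataset` looks roughly like the sketch below (values and the `from` fields are illustrative; the code only pairs turns by position and reads their `value` keys):

# Hedged example of a single VqaDataset jsonl record (illustrative values only).
record = {
    "image": "images/0001.jpg",   # joined with img_prefix unless it is absolute
    "conversations": [
        {"from": "human", "value": "<image>\nWhat colour is the car?"},
        {"from": "gpt", "value": "The car is red."},
        {"from": "human", "value": "Where is it parked?"},
        {"from": "gpt", "value": "In front of a brick building."},
    ],
}
# _format_conversation pairs turns (0, 1), (2, 3), ... into
# [{"input": "<image>\n...", "output": "..."}, ...] before calling encode_fn.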
src/datasets/understanding/caption_prompts.py ADDED
@@ -0,0 +1,28 @@
1
+ dense_prompts = [
2
+ "Describe the image in detail.",
3
+ "Provide a comprehensive description of everything you see in the picture.",
4
+ "Explain the scene depicted in the image as if you were describing it to someone who cannot see it.",
5
+ "List all the objects and activities taking place in this image.",
6
+ "What is the story being told by this image? Describe in detail.",
7
+ "Imagine you are giving a detailed tour of the image's scene. How would you describe it?",
8
+ "Describe the foreground, background, and any notable features of the image.",
9
+ "How would you describe this image to build a replica of the scene?",
10
+ "Write a paragraph detailing the setting, characters, and actions visible in this image.",
11
+ "Describe every aspect of the image, including the environment, objects, and any people present.",
12
+ "Provide a detailed analysis of the composition and elements of the image.",
13
+ "What are the main focal points of this image? Describe them in detail.",
14
+ "Catalog all visible elements in the image and describe their significance to the overall scene."
15
+ ]
16
+
17
+
18
+ short_prompts = [
19
+ "Briefly describe the image",
20
+ "Summarize the key elements of the image in one sentence.",
21
+ "Give a concise description of the scene.",
22
+ "Briefly, what is happening in this image?",
23
+ "What is the most noticeable feature of this image?",
24
+ "Summarize the image in a sentence.",
25
+ "What activity is being depicted in the image?",
26
+ "Describe the setting of the image in a few words.",
27
+ "Caption this image for a social media post."
28
+ ]
src/datasets/understanding/vlm_datasets_sig.py ADDED
@@ -0,0 +1,168 @@
1
+ from torch.utils.data import Dataset
2
+ from PIL import Image
3
+ import os
4
+ import json
5
+ import random
6
+ import torch
7
+ import numpy as np
8
+ from einops import rearrange
9
+ from xtuner.registry import BUILDER
10
+ from xtuner.dataset.utils import expand2square
11
+ from src.datasets.utils import crop2square, encode_fn, load_jsonl
12
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
13
+ from transformers import AutoImageProcessor
14
+
15
+
16
+ class VLMDataset(Dataset):
17
+ def __init__(
18
+ self,
19
+ data_path,
20
+ image_size,
21
+ tokenizer=None,
22
+ template_map_fn=None,
23
+ max_length=2048,
24
+ min_image_size=80,
25
+ pad_image=True,
26
+ local_folder="",
27
+ key_value="conversations",
28
+ ):
29
+ super().__init__()
30
+ self.data_path = data_path
31
+ self._load_data(data_path)
32
+ self.image_size = image_size
33
+
34
+ self.tokenizer = BUILDER.build(tokenizer)
35
+ self.prompt_template = template_map_fn["template"]
36
+ self.template_map_fn = BUILDER.build(template_map_fn)
37
+ self.max_length = max_length
38
+ self.pad_image = pad_image
39
+ self.min_image_size = min_image_size
40
+ self.key_value = key_value
41
+ self.processor = AutoImageProcessor.from_pretrained(
42
+ "checkpoint/siglip2-so400m-patch16-512"
43
+ )
44
+ self.metainfo = {'task' :'unified'}
45
+ self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
46
+ m = n = self.image_size // 16
47
+ self.image_token_repeat = m * n + 64
48
+
49
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
50
+ self.image_token_idx = self.tokenizer.convert_tokens_to_ids("<image>")
51
+ print(f"Registered <image> token at index {self.image_token_idx}")
52
+
53
+ def _load_data(
54
+ self, data_path: str
55
+ ): # image path and annotation path are saved in a json file
56
+ self.data_list = load_jsonl(data_path)
57
+ print(f"Load {len(self.data_list)} data samples from {data_path}", flush=True)
58
+
59
+ def full_init(self):
60
+ """Dummy full_init to be compatible with MMEngine ConcatDataset."""
61
+ return
62
+ def __len__(self):
63
+ return len(self.data_list)
64
+
65
+ def _read_image(self, image_file):
66
+ image = Image.open(image_file)
67
+ assert (
68
+ image.width > self.min_image_size and image.height > self.min_image_size
69
+ ), f"Image: {image.size}"
70
+ assert image.width / image.height > 0.1, f"Image: {image.size}"
71
+ assert image.width / image.height < 10, f"Image: {image.size}"
72
+ return image.convert("RGB")
73
+
74
+ # def _process_image(self, image):
75
+ # data = dict()
76
+ # # if self.pad_image:
77
+ # # image = expand2square(image, (127, 127, 127))
78
+ # # else:
79
+ # # image = crop2square(image)
80
+
81
+ # # image = image.resize(size=(self.image_size, self.image_size))
82
+ # # pixel_values = torch.from_numpy(np.array(image)).float()
83
+ # # pixel_values = pixel_values / 255
84
+ # # pixel_values = 2 * pixel_values - 1
85
+ # # pixel_values = rearrange(pixel_values, "h w c -> c h w")
86
+ # image = image.resize((self.image_size, self.image_size))
87
+ # inputs = self.processor(images=image, return_tensors="pt")
88
+ # pixel_values = inputs["pixel_values"].squeeze(0)
89
+
90
+ # data.update(pixel_values=pixel_values)
91
+ # return data
92
+
93
+
94
+ def _process_image(self, image: Image.Image):
95
+ # 1) optionally crop/pad to square
96
+ if self.pad_image:
97
+ image = crop2square(image)
98
+ # 2) resize to the target size
99
+ image = image.resize((self.image_size, self.image_size))
100
+ # 3) to tensor & normalize
101
+ arr = np.array(image).astype(np.float32) / 255.0 # HWC
102
+ arr = 2 * arr - 1 # [-1,1]
103
+ tensor = torch.from_numpy(arr) # HWC
104
+ tensor = rearrange(tensor, "h w c -> c h w") # CHW
105
+ return {"pixel_values": tensor}
106
+ def _process_text(self, question, answer):
107
+ data_dict = dict(
108
+ conversation=[
109
+ {
110
+ "input": f"{self.DEFAULT_IMAGE_TOKEN}\n{question}",
111
+ "output": answer,
112
+ }
113
+ ]
114
+ )
115
+ data_dict.update(self.template_map_fn(data_dict))
116
+ data_dict.update(
117
+ encode_fn(
118
+ example=data_dict,
119
+ tokenizer=self.tokenizer,
120
+ max_length=self.max_length,
121
+ image_length=self.image_token_repeat,
122
+ input_ids_with_output=True,
123
+ with_image_token=True,
124
+ truncation='right',
125
+ image_token_idx=self.image_token_idx,
126
+ image_token_str=self.DEFAULT_IMAGE_TOKEN,
127
+ )
128
+ )
129
+
130
+ # assert (
131
+ # torch.tensor(data_dict["input_ids"]).long() == self.image_token_idx
132
+ # ).sum() == self.image_length, "Error in image format"
133
+
134
+ data_dict["type"] = "image2text"
135
+ return data_dict
136
+
137
+ def _retry(self):
138
+ return self.__getitem__(random.choice(range(self.__len__())))
139
+
140
+ def __getitem__(self, idx):
141
+ try:
142
+ data_sample = self.data_list[idx]
143
+ image = self._read_image(data_sample["image"]).convert("RGB")
144
+ data = self._process_image(image)
145
+ del image
146
+ question = (
147
+ data_sample[self.key_value][0]["value"]
148
+ .replace("<image>", "")
149
+ .strip()
150
+ )
151
+ answer = (
152
+ data_sample[self.key_value][1]["value"]
153
+ .replace("<image>", "")
154
+ .strip()
155
+ )
156
+
157
+ data.update(self._process_text(question, answer))
158
+
159
+ data.update(image_file=data_sample["image"])
160
+
161
+ return data
162
+
163
+ except Exception as e:
164
+ print(
165
+ f"Error when reading data_sample:{data_sample},{self.data_path}:{data_sample['image']}: {e}",
166
+ flush=True,
167
+ )
168
+ return self._retry()
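The `image_token_repeat` used by several datasets above is a 16-pixel patch grid plus 64 extra tokens; for `image_size = 512` that gives `32 * 32 + 64 = 1088`, matching the `img_tokens != 1088` check in `VqaDataset`. A one-function sketch of the same arithmetic:

# Expected number of <image> placeholder tokens for a given input resolution.
def image_token_repeat(image_size: int, patch: int = 16, extra: int = 64) -> int:
    m = n = image_size // patch
    return m * n + extra

assert image_token_repeat(512) == 1088   # the count VqaDataset validates against
print(image_token_repeat(256))           # 16 * 16 + 64 = 320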
src/datasets/utils.py ADDED
@@ -0,0 +1,303 @@
1
+ import copy
2
+ import random
3
+ from xtuner.dataset.utils import get_bos_eos_token_ids
4
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, IMAGE_TOKEN_INDEX
5
+ import json
6
+
7
+
8
+ # def crop2square(pil_img):
9
+ # width, height = pil_img.width, pil_img.height
10
+
11
+ # if width > height:
12
+ # y0, y1 = 0, height
13
+ # x0 = random.randint(0, width - height) # [0, w - h]
14
+ # x1 = x0 + height # [h, w]
15
+ # else:
16
+ # x0, x1 = 0, width
17
+ # y0 = random.randint(0, height - width) # [0, h - w]
18
+ # y1 = y0 + width # [w, h]
19
+
20
+ # return pil_img.crop(box=(x0, y0, x1, y1))
21
+
22
+ def crop2square(pil_img):
23
+ width, height = pil_img.width, pil_img.height
24
+ short = min(width, height)
25
+ left = (width - short) // 2
26
+ upper = (height - short) // 2
27
+ return pil_img.crop((left, upper, left + short, upper + short))
28
+ def load_jsonl(json_file):
29
+ with open(json_file) as f:
30
+ lines = f.readlines()
31
+ data = []
32
+ for line in lines:
33
+ data.append(json.loads(line))
34
+ return data
35
+
36
+
37
+ def encode_fn_original(example,
38
+ tokenizer,
39
+ max_length=None,
40
+ image_length=1,
41
+ input_ids_with_output=True,
42
+ with_image_token=False,
43
+ truncation='right',
44
+ image_token_idx=None,
45
+ image_token_str="<image>"):
46
+ """We only support the following three scenarios:
47
+
48
+ 1. Incremental pretraining dataset.
49
+ example['conversation'] = [
50
+ {
51
+ 'input': '',
52
+ 'output': '### Human: Can you write xxx'
53
+ }
54
+ ]
55
+
56
+ 2. Single-turn conversation dataset.
57
+ example['conversation'] = [
58
+ {
59
+ 'input': 'Give three tips for staying healthy.',
60
+ 'output': '1.Eat a balanced diet xxx'
61
+ }
62
+ ]
63
+
64
+ 3. Multi-turn conversation dataset.
65
+ example['conversation'] = [
66
+ {
67
+ 'input': 'Give three tips for staying healthy.',
68
+ 'output': '1.Eat a balanced diet xxx'
69
+ },
70
+ {
71
+ 'input': 'Please expand on the second point.',
72
+ 'output': 'Here is an expanded explanation of the xxx'
73
+ }
74
+ ]
75
+ """
76
+ bos_token_id, eos_token_id = get_bos_eos_token_ids(tokenizer)
77
+ if image_token_idx is None: # if not provided, fall back to the tokenizer's <image> token id
78
+ image_token_idx = tokenizer.convert_tokens_to_ids("<image>")
79
+
80
+ is_multi_turn_conversation = len(example['conversation']) > 1
81
+ if is_multi_turn_conversation:
82
+ assert input_ids_with_output
83
+
84
+ input_ids, labels = [], []
85
+ next_needs_bos_token = True
86
+ for single_turn_conversation in example['conversation']:
87
+ input = single_turn_conversation['input']
88
+ if image_token_str in input and with_image_token:
89
+ chunk_encode = [
90
+ tokenizer.encode(chunk, add_special_tokens=False)
91
+ for chunk in input.split(image_token_str)
92
+ ]
93
+ assert len(chunk_encode) == 2
94
+ input_encode = []
95
+ for idx, cur_chunk_encode in enumerate(chunk_encode):
96
+ input_encode.extend(cur_chunk_encode)
97
+ if idx != len(chunk_encode) - 1:
98
+ # input_encode.append(IMAGE_TOKEN_INDEX)
99
+ input_encode += [image_token_idx] * image_length
100
+
101
+ else:
102
+ input_encode = tokenizer.encode(input, add_special_tokens=False)
103
+ if next_needs_bos_token:
104
+ input_ids += bos_token_id
105
+ labels += [IGNORE_INDEX] * len(bos_token_id)
106
+ input_ids += input_encode
107
+ labels += [IGNORE_INDEX] * len(input_encode)
108
+ if input_ids_with_output and 'output' in single_turn_conversation:
109
+ # Add output
110
+ output_with_loss = single_turn_conversation.get(
111
+ 'output_with_loss', True)
112
+ output = single_turn_conversation['output']
113
+
114
+ if image_token_str in output and with_image_token:
115
+ chunk_encode = [
116
+ tokenizer.encode(chunk, add_special_tokens=False)
117
+ for chunk in output.split(image_token_str)
118
+ ]
119
+ assert len(chunk_encode) == 2
120
+ output_encode = []
121
+ for idx, cur_chunk_encode in enumerate(chunk_encode):
122
+ output_encode.extend(cur_chunk_encode)
123
+ if idx != len(chunk_encode) - 1:
124
+ output_encode += [image_token_idx] * image_length
125
+ else:
126
+ output_encode = tokenizer.encode(output, add_special_tokens=False)
127
+ # output_encode = tokenizer.encode(output, add_special_tokens=False)
128
+ input_ids += output_encode
129
+ if output_with_loss:
130
+ labels += copy.deepcopy(output_encode)
131
+ else:
132
+ labels += [IGNORE_INDEX] * len(output_encode)
133
+ # Add EOS_TOKEN (with loss)
134
+ if single_turn_conversation.get('need_eos_token', True):
135
+ next_needs_bos_token = True
136
+ input_ids += eos_token_id
137
+ if output_with_loss:
138
+ labels += copy.deepcopy(eos_token_id)
139
+ else:
140
+ labels += [IGNORE_INDEX] * len(eos_token_id)
141
+ else:
142
+ next_needs_bos_token = False
143
+ # Add SEP (without loss)
144
+ sep = single_turn_conversation.get('sep', '')
145
+ if sep != '':
146
+ sep_encode = tokenizer.encode(sep, add_special_tokens=False)
147
+ input_ids += sep_encode
148
+ labels += [IGNORE_INDEX] * len(sep_encode)
149
+
150
+ if max_length is not None and len(input_ids) > max_length:
151
+ if truncation == 'right':
152
+ input_ids = input_ids[:max_length]
153
+ labels = labels[:max_length]
154
+ elif truncation == 'left':
155
+ input_ids = input_ids[-max_length:]
156
+ labels = labels[-max_length:]
157
+ else:
158
+ assert truncation is None
159
+ return {'input_ids': input_ids, 'labels': labels}
160
+
161
+
162
+
163
+ def encode_fn(
164
+ example,
165
+ tokenizer,
166
+ prompt_template=None,
167
+ max_length=None,
168
+ image_length=1,
169
+ input_ids_with_output=True,
170
+ with_image_token=True,
171
+ truncation='right',
172
+ image_token_idx=None,
173
+ image_token_str="<image>",
174
+ ):
175
+ """
176
+ A versatile encoding function for both image-to-text (conversation) and text-to-image/image-editing tasks.
177
+
178
+ - Image-to-Text: example = {"conversation": [...]}, outputs input_ids + labels.
179
+ - Text-to-Image/Editing: example = str (raw text prompt), outputs input_ids + attention_mask.
180
+ """
181
+ # assert image_token_idx is not None, "Must pass image_token_idx explicitly"
182
+ # print(f"[DEBUG] image_token_idx = {image_token_idx}")
183
+ if image_token_idx is None:
184
+ tokenizer.add_tokens([image_token_str], special_tokens=True)
185
+ image_token_idx = tokenizer.convert_tokens_to_ids(image_token_str)
186
+
187
+ if isinstance(example, str):
188
+ assert prompt_template is not None, \
189
+ "prompt_template 不能为空(text2image/image-editing)"
190
+
191
+ # 1) Build the prompt
192
+ # (the <image> token ids are prepended to the encoded ids below,
193
+ # so only the raw text is wrapped by the template here)
194
+ prompt = f"{example.strip()}"
195
+ # wrap with the prompt template
196
+ prompt = prompt_template["INSTRUCTION"].format(input=prompt)
197
+
198
+ # 2) Encode with the tokenizer (do not let it split <image> as ordinary text)
199
+ # a simple approach: encode without special tokens, then splice the image token ids in manually
200
+ text_ids = tokenizer.encode(
201
+ prompt,
202
+ add_special_tokens=False,
203
+ truncation=True,
204
+ max_length=(max_length - image_length) if max_length else None
205
+ )
206
+ # insert the <image> token ids at the very front (or wherever they are needed)
207
+ input_ids = [image_token_idx] * image_length + text_ids
208
+
209
+ # 3) Truncate if the sequence exceeds max_length
210
+ if max_length is not None and len(input_ids) > max_length:
211
+ input_ids = input_ids[:max_length]
212
+
213
+ # 4) attention_mask
214
+ attention_mask = [1] * len(input_ids)
215
+
216
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
217
+
218
+ # --- Image-to-text task: multi-turn conversation structure ---
219
+ assert isinstance(example, dict) and "conversation" in example
220
+ bos_token_id, eos_token_id = get_bos_eos_token_ids(tokenizer)
221
+ is_multi_turn = len(example["conversation"]) > 1
222
+ if is_multi_turn:
223
+ assert input_ids_with_output
224
+
225
+ input_ids, labels = [], []
226
+ next_needs_bos_token = True
227
+
228
+ for single_turn in example["conversation"]:
229
+ input_text = single_turn["input"]
230
+
231
+ # ==== Encode input ====
232
+ if with_image_token and image_token_str in input_text:
233
+ chunks = input_text.split(image_token_str)
234
+ chunk_encoded = [tokenizer.encode(c, add_special_tokens=False) for c in chunks]
235
+ assert len(chunk_encoded) >= 2
236
+ input_encode = []
237
+ for i, chunk in enumerate(chunk_encoded):
238
+ input_encode.extend(chunk)
239
+ if i < len(chunk_encoded) - 1:
240
+ input_encode.extend([image_token_idx] * image_length)
241
+ else:
242
+ input_encode = tokenizer.encode(input_text, add_special_tokens=False)
243
+
244
+ if next_needs_bos_token:
245
+ input_ids.extend(bos_token_id)
246
+ labels.extend([IGNORE_INDEX] * len(bos_token_id))
247
+
248
+ input_ids.extend(input_encode)
249
+ labels.extend([IGNORE_INDEX] * len(input_encode))
250
+
251
+ # ==== Encode output ====
252
+ if input_ids_with_output and "output" in single_turn:
253
+ output = single_turn["output"]
254
+ output_with_loss = single_turn.get("output_with_loss", True)
255
+
256
+ if with_image_token and image_token_str in output:
257
+ chunks = output.split(image_token_str)
258
+ chunk_encoded = [tokenizer.encode(c, add_special_tokens=False) for c in chunks]
259
+ assert len(chunk_encoded) >= 2
260
+ output_encode = []
261
+ for i, chunk in enumerate(chunk_encoded):
262
+ output_encode.extend(chunk)
263
+ if i < len(chunk_encoded) - 1:
264
+ output_encode.extend([image_token_idx] * image_length)
265
+ else:
266
+ output_encode = tokenizer.encode(output, add_special_tokens=False)
267
+
268
+ input_ids.extend(output_encode)
269
+ if output_with_loss:
270
+ labels.extend(output_encode.copy())
271
+ else:
272
+ labels.extend([IGNORE_INDEX] * len(output_encode))
273
+
274
+ # ==== Append EOS ====
275
+ if single_turn.get("need_eos_token", True):
276
+ next_needs_bos_token = True
277
+ input_ids.extend(eos_token_id)
278
+ if output_with_loss:
279
+ labels.extend(eos_token_id.copy())
280
+ else:
281
+ labels.extend([IGNORE_INDEX] * len(eos_token_id))
282
+ else:
283
+ next_needs_bos_token = False
284
+
285
+ # ==== Append separator ====
286
+ sep = single_turn.get("sep", "")
287
+ if sep:
288
+ sep_encoded = tokenizer.encode(sep, add_special_tokens=False)
289
+ input_ids.extend(sep_encoded)
290
+ labels.extend([IGNORE_INDEX] * len(sep_encoded))
291
+
292
+ # ==== Truncation ====
293
+ if max_length is not None and len(input_ids) > max_length:
294
+ if truncation == "right":
295
+ input_ids = input_ids[:max_length]
296
+ labels = labels[:max_length]
297
+ elif truncation == "left":
298
+ input_ids = input_ids[-max_length:]
299
+ labels = labels[-max_length:]
300
+ else:
301
+ raise ValueError("truncation must be 'left', 'right', or None")
302
+
303
+ return {"input_ids": input_ids, "labels": labels}
src/models/mar/decoder.py ADDED
@@ -0,0 +1,102 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+ from timm.models.vision_transformer import Block
5
+ from functools import partial
6
+
7
+
8
+ class MARDecoder(nn.Module):
9
+ """ Masked Autoencoder with VisionTransformer backbone
10
+ """
11
+ def __init__(self, img_size=256, vae_stride=16,
12
+ patch_size=1,
13
+ # encoder_embed_dim=1024,
14
+ decoder_embed_dim=1024, decoder_depth=16, decoder_num_heads=16,
15
+ mlp_ratio=4.,
16
+ attn_dropout=0.1,
17
+ proj_dropout=0.1,
18
+ buffer_size=64,
19
+ grad_checkpointing=False,
20
+ ):
21
+ super().__init__()
22
+
23
+ # --------------------------------------------------------------------------
24
+ # VAE
25
+ self.img_size = img_size
26
+ self.vae_stride = vae_stride
27
+
28
+ self.seq_h = self.seq_w = img_size // vae_stride // patch_size
29
+ self.seq_len = self.seq_h * self.seq_w
30
+
31
+ self.grad_checkpointing = grad_checkpointing
32
+
33
+ # --------------------------------------------------------------------------
34
+ # MAR decoder specifics
35
+ self.buffer_size = buffer_size
36
+ # self.decoder_embed = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=True)
37
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
38
+ self.decoder_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len + self.buffer_size, decoder_embed_dim))
39
+ self.decoder_blocks = nn.ModuleList([
40
+ Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True,
41
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
42
+ proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(decoder_depth)])
43
+
44
+ self.decoder_norm = nn.LayerNorm(decoder_embed_dim, eps=1e-6)
45
+ self.diffusion_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len, decoder_embed_dim))
46
+
47
+ self.initialize_weights()
48
+
49
+ def initialize_weights(self):
50
+ # parameters
51
+
52
+ torch.nn.init.normal_(self.mask_token, std=.02)
53
+
54
+ torch.nn.init.normal_(self.decoder_pos_embed_learned, std=.02)
55
+ torch.nn.init.normal_(self.diffusion_pos_embed_learned, std=.02)
56
+
57
+ # initialize nn.Linear and nn.LayerNorm
58
+ self.apply(self._init_weights)
59
+
60
+ def _init_weights(self, m):
61
+ if isinstance(m, nn.Linear):
62
+ # we use xavier_uniform following official JAX ViT:
63
+ torch.nn.init.xavier_uniform_(m.weight)
64
+ if isinstance(m, nn.Linear) and m.bias is not None:
65
+ nn.init.constant_(m.bias, 0)
66
+ elif isinstance(m, nn.LayerNorm):
67
+ if m.bias is not None:
68
+ nn.init.constant_(m.bias, 0)
69
+ if m.weight is not None:
70
+ nn.init.constant_(m.weight, 1.0)
71
+
72
+ def forward(self, x, mask):
73
+
74
+ # x = self.decoder_embed(x)
75
+ mask_with_buffer = torch.cat([torch.zeros(x.size(0), self.buffer_size, device=x.device), mask], dim=1)
76
+
77
+ # pad mask tokens
78
+ mask_tokens = self.mask_token.repeat(mask_with_buffer.shape[0], mask_with_buffer.shape[1], 1).to(x.dtype)
79
+ x_after_pad = mask_tokens.clone()
80
+ x_after_pad[(1 - mask_with_buffer).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
81
+
82
+ # decoder position embedding
83
+ x = x_after_pad + self.decoder_pos_embed_learned
84
+
85
+ # apply Transformer blocks
86
+ if self.grad_checkpointing and not torch.jit.is_scripting():
87
+ for block in self.decoder_blocks:
88
+ x = checkpoint(block, x)
89
+ else:
90
+ for block in self.decoder_blocks:
91
+ x = block(x)
92
+ x = self.decoder_norm(x)
93
+
94
+ x = x[:, self.buffer_size:]
95
+ x = x + self.diffusion_pos_embed_learned
96
+ return x
97
+
98
+ def gradient_checkpointing_enable(self):
99
+ self.grad_checkpointing = True
100
+
101
+ def gradient_checkpointing_disable(self):
102
+ self.grad_checkpointing = False
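A small shape-check sketch for MARDecoder: embeddings of the kept (unmasked) tokens plus the buffer prefix go in, and a full-length sequence aligned with the diffusion positional embedding comes out. The tiny sizes are illustrative, the import path assumes this upload's layout, and an installed timm whose Block accepts proj_drop/attn_drop is assumed:

import torch
from src.models.mar.decoder import MARDecoder  # path as uploaded here

decoder = MARDecoder(img_size=64, vae_stride=16, patch_size=1,
                     decoder_embed_dim=64, decoder_depth=2,
                     decoder_num_heads=4, buffer_size=8)
B, seq_len, dim = 2, decoder.seq_len, 64  # seq_len = (64 // 16) ** 2 = 16

mask = torch.zeros(B, seq_len)            # 1 = masked, 0 = kept
mask[:, seq_len // 2:] = 1                # mask the second half of each sample

num_kept = int((1 - mask[0]).sum())       # 8 kept tokens per sample
x = torch.randn(B, decoder.buffer_size + num_kept, dim)

out = decoder(x, mask)                    # masked positions filled with mask_token
print(out.shape)                          # torch.Size([2, 16, 64])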
src/models/mar/diffloss.py ADDED
@@ -0,0 +1,249 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+ import math
5
+
6
+ from src.models.mar.diffusion import create_diffusion
7
+
8
+
9
+ class DiffLoss(nn.Module):
10
+ """Diffusion Loss"""
11
+ def __init__(self, target_channels, z_channels, depth, width, num_sampling_steps, grad_checkpointing=False):
12
+ super(DiffLoss, self).__init__()
13
+ self.in_channels = target_channels
14
+ self.net = SimpleMLPAdaLN(
15
+ in_channels=target_channels,
16
+ model_channels=width,
17
+ out_channels=target_channels * 2, # for vlb loss
18
+ z_channels=z_channels,
19
+ num_res_blocks=depth,
20
+ grad_checkpointing=grad_checkpointing
21
+ )
22
+
23
+ self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="cosine")
24
+ self.gen_diffusion = create_diffusion(timestep_respacing=num_sampling_steps, noise_schedule="cosine")
25
+
26
+ def forward(self, target, z, mask=None):
27
+ t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
28
+ model_kwargs = dict(c=z)
29
+ loss_dict = self.train_diffusion.training_losses(self.net, target, t, model_kwargs)
30
+ loss = loss_dict["loss"]
31
+ if mask is not None:
32
+ loss = (loss * mask).sum() / mask.sum()
33
+ return loss.mean()
34
+
35
+ def sample(self, z, temperature=1.0, cfg=1.0):
36
+ # diffusion loss sampling
37
+ if not cfg == 1.0:
38
+ noise = torch.randn(z.shape[0] // 2, self.in_channels).cuda()
39
+ noise = torch.cat([noise, noise], dim=0)
40
+ model_kwargs = dict(c=z, cfg_scale=cfg)
41
+ sample_fn = self.net.forward_with_cfg
42
+ else:
43
+ noise = torch.randn(z.shape[0], self.in_channels).cuda()
44
+ model_kwargs = dict(c=z)
45
+ sample_fn = self.net.forward
46
+
47
+ sampled_token_latent = self.gen_diffusion.p_sample_loop(
48
+ sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs, progress=False,
49
+ temperature=temperature
50
+ )
51
+
52
+ return sampled_token_latent
53
+
54
+
55
+ def modulate(x, shift, scale):
56
+ return x * (1 + scale) + shift
57
+
58
+
59
+ class TimestepEmbedder(nn.Module):
60
+ """
61
+ Embeds scalar timesteps into vector representations.
62
+ """
63
+ def __init__(self, hidden_size, frequency_embedding_size=256):
64
+ super().__init__()
65
+ self.mlp = nn.Sequential(
66
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
67
+ nn.SiLU(),
68
+ nn.Linear(hidden_size, hidden_size, bias=True),
69
+ )
70
+ self.frequency_embedding_size = frequency_embedding_size
71
+
72
+ @staticmethod
73
+ def timestep_embedding(t, dim, max_period=10000):
74
+ """
75
+ Create sinusoidal timestep embeddings.
76
+ :param t: a 1-D Tensor of N indices, one per batch element.
77
+ These may be fractional.
78
+ :param dim: the dimension of the output.
79
+ :param max_period: controls the minimum frequency of the embeddings.
80
+ :return: an (N, D) Tensor of positional embeddings.
81
+ """
82
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
83
+ half = dim // 2
84
+ freqs = torch.exp(
85
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
86
+ ).to(device=t.device)
87
+ args = t[:, None].float() * freqs[None]
88
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
89
+ if dim % 2:
90
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
91
+ return embedding
92
+
93
+ def forward(self, t):
94
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
95
+ t_emb = self.mlp(t_freq.to(self.mlp[0].weight.data.dtype))
96
+ return t_emb
97
+
98
+
99
+ class ResBlock(nn.Module):
100
+ """
101
+ A residual block that can optionally change the number of channels.
102
+ :param channels: the number of input channels.
103
+ """
104
+
105
+ def __init__(
106
+ self,
107
+ channels
108
+ ):
109
+ super().__init__()
110
+ self.channels = channels
111
+
112
+ self.in_ln = nn.LayerNorm(channels, eps=1e-6)
113
+ self.mlp = nn.Sequential(
114
+ nn.Linear(channels, channels, bias=True),
115
+ nn.SiLU(),
116
+ nn.Linear(channels, channels, bias=True),
117
+ )
118
+
119
+ self.adaLN_modulation = nn.Sequential(
120
+ nn.SiLU(),
121
+ nn.Linear(channels, 3 * channels, bias=True)
122
+ )
123
+
124
+ def forward(self, x, y):
125
+ shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
126
+ h = modulate(self.in_ln(x), shift_mlp, scale_mlp)
127
+ h = self.mlp(h)
128
+ return x + gate_mlp * h
129
+
130
+
131
+ class FinalLayer(nn.Module):
132
+ """
133
+ The final layer adopted from DiT.
134
+ """
135
+ def __init__(self, model_channels, out_channels):
136
+ super().__init__()
137
+ self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
138
+ self.linear = nn.Linear(model_channels, out_channels, bias=True)
139
+ self.adaLN_modulation = nn.Sequential(
140
+ nn.SiLU(),
141
+ nn.Linear(model_channels, 2 * model_channels, bias=True)
142
+ )
143
+
144
+ def forward(self, x, c):
145
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
146
+ x = modulate(self.norm_final(x), shift, scale)
147
+ x = self.linear(x)
148
+ return x
149
+
150
+
151
+ class SimpleMLPAdaLN(nn.Module):
152
+ """
153
+ The MLP for Diffusion Loss.
154
+ :param in_channels: channels in the input Tensor.
155
+ :param model_channels: base channel count for the model.
156
+ :param out_channels: channels in the output Tensor.
157
+ :param z_channels: channels in the condition.
158
+ :param num_res_blocks: number of residual blocks per downsample.
159
+ """
160
+
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ model_channels,
165
+ out_channels,
166
+ z_channels,
167
+ num_res_blocks,
168
+ grad_checkpointing=False
169
+ ):
170
+ super().__init__()
171
+
172
+ self.in_channels = in_channels
173
+ self.model_channels = model_channels
174
+ self.out_channels = out_channels
175
+ self.num_res_blocks = num_res_blocks
176
+ self.grad_checkpointing = grad_checkpointing
177
+
178
+ self.time_embed = TimestepEmbedder(model_channels)
179
+ self.cond_embed = nn.Linear(z_channels, model_channels)
180
+
181
+ self.input_proj = nn.Linear(in_channels, model_channels)
182
+
183
+ res_blocks = []
184
+ for i in range(num_res_blocks):
185
+ res_blocks.append(ResBlock(
186
+ model_channels,
187
+ ))
188
+
189
+ self.res_blocks = nn.ModuleList(res_blocks)
190
+ self.final_layer = FinalLayer(model_channels, out_channels)
191
+
192
+ self.initialize_weights()
193
+
194
+ def initialize_weights(self):
195
+ def _basic_init(module):
196
+ if isinstance(module, nn.Linear):
197
+ torch.nn.init.xavier_uniform_(module.weight)
198
+ if module.bias is not None:
199
+ nn.init.constant_(module.bias, 0)
200
+ self.apply(_basic_init)
201
+
202
+ # Initialize timestep embedding MLP
203
+ nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
204
+ nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
205
+
206
+ # Zero-out adaLN modulation layers
207
+ for block in self.res_blocks:
208
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
209
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
210
+
211
+ # Zero-out output layers
212
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
213
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
214
+ nn.init.constant_(self.final_layer.linear.weight, 0)
215
+ nn.init.constant_(self.final_layer.linear.bias, 0)
216
+
217
+ def forward(self, x, t, c):
218
+ """
219
+ Apply the model to an input batch.
220
+ :param x: an [N x C] Tensor of inputs.
221
+ :param t: a 1-D batch of timesteps.
222
+ :param c: conditioning from AR transformer.
223
+ :return: an [N x C] Tensor of outputs.
224
+ """
225
+ # import pdb; pdb.set_trace()
226
+ x = self.input_proj(x.to(self.input_proj.weight.data.dtype))
227
+ t = self.time_embed(t)
228
+ c = self.cond_embed(c.to(self.cond_embed.weight.data.dtype))
229
+
230
+ y = t + c
231
+
232
+ if self.grad_checkpointing and not torch.jit.is_scripting():
233
+ for block in self.res_blocks:
234
+ x = checkpoint(block, x, y)
235
+ else:
236
+ for block in self.res_blocks:
237
+ x = block(x, y)
238
+
239
+ return self.final_layer(x, y)
240
+
241
+ def forward_with_cfg(self, x, t, c, cfg_scale):
242
+ half = x[: len(x) // 2]
243
+ combined = torch.cat([half, half], dim=0)
244
+ model_out = self.forward(combined, t, c)
245
+ eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
246
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
247
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
248
+ eps = torch.cat([half_eps, half_eps], dim=0)
249
+ return torch.cat([eps, rest], dim=1)
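A CPU-only training-side sketch of DiffLoss: per-token latents are the diffusion target and the conditioning vector z drives SimpleMLPAdaLN through adaLN modulation. The tiny widths and the flattened (tokens, channels) layout are illustrative assumptions, the import path follows this upload's layout, and note that sample() additionally requires CUDA because it allocates noise with .cuda():

import torch
from src.models.mar.diffloss import DiffLoss  # path as uploaded here

diffloss = DiffLoss(target_channels=8, z_channels=16,
                    depth=2, width=32, num_sampling_steps="50")

tokens = 6                             # e.g. batch * masked positions, flattened
target = torch.randn(tokens, 8)        # latents to be denoised
z = torch.randn(tokens, 16)            # conditioning from the AR transformer
mask = torch.ones(tokens)              # weight every token equally

loss = diffloss(target, z, mask=mask)  # scalar diffusion loss
print(loss.item())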
src/models/mar/diffusion/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ # Adopted from DiT, which is modified from OpenAI's diffusion repos
2
+ # DiT: https://github.com/facebookresearch/DiT/diffusion
3
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
4
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
5
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
6
+
7
+ from . import gaussian_diffusion as gd
8
+ from .respace import SpacedDiffusion, space_timesteps
9
+
10
+
11
+ def create_diffusion(
12
+ timestep_respacing,
13
+ noise_schedule="highres_cosine",
14
+ use_kl=False,
15
+ sigma_small=False,
16
+ predict_xstart=False,
17
+ learn_sigma=True,
18
+ rescale_learned_sigmas=False,
19
+ diffusion_steps=1000
20
+ ):
21
+ betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
22
+ if use_kl:
23
+ loss_type = gd.LossType.RESCALED_KL
24
+ elif rescale_learned_sigmas:
25
+ loss_type = gd.LossType.RESCALED_MSE
26
+ else:
27
+ loss_type = gd.LossType.MSE
28
+ if timestep_respacing is None or timestep_respacing == "":
29
+ timestep_respacing = [diffusion_steps]
30
+ return SpacedDiffusion(
31
+ use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
32
+ betas=betas,
33
+ model_mean_type=(
34
+ gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
35
+ ),
36
+ model_var_type=(
37
+ (
38
+ gd.ModelVarType.FIXED_LARGE
39
+ if not sigma_small
40
+ else gd.ModelVarType.FIXED_SMALL
41
+ )
42
+ if not learn_sigma
43
+ else gd.ModelVarType.LEARNED_RANGE
44
+ ),
45
+ loss_type=loss_type
46
+ # rescale_timesteps=rescale_timesteps,
47
+ )
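create_diffusion is called twice by DiffLoss: with an empty timestep_respacing for the full training chain and with a step-count string for faster sampling. A short sketch (it assumes SpacedDiffusion exposes num_timesteps after respacing, as in the OpenAI reference code this is adapted from, and uses this upload's import path):

from src.models.mar.diffusion import create_diffusion  # path as uploaded here

train_diff = create_diffusion(timestep_respacing="", noise_schedule="cosine")
gen_diff = create_diffusion(timestep_respacing="50", noise_schedule="cosine")

print(train_diff.num_timesteps)  # full chain used for training losses
print(gen_diff.num_timesteps)    # respaced chain used at sampling time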
src/models/mar/diffusion/diffusion_utils.py ADDED
@@ -0,0 +1,73 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ import torch as th
7
+ import numpy as np
8
+
9
+
10
+ def normal_kl(mean1, logvar1, mean2, logvar2):
11
+ """
12
+ Compute the KL divergence between two gaussians.
13
+ Shapes are automatically broadcasted, so batches can be compared to
14
+ scalars, among other use cases.
15
+ """
16
+ tensor = None
17
+ for obj in (mean1, logvar1, mean2, logvar2):
18
+ if isinstance(obj, th.Tensor):
19
+ tensor = obj
20
+ break
21
+ assert tensor is not None, "at least one argument must be a Tensor"
22
+
23
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
24
+ # Tensors, but it does not work for th.exp().
25
+ logvar1, logvar2 = [
26
+ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
27
+ for x in (logvar1, logvar2)
28
+ ]
29
+
30
+ return 0.5 * (
31
+ -1.0
32
+ + logvar2
33
+ - logvar1
34
+ + th.exp(logvar1 - logvar2)
35
+ + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
36
+ )
37
+
38
+
39
+ def approx_standard_normal_cdf(x):
40
+ """
41
+ A fast approximation of the cumulative distribution function of the
42
+ standard normal.
43
+ """
44
+ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
45
+
46
+
47
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
48
+ """
49
+ Compute the log-likelihood of a Gaussian distribution discretizing to a
50
+ given image.
51
+ :param x: the target images. It is assumed that this was uint8 values,
52
+ rescaled to the range [-1, 1].
53
+ :param means: the Gaussian mean Tensor.
54
+ :param log_scales: the Gaussian log stddev Tensor.
55
+ :return: a tensor like x of log probabilities (in nats).
56
+ """
57
+ assert x.shape == means.shape == log_scales.shape
58
+ centered_x = x - means
59
+ inv_stdv = th.exp(-log_scales)
60
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
61
+ cdf_plus = approx_standard_normal_cdf(plus_in)
62
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
63
+ cdf_min = approx_standard_normal_cdf(min_in)
64
+ log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
65
+ log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
66
+ cdf_delta = cdf_plus - cdf_min
67
+ log_probs = th.where(
68
+ x < -0.999,
69
+ log_cdf_plus,
70
+ th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
71
+ )
72
+ assert log_probs.shape == x.shape
73
+ return log_probs
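A quick numeric check of normal_kl against the closed form, KL(N(1, 1) || N(0, 1)) = 0.5 nats; at least one argument must be a tensor so the helper can infer device and dtype (import path assumed from this upload's layout):

import torch as th
from src.models.mar.diffusion.diffusion_utils import normal_kl  # path as uploaded here

kl = normal_kl(th.tensor(1.0), th.tensor(0.0), 0.0, 0.0)  # means 1 vs 0, unit variances
print(kl)  # tensor(0.5000)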
src/models/mar/diffusion/gaussian_diffusion.py ADDED
@@ -0,0 +1,884 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch as th
11
+ import enum
12
+
13
+ from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
14
+
15
+
16
+ def mean_flat(tensor):
17
+ """
18
+ Take the mean over all non-batch dimensions.
19
+ """
20
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
21
+
22
+
23
+ class ModelMeanType(enum.Enum):
24
+ """
25
+ Which type of output the model predicts.
26
+ """
27
+
28
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
29
+ START_X = enum.auto() # the model predicts x_0
30
+ EPSILON = enum.auto() # the model predicts epsilon
31
+
32
+
33
+ class ModelVarType(enum.Enum):
34
+ """
35
+ What is used as the model's output variance.
36
+ The LEARNED_RANGE option has been added to allow the model to predict
37
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
38
+ """
39
+
40
+ LEARNED = enum.auto()
41
+ FIXED_SMALL = enum.auto()
42
+ FIXED_LARGE = enum.auto()
43
+ LEARNED_RANGE = enum.auto()
44
+
45
+
46
+ class LossType(enum.Enum):
47
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
48
+ RESCALED_MSE = (
49
+ enum.auto()
50
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
51
+ KL = enum.auto() # use the variational lower-bound
52
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
53
+
54
+ def is_vb(self):
55
+ return self == LossType.KL or self == LossType.RESCALED_KL
56
+
57
+
58
+ def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
59
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
60
+ warmup_time = int(num_diffusion_timesteps * warmup_frac)
61
+ betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
62
+ return betas
63
+
64
+
65
+ def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
66
+ """
67
+ This is the deprecated API for creating beta schedules.
68
+ See get_named_beta_schedule() for the new library of schedules.
69
+ """
70
+ if beta_schedule == "quad":
71
+ betas = (
72
+ np.linspace(
73
+ beta_start ** 0.5,
74
+ beta_end ** 0.5,
75
+ num_diffusion_timesteps,
76
+ dtype=np.float64,
77
+ )
78
+ ** 2
79
+ )
80
+ elif beta_schedule == "linear":
81
+ betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
82
+ elif beta_schedule == "warmup10":
83
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
84
+ elif beta_schedule == "warmup50":
85
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
86
+ elif beta_schedule == "const":
87
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
88
+ elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
89
+ betas = 1.0 / np.linspace(
90
+ num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
91
+ )
92
+ else:
93
+ raise NotImplementedError(beta_schedule)
94
+ assert betas.shape == (num_diffusion_timesteps,)
95
+ return betas
96
+
97
+
98
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
99
+ """
100
+ Get a pre-defined beta schedule for the given name.
101
+ The beta schedule library consists of beta schedules which remain similar
102
+ in the limit of num_diffusion_timesteps.
103
+ Beta schedules may be added, but should not be removed or changed once
104
+ they are committed to maintain backwards compatibility.
105
+ """
106
+ if schedule_name == "linear":
107
+ # Linear schedule from Ho et al, extended to work for any number of
108
+ # diffusion steps.
109
+ scale = 1000 / num_diffusion_timesteps
110
+ return get_beta_schedule(
111
+ "linear",
112
+ beta_start=scale * 0.0001,
113
+ beta_end=scale * 0.02,
114
+ num_diffusion_timesteps=num_diffusion_timesteps,
115
+ )
116
+ elif schedule_name == "cosine":
117
+ return betas_for_alpha_bar(
118
+ num_diffusion_timesteps,
119
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
120
+ )
121
+ elif schedule_name == "highres_cosine":
122
+ # Custom smoother cosine schedule for high-resolution diffusion
123
+ return betas_for_alpha_bar(
124
+ num_diffusion_timesteps,
125
+ lambda t: math.cos((t + 0.005) / 1.005 * math.pi / 2) ** 2,
126
+ max_beta=0.2, # conservative to avoid over-noising at high-res
127
+ )
128
+ else:
129
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
130
+
131
+
132
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.2):
133
+ """
134
+ Create a beta schedule that discretizes the given alpha_t_bar function,
135
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
136
+ :param num_diffusion_timesteps: the number of betas to produce.
137
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
138
+ produces the cumulative product of (1-beta) up to that
139
+ part of the diffusion process.
140
+ :param max_beta: the maximum beta to use; use values lower than 1 to
141
+ prevent singularities.
142
+ """
143
+ betas = []
144
+ for i in range(num_diffusion_timesteps):
145
+ t1 = i / num_diffusion_timesteps
146
+ t2 = (i + 1) / num_diffusion_timesteps
147
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
148
+ return np.array(betas)
149
+
150
+
151
+ class GaussianDiffusion:
152
+ """
153
+ Utilities for training and sampling diffusion models.
154
+ Original ported from this codebase:
155
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
156
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
157
+ starting at T and going to 1.
158
+ """
159
+
160
+ def __init__(
161
+ self,
162
+ *,
163
+ betas,
164
+ model_mean_type,
165
+ model_var_type,
166
+ loss_type
167
+ ):
168
+
169
+ self.model_mean_type = model_mean_type
170
+ self.model_var_type = model_var_type
171
+ self.loss_type = loss_type
172
+
173
+ # Use float64 for accuracy.
174
+ betas = np.array(betas, dtype=np.float64)
175
+ self.betas = betas
176
+ assert len(betas.shape) == 1, "betas must be 1-D"
177
+ assert (betas > 0).all() and (betas <= 1).all()
178
+
179
+ self.num_timesteps = int(betas.shape[0])
180
+
181
+ alphas = 1.0 - betas
182
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
183
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
184
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
185
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
186
+
187
+ # calculations for diffusion q(x_t | x_{t-1}) and others
188
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
189
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
190
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
191
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
192
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
193
+
194
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
195
+ self.posterior_variance = (
196
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
197
+ )
198
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
199
+ self.posterior_log_variance_clipped = np.log(
200
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
201
+ ) if len(self.posterior_variance) > 1 else np.array([])
202
+
203
+ self.posterior_mean_coef1 = (
204
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
205
+ )
206
+ self.posterior_mean_coef2 = (
207
+ (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
208
+ )
209
+
210
+ def q_mean_variance(self, x_start, t):
211
+ """
212
+ Get the distribution q(x_t | x_0).
213
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
214
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
215
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
216
+ """
217
+ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
218
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
219
+ log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
220
+ return mean, variance, log_variance
221
+
222
+ def q_sample(self, x_start, t, noise=None):
223
+ """
224
+ Diffuse the data for a given number of diffusion steps.
225
+ In other words, sample from q(x_t | x_0).
226
+ :param x_start: the initial data batch.
227
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
228
+ :param noise: if specified, the split-out normal noise.
229
+ :return: A noisy version of x_start.
230
+ """
231
+ if noise is None:
232
+ noise = th.randn_like(x_start)
233
+ assert noise.shape == x_start.shape
234
+ return (
235
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
236
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
237
+ )
238
+
239
+ def q_posterior_mean_variance(self, x_start, x_t, t):
240
+ """
241
+ Compute the mean and variance of the diffusion posterior:
242
+ q(x_{t-1} | x_t, x_0)
243
+ """
244
+ assert x_start.shape == x_t.shape
245
+ posterior_mean = (
246
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
247
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
248
+ )
249
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
250
+ posterior_log_variance_clipped = _extract_into_tensor(
251
+ self.posterior_log_variance_clipped, t, x_t.shape
252
+ )
253
+ assert (
254
+ posterior_mean.shape[0]
255
+ == posterior_variance.shape[0]
256
+ == posterior_log_variance_clipped.shape[0]
257
+ == x_start.shape[0]
258
+ )
259
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
260
+
261
+ def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
262
+ """
263
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
264
+ the initial x, x_0.
265
+ :param model: the model, which takes a signal and a batch of timesteps
266
+ as input.
267
+ :param x: the [N x C x ...] tensor at time t.
268
+ :param t: a 1-D Tensor of timesteps.
269
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
270
+ :param denoised_fn: if not None, a function which applies to the
271
+ x_start prediction before it is used to sample. Applies before
272
+ clip_denoised.
273
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
274
+ pass to the model. This can be used for conditioning.
275
+ :return: a dict with the following keys:
276
+ - 'mean': the model mean output.
277
+ - 'variance': the model variance output.
278
+ - 'log_variance': the log of 'variance'.
279
+ - 'pred_xstart': the prediction for x_0.
280
+ """
281
+ if model_kwargs is None:
282
+ model_kwargs = {}
283
+
284
+ B, C = x.shape[:2]
285
+ assert t.shape == (B,)
286
+ model_output = model(x, t, **model_kwargs)
287
+ if isinstance(model_output, tuple):
288
+ model_output, extra = model_output
289
+ else:
290
+ extra = None
291
+
292
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
293
+ assert model_output.shape == (B, C * 2, *x.shape[2:])
294
+ model_output, model_var_values = th.split(model_output, C, dim=1)
295
+ min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
296
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
297
+ # The model_var_values is [-1, 1] for [min_var, max_var].
298
+ frac = (model_var_values + 1) / 2
299
+ model_log_variance = frac * max_log + (1 - frac) * min_log
300
+ model_variance = th.exp(model_log_variance)
301
+ else:
302
+ model_variance, model_log_variance = {
303
+ # for fixedlarge, we set the initial (log-)variance like so
304
+ # to get a better decoder log likelihood.
305
+ ModelVarType.FIXED_LARGE: (
306
+ np.append(self.posterior_variance[1], self.betas[1:]),
307
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
308
+ ),
309
+ ModelVarType.FIXED_SMALL: (
310
+ self.posterior_variance,
311
+ self.posterior_log_variance_clipped,
312
+ ),
313
+ }[self.model_var_type]
314
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
315
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
316
+
317
+ def process_xstart(x):
318
+ if denoised_fn is not None:
319
+ x = denoised_fn(x)
320
+ if clip_denoised:
321
+ return x.clamp(-1, 1)
322
+ return x
323
+
324
+ if self.model_mean_type == ModelMeanType.START_X:
325
+ pred_xstart = process_xstart(model_output)
326
+ else:
327
+ pred_xstart = process_xstart(
328
+ self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
329
+ )
330
+ model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
331
+
332
+ assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
333
+ return {
334
+ "mean": model_mean,
335
+ "variance": model_variance,
336
+ "log_variance": model_log_variance,
337
+ "pred_xstart": pred_xstart,
338
+ "extra": extra,
339
+ }
340
+
341
+ def _predict_xstart_from_eps(self, x_t, t, eps):
342
+ assert x_t.shape == eps.shape
343
+ return (
344
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
345
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
346
+ )
347
+
348
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
349
+ return (
350
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
351
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
352
+
353
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
354
+ """
355
+ Compute the mean for the previous step, given a function cond_fn that
356
+ computes the gradient of a conditional log probability with respect to
357
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
358
+ condition on y.
359
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
360
+ """
361
+ gradient = cond_fn(x, t, **model_kwargs)
362
+ new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
363
+ return new_mean
364
+
365
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
366
+ """
367
+ Compute what the p_mean_variance output would have been, should the
368
+ model's score function be conditioned by cond_fn.
369
+ See condition_mean() for details on cond_fn.
370
+ Unlike condition_mean(), this instead uses the conditioning strategy
371
+ from Song et al (2020).
372
+ """
373
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
374
+
375
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
376
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
377
+
378
+ out = p_mean_var.copy()
379
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
380
+ out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
381
+ return out
382
+
383
+ def p_sample(
384
+ self,
385
+ model,
386
+ x,
387
+ t,
388
+ clip_denoised=True,
389
+ denoised_fn=None,
390
+ cond_fn=None,
391
+ model_kwargs=None,
392
+ temperature=1.0
393
+ ):
394
+ """
395
+ Sample x_{t-1} from the model at the given timestep.
396
+ :param model: the model to sample from.
397
+ :param x: the current tensor at x_{t-1}.
398
+ :param t: the value of t, starting at 0 for the first diffusion step.
399
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
400
+ :param denoised_fn: if not None, a function which applies to the
401
+ x_start prediction before it is used to sample.
402
+ :param cond_fn: if not None, this is a gradient function that acts
403
+ similarly to the model.
404
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
405
+ pass to the model. This can be used for conditioning.
406
+ :param temperature: temperature scaling during Diff Loss sampling.
407
+ :return: a dict containing the following keys:
408
+ - 'sample': a random sample from the model.
409
+ - 'pred_xstart': a prediction of x_0.
410
+ """
411
+ out = self.p_mean_variance(
412
+ model,
413
+ x,
414
+ t,
415
+ clip_denoised=clip_denoised,
416
+ denoised_fn=denoised_fn,
417
+ model_kwargs=model_kwargs,
418
+ )
419
+ noise = th.randn_like(x)
420
+ nonzero_mask = (
421
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
422
+ ) # no noise when t == 0
423
+ if cond_fn is not None:
424
+ out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
425
+ # scale the noise by temperature
426
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise * temperature
427
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
428
+
429
+ def p_sample_loop(
430
+ self,
431
+ model,
432
+ shape,
433
+ noise=None,
434
+ clip_denoised=True,
435
+ denoised_fn=None,
436
+ cond_fn=None,
437
+ model_kwargs=None,
438
+ device=None,
439
+ progress=False,
440
+ temperature=1.0,
441
+ ):
442
+ """
443
+ Generate samples from the model.
444
+ :param model: the model module.
445
+ :param shape: the shape of the samples, (N, C, H, W).
446
+ :param noise: if specified, the noise from the encoder to sample.
447
+ Should be of the same shape as `shape`.
448
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
449
+ :param denoised_fn: if not None, a function which applies to the
450
+ x_start prediction before it is used to sample.
451
+ :param cond_fn: if not None, this is a gradient function that acts
452
+ similarly to the model.
453
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
454
+ pass to the model. This can be used for conditioning.
455
+ :param device: if specified, the device to create the samples on.
456
+ If not specified, use a model parameter's device.
457
+ :param progress: if True, show a tqdm progress bar.
458
+ :param temperature: temperature scaling during Diff Loss sampling.
459
+ :return: a non-differentiable batch of samples.
460
+ """
461
+ final = None
462
+ for sample in self.p_sample_loop_progressive(
463
+ model,
464
+ shape,
465
+ noise=noise,
466
+ clip_denoised=clip_denoised,
467
+ denoised_fn=denoised_fn,
468
+ cond_fn=cond_fn,
469
+ model_kwargs=model_kwargs,
470
+ device=device,
471
+ progress=progress,
472
+ temperature=temperature,
473
+ ):
474
+ final = sample
475
+ return final["sample"]
476
+
477
+ def p_sample_loop_progressive(
478
+ self,
479
+ model,
480
+ shape,
481
+ noise=None,
482
+ clip_denoised=True,
483
+ denoised_fn=None,
484
+ cond_fn=None,
485
+ model_kwargs=None,
486
+ device=None,
487
+ progress=False,
488
+ temperature=1.0,
489
+ ):
490
+ """
491
+ Generate samples from the model and yield intermediate samples from
492
+ each timestep of diffusion.
493
+ Arguments are the same as p_sample_loop().
494
+ Returns a generator over dicts, where each dict is the return value of
495
+ p_sample().
496
+ """
497
+ assert isinstance(shape, (tuple, list))
498
+ if noise is not None:
499
+ img = noise
500
+ else:
501
+ img = th.randn(*shape).cuda()
502
+ indices = list(range(self.num_timesteps))[::-1]
503
+
504
+ if progress:
505
+ # Lazy import so that we don't depend on tqdm.
506
+ from tqdm.auto import tqdm
507
+
508
+ indices = tqdm(indices)
509
+
510
+ for i in indices:
511
+ t = th.tensor([i] * shape[0]).cuda()
512
+ with th.no_grad():
513
+ out = self.p_sample(
514
+ model,
515
+ img,
516
+ t,
517
+ clip_denoised=clip_denoised,
518
+ denoised_fn=denoised_fn,
519
+ cond_fn=cond_fn,
520
+ model_kwargs=model_kwargs,
521
+ temperature=temperature,
522
+ )
523
+ yield out
524
+ img = out["sample"]
525
+
526
+ def ddim_sample(
527
+ self,
528
+ model,
529
+ x,
530
+ t,
531
+ clip_denoised=True,
532
+ denoised_fn=None,
533
+ cond_fn=None,
534
+ model_kwargs=None,
535
+ eta=0.0,
536
+ ):
537
+ """
538
+ Sample x_{t-1} from the model using DDIM.
539
+ Same usage as p_sample().
540
+ """
541
+ out = self.p_mean_variance(
542
+ model,
543
+ x,
544
+ t,
545
+ clip_denoised=clip_denoised,
546
+ denoised_fn=denoised_fn,
547
+ model_kwargs=model_kwargs,
548
+ )
549
+ if cond_fn is not None:
550
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
551
+
552
+ # Usually our model outputs epsilon, but we re-derive it
553
+ # in case we used x_start or x_prev prediction.
554
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
555
+
556
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
557
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
558
+ sigma = (
559
+ eta
560
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
561
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
562
+ )
563
+ # Equation 12.
564
+ noise = th.randn_like(x)
565
+ mean_pred = (
566
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
567
+ + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
568
+ )
569
+ nonzero_mask = (
570
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
571
+ ) # no noise when t == 0
572
+ sample = mean_pred + nonzero_mask * sigma * noise
573
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
574
+
575
+ def ddim_reverse_sample(
576
+ self,
577
+ model,
578
+ x,
579
+ t,
580
+ clip_denoised=True,
581
+ denoised_fn=None,
582
+ cond_fn=None,
583
+ model_kwargs=None,
584
+ eta=0.0,
585
+ ):
586
+ """
587
+ Sample x_{t+1} from the model using DDIM reverse ODE.
588
+ """
589
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
590
+ out = self.p_mean_variance(
591
+ model,
592
+ x,
593
+ t,
594
+ clip_denoised=clip_denoised,
595
+ denoised_fn=denoised_fn,
596
+ model_kwargs=model_kwargs,
597
+ )
598
+ if cond_fn is not None:
599
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
600
+ # Usually our model outputs epsilon, but we re-derive it
601
+ # in case we used x_start or x_prev prediction.
602
+ eps = (
603
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
604
+ - out["pred_xstart"]
605
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
606
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
607
+
608
+ # Equation 12. reversed
609
+ mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
610
+
611
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
612
+
613
+ def ddim_sample_loop(
614
+ self,
615
+ model,
616
+ shape,
617
+ noise=None,
618
+ clip_denoised=True,
619
+ denoised_fn=None,
620
+ cond_fn=None,
621
+ model_kwargs=None,
622
+ device=None,
623
+ progress=False,
624
+ eta=0.0,
625
+ ):
626
+ """
627
+ Generate samples from the model using DDIM.
628
+ Same usage as p_sample_loop().
629
+ """
630
+ final = None
631
+ for sample in self.ddim_sample_loop_progressive(
632
+ model,
633
+ shape,
634
+ noise=noise,
635
+ clip_denoised=clip_denoised,
636
+ denoised_fn=denoised_fn,
637
+ cond_fn=cond_fn,
638
+ model_kwargs=model_kwargs,
639
+ device=device,
640
+ progress=progress,
641
+ eta=eta,
642
+ ):
643
+ final = sample
644
+ return final["sample"]
645
+
646
+ def ddim_sample_loop_progressive(
647
+ self,
648
+ model,
649
+ shape,
650
+ noise=None,
651
+ clip_denoised=True,
652
+ denoised_fn=None,
653
+ cond_fn=None,
654
+ model_kwargs=None,
655
+ device=None,
656
+ progress=False,
657
+ eta=0.0,
658
+ ):
659
+ """
660
+ Use DDIM to sample from the model and yield intermediate samples from
661
+ each timestep of DDIM.
662
+ Same usage as p_sample_loop_progressive().
663
+ """
664
+ assert isinstance(shape, (tuple, list))
665
+ if noise is not None:
666
+ img = noise
667
+ else:
668
+ img = th.randn(*shape).cuda()
669
+ indices = list(range(self.num_timesteps))[::-1]
670
+
671
+ if progress:
672
+ # Lazy import so that we don't depend on tqdm.
673
+ from tqdm.auto import tqdm
674
+
675
+ indices = tqdm(indices)
676
+
677
+ for i in indices:
678
+ t = th.tensor([i] * shape[0]).cuda()
679
+ with th.no_grad():
680
+ out = self.ddim_sample(
681
+ model,
682
+ img,
683
+ t,
684
+ clip_denoised=clip_denoised,
685
+ denoised_fn=denoised_fn,
686
+ cond_fn=cond_fn,
687
+ model_kwargs=model_kwargs,
688
+ eta=eta,
689
+ )
690
+ yield out
691
+ img = out["sample"]
692
+
693
+ def _vb_terms_bpd(
694
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
695
+ ):
696
+ """
697
+ Get a term for the variational lower-bound.
698
+ The resulting units are bits (rather than nats, as one might expect).
699
+ This allows for comparison to other papers.
700
+ :return: a dict with the following keys:
701
+ - 'output': a shape [N] tensor of NLLs or KLs.
702
+ - 'pred_xstart': the x_0 predictions.
703
+ """
704
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
705
+ x_start=x_start, x_t=x_t, t=t
706
+ )
707
+ out = self.p_mean_variance(
708
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
709
+ )
710
+ kl = normal_kl(
711
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
712
+ )
713
+ kl = mean_flat(kl) / np.log(2.0)
714
+
715
+ decoder_nll = -discretized_gaussian_log_likelihood(
716
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
717
+ )
718
+ assert decoder_nll.shape == x_start.shape
719
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
720
+
721
+ # At the first timestep return the decoder NLL,
722
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
723
+ output = th.where((t == 0), decoder_nll, kl)
724
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
725
+
726
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
727
+ """
728
+ Compute training losses for a single timestep.
729
+ :param model: the model to evaluate loss on.
730
+ :param x_start: the [N x C x ...] tensor of inputs.
731
+ :param t: a batch of timestep indices.
732
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
733
+ pass to the model. This can be used for conditioning.
734
+ :param noise: if specified, the specific Gaussian noise to try to remove.
735
+ :return: a dict with the key "loss" containing a tensor of shape [N].
736
+ Some mean or variance settings may also have other keys.
737
+ """
738
+ if model_kwargs is None:
739
+ model_kwargs = {}
740
+ if noise is None:
741
+ noise = th.randn_like(x_start)
742
+ x_t = self.q_sample(x_start, t, noise=noise)
743
+
744
+ terms = {}
745
+
746
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
747
+ terms["loss"] = self._vb_terms_bpd(
748
+ model=model,
749
+ x_start=x_start,
750
+ x_t=x_t,
751
+ t=t,
752
+ clip_denoised=False,
753
+ model_kwargs=model_kwargs,
754
+ )["output"]
755
+ if self.loss_type == LossType.RESCALED_KL:
756
+ terms["loss"] *= self.num_timesteps
757
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
758
+ model_output = model(x_t, t, **model_kwargs)
759
+
760
+ if self.model_var_type in [
761
+ ModelVarType.LEARNED,
762
+ ModelVarType.LEARNED_RANGE,
763
+ ]:
764
+ B, C = x_t.shape[:2]
765
+ assert model_output.shape == (B, C * 2, *x_t.shape[2:])
766
+ model_output, model_var_values = th.split(model_output, C, dim=1)
767
+ # Learn the variance using the variational bound, but don't let
768
+ # it affect our mean prediction.
769
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
770
+ terms["vb"] = self._vb_terms_bpd(
771
+ model=lambda *args, r=frozen_out: r,
772
+ x_start=x_start,
773
+ x_t=x_t,
774
+ t=t,
775
+ clip_denoised=False,
776
+ )["output"]
777
+ if self.loss_type == LossType.RESCALED_MSE:
778
+ # Divide by 1000 for equivalence with initial implementation.
779
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
780
+ terms["vb"] *= self.num_timesteps / 1000.0
781
+
782
+ target = {
783
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
784
+ x_start=x_start, x_t=x_t, t=t
785
+ )[0],
786
+ ModelMeanType.START_X: x_start,
787
+ ModelMeanType.EPSILON: noise,
788
+ }[self.model_mean_type]
789
+ assert model_output.shape == target.shape == x_start.shape
790
+ terms["mse"] = mean_flat((target - model_output) ** 2)
791
+ if "vb" in terms:
792
+ terms["loss"] = terms["mse"] + terms["vb"]
793
+ else:
794
+ terms["loss"] = terms["mse"]
795
+ else:
796
+ raise NotImplementedError(self.loss_type)
797
+
798
+ return terms
799
+
800
+ def _prior_bpd(self, x_start):
801
+ """
802
+ Get the prior KL term for the variational lower-bound, measured in
803
+ bits-per-dim.
804
+ This term can't be optimized, as it only depends on the encoder.
805
+ :param x_start: the [N x C x ...] tensor of inputs.
806
+ :return: a batch of [N] KL values (in bits), one per batch element.
807
+ """
808
+ batch_size = x_start.shape[0]
809
+ t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
810
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
811
+ kl_prior = normal_kl(
812
+ mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
813
+ )
814
+ return mean_flat(kl_prior) / np.log(2.0)
815
+
816
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
817
+ """
818
+ Compute the entire variational lower-bound, measured in bits-per-dim,
819
+ as well as other related quantities.
820
+ :param model: the model to evaluate loss on.
821
+ :param x_start: the [N x C x ...] tensor of inputs.
822
+ :param clip_denoised: if True, clip denoised samples.
823
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
824
+ pass to the model. This can be used for conditioning.
825
+ :return: a dict containing the following keys:
826
+ - total_bpd: the total variational lower-bound, per batch element.
827
+ - prior_bpd: the prior term in the lower-bound.
828
+ - vb: an [N x T] tensor of terms in the lower-bound.
829
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
830
+ - mse: an [N x T] tensor of epsilon MSEs for each timestep.
831
+ """
832
+ device = x_start.device
833
+ batch_size = x_start.shape[0]
834
+
835
+ vb = []
836
+ xstart_mse = []
837
+ mse = []
838
+ for t in list(range(self.num_timesteps))[::-1]:
839
+ t_batch = th.tensor([t] * batch_size, device=device)
840
+ noise = th.randn_like(x_start)
841
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
842
+ # Calculate VLB term at the current timestep
843
+ with th.no_grad():
844
+ out = self._vb_terms_bpd(
845
+ model,
846
+ x_start=x_start,
847
+ x_t=x_t,
848
+ t=t_batch,
849
+ clip_denoised=clip_denoised,
850
+ model_kwargs=model_kwargs,
851
+ )
852
+ vb.append(out["output"])
853
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
854
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
855
+ mse.append(mean_flat((eps - noise) ** 2))
856
+
857
+ vb = th.stack(vb, dim=1)
858
+ xstart_mse = th.stack(xstart_mse, dim=1)
859
+ mse = th.stack(mse, dim=1)
860
+
861
+ prior_bpd = self._prior_bpd(x_start)
862
+ total_bpd = vb.sum(dim=1) + prior_bpd
863
+ return {
864
+ "total_bpd": total_bpd,
865
+ "prior_bpd": prior_bpd,
866
+ "vb": vb,
867
+ "xstart_mse": xstart_mse,
868
+ "mse": mse,
869
+ }
870
+
871
+
872
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
873
+ """
874
+ Extract values from a 1-D numpy array for a batch of indices.
875
+ :param arr: the 1-D numpy array.
876
+ :param timesteps: a tensor of indices into the array to extract.
877
+ :param broadcast_shape: a larger shape of K dimensions with the batch
878
+ dimension equal to the length of timesteps.
879
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
880
+ """
881
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
882
+ while len(res.shape) < len(broadcast_shape):
883
+ res = res[..., None]
884
+ return res + th.zeros(broadcast_shape, device=timesteps.device)
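
For reference, a minimal standalone sketch of the gather-and-broadcast pattern that _extract_into_tensor implements above; the linear beta schedule and tensor shapes below are illustrative assumptions, not values taken from this repo.

import numpy as np
import torch as th

def extract_into_tensor(arr, timesteps, broadcast_shape):
    # same logic as _extract_into_tensor above: gather one value per timestep, then broadcast
    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
    while len(res.shape) < len(broadcast_shape):
        res = res[..., None]
    return res + th.zeros(broadcast_shape, device=timesteps.device)

betas = np.linspace(1e-4, 0.02, 1000)            # toy linear schedule (assumption)
alphas_cumprod = np.cumprod(1.0 - betas)
x_start = th.randn(4, 16, 16, 16)                # toy [N, C, H, W] latent batch (assumption)
t = th.randint(0, 1000, (4,))
coef = extract_into_tensor(np.sqrt(alphas_cumprod), t, x_start.shape)
print(coef.shape)                                # torch.Size([4, 16, 16, 16])
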
src/models/mar/diffusion/respace.py ADDED
@@ -0,0 +1,129 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ import numpy as np
7
+ import torch as th
8
+
9
+ from .gaussian_diffusion import GaussianDiffusion
10
+
11
+
12
+ def space_timesteps(num_timesteps, section_counts):
13
+ """
14
+ Create a list of timesteps to use from an original diffusion process,
15
+ given the number of timesteps we want to take from equally-sized portions
16
+ of the original process.
17
+ For example, if there's 300 timesteps and the section counts are [10,15,20]
18
+ then the first 100 timesteps are strided to be 10 timesteps, the second 100
19
+ are strided to be 15 timesteps, and the final 100 are strided to be 20.
20
+ If the stride is a string starting with "ddim", then the fixed striding
21
+ from the DDIM paper is used, and only one section is allowed.
22
+ :param num_timesteps: the number of diffusion steps in the original
23
+ process to divide up.
24
+ :param section_counts: either a list of numbers, or a string containing
25
+ comma-separated numbers, indicating the step count
26
+ per section. As a special case, use "ddimN" where N
27
+ is a number of steps to use the striding from the
28
+ DDIM paper.
29
+ :return: a set of diffusion steps from the original process to use.
30
+ """
31
+ if isinstance(section_counts, str):
32
+ if section_counts.startswith("ddim"):
33
+ desired_count = int(section_counts[len("ddim") :])
34
+ for i in range(1, num_timesteps):
35
+ if len(range(0, num_timesteps, i)) == desired_count:
36
+ return set(range(0, num_timesteps, i))
37
+ raise ValueError(
38
+ f"cannot create exactly {num_timesteps} steps with an integer stride"
39
+ )
40
+ section_counts = [int(x) for x in section_counts.split(",")]
41
+ size_per = num_timesteps // len(section_counts)
42
+ extra = num_timesteps % len(section_counts)
43
+ start_idx = 0
44
+ all_steps = []
45
+ for i, section_count in enumerate(section_counts):
46
+ size = size_per + (1 if i < extra else 0)
47
+ if size < section_count:
48
+ raise ValueError(
49
+ f"cannot divide section of {size} steps into {section_count}"
50
+ )
51
+ if section_count <= 1:
52
+ frac_stride = 1
53
+ else:
54
+ frac_stride = (size - 1) / (section_count - 1)
55
+ cur_idx = 0.0
56
+ taken_steps = []
57
+ for _ in range(section_count):
58
+ taken_steps.append(start_idx + round(cur_idx))
59
+ cur_idx += frac_stride
60
+ all_steps += taken_steps
61
+ start_idx += size
62
+ return set(all_steps)
63
+
64
+
65
+ class SpacedDiffusion(GaussianDiffusion):
66
+ """
67
+ A diffusion process which can skip steps in a base diffusion process.
68
+ :param use_timesteps: a collection (sequence or set) of timesteps from the
69
+ original diffusion process to retain.
70
+ :param kwargs: the kwargs to create the base diffusion process.
71
+ """
72
+
73
+ def __init__(self, use_timesteps, **kwargs):
74
+ self.use_timesteps = set(use_timesteps)
75
+ self.timestep_map = []
76
+ self.original_num_steps = len(kwargs["betas"])
77
+
78
+ base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
79
+ last_alpha_cumprod = 1.0
80
+ new_betas = []
81
+ for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
82
+ if i in self.use_timesteps:
83
+ new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
84
+ last_alpha_cumprod = alpha_cumprod
85
+ self.timestep_map.append(i)
86
+ kwargs["betas"] = np.array(new_betas)
87
+ super().__init__(**kwargs)
88
+
89
+ def p_mean_variance(
90
+ self, model, *args, **kwargs
91
+ ): # pylint: disable=signature-differs
92
+ return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
93
+
94
+ def training_losses(
95
+ self, model, *args, **kwargs
96
+ ): # pylint: disable=signature-differs
97
+ return super().training_losses(self._wrap_model(model), *args, **kwargs)
98
+
99
+ def condition_mean(self, cond_fn, *args, **kwargs):
100
+ return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
101
+
102
+ def condition_score(self, cond_fn, *args, **kwargs):
103
+ return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
104
+
105
+ def _wrap_model(self, model):
106
+ if isinstance(model, _WrappedModel):
107
+ return model
108
+ return _WrappedModel(
109
+ model, self.timestep_map, self.original_num_steps
110
+ )
111
+
112
+ def _scale_timesteps(self, t):
113
+ # Scaling is done by the wrapped model.
114
+ return t
115
+
116
+
117
+ class _WrappedModel:
118
+ def __init__(self, model, timestep_map, original_num_steps):
119
+ self.model = model
120
+ self.timestep_map = timestep_map
121
+ # self.rescale_timesteps = rescale_timesteps
122
+ self.original_num_steps = original_num_steps
123
+
124
+ def __call__(self, x, ts, **kwargs):
125
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
126
+ new_ts = map_tensor[ts]
127
+ # if self.rescale_timesteps:
128
+ # new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
129
+ return self.model(x, new_ts, **kwargs)
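
For reference, a short usage sketch of space_timesteps defined above; the step counts are arbitrary examples, and the import path assumes this repo's package layout.

from src.models.mar.diffusion.respace import space_timesteps

# DDIM-style striding: keep 50 of 1000 original steps with a fixed integer stride
steps = space_timesteps(1000, "ddim50")
print(len(steps))              # 50
print(sorted(steps)[:5])       # [0, 20, 40, 60, 80]

# Two equally-sized sections of the 1000 steps, retaining 10 and 20 steps respectively
steps = space_timesteps(1000, "10,20")
print(len(steps))              # 30
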
src/models/mar/engine_mar.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ import src.models.mar.misc as misc
3
+ import torch_fidelity
4
+ import shutil
5
+ import cv2
6
+ import numpy as np
7
+ import os
8
+ import time
9
+
10
+
11
+ def torch_evaluate(model, args):
12
+ model.eval()
13
+ num_steps = args.num_images // (args.batch_size * misc.get_world_size()) + 1
14
+ save_folder = os.path.join(args.output_dir, "ariter{}-temp{}-{}cfg{}-image{}".format(
15
+ args.num_iter, args.temperature, args.cfg_schedule, args.cfg, args.num_images))
16
+
17
+ print("Save to:", save_folder)
18
+ if misc.get_rank() == 0:
19
+ if not os.path.exists(save_folder):
20
+ os.makedirs(save_folder)
21
+
22
+ class_num = args.class_num
23
+ assert args.num_images % class_num == 0 # number of images per class must be the same
24
+ class_label_gen_world = np.arange(0, class_num).repeat(args.num_images // class_num)
25
+ class_label_gen_world = np.hstack([class_label_gen_world, np.zeros(50000)])
26
+ world_size = misc.get_world_size()
27
+ local_rank = misc.get_rank()
28
+ used_time = 0
29
+ gen_img_cnt = 0
30
+
31
+ for i in range(num_steps):
32
+ print("Generation step {}/{}".format(i, num_steps))
33
+
34
+ labels_gen = class_label_gen_world[world_size * args.batch_size * i + local_rank * args.batch_size:
35
+ world_size * args.batch_size * i + (local_rank + 1) * args.batch_size]
36
+ labels_gen = torch.Tensor(labels_gen).long().cuda()
37
+
38
+ torch.cuda.synchronize()
39
+ start_time = time.time()
40
+
41
+ # generation
42
+ with torch.no_grad():
43
+ with torch.cuda.amp.autocast():
44
+ # sampled_images = model.sample_official(bsz=args.batch_size, num_iter=args.num_iter, cfg=args.cfg,
45
+ # cfg_schedule=args.cfg_schedule, labels=labels_gen,
46
+ # temperature=args.temperature)
47
+
48
+ # import pdb; pdb.set_trace()
49
+ if args.cfg != 1.0:
50
+ labels_gen = torch.cat([
51
+ labels_gen, torch.full_like(labels_gen, fill_value=-1)])
52
+ sampled_images = model.sample(labels_gen,
53
+ num_iter=args.num_iter, cfg=args.cfg, cfg_schedule=args.cfg_schedule,
54
+ temperature=args.temperature, progress=False)
55
+
56
+ # measure speed after the first generation batch
57
+ if i >= 1:
58
+ torch.cuda.synchronize()
59
+ used_time += time.time() - start_time
60
+ gen_img_cnt += args.batch_size
61
+ print("Generating {} images takes {:.5f} seconds, {:.5f} sec per image".format(gen_img_cnt, used_time, used_time / gen_img_cnt))
62
+
63
+ torch.distributed.barrier()
64
+ sampled_images = sampled_images.detach().cpu()
65
+ sampled_images = (sampled_images + 1) / 2
66
+
67
+ # distributed save
68
+ for b_id in range(sampled_images.size(0)):
69
+ img_id = i * sampled_images.size(0) * world_size + local_rank * sampled_images.size(0) + b_id
70
+ if img_id >= args.num_images:
71
+ break
72
+ gen_img = np.round(np.clip(sampled_images[b_id].numpy().transpose([1, 2, 0]) * 255, 0, 255))
73
+ gen_img = gen_img.astype(np.uint8)[:, :, ::-1]
74
+ cv2.imwrite(os.path.join(save_folder, '{}.png'.format(str(img_id).zfill(5))), gen_img)
75
+
76
+ torch.distributed.barrier()
77
+ time.sleep(10)
78
+ if misc.get_rank() == 0:
79
+ input2 = None
80
+ fid_statistics_file = 'fid_stats/adm_in256_stats.npz'
81
+ metrics_dict = torch_fidelity.calculate_metrics(
82
+ input1=save_folder,
83
+ input2=input2,
84
+ fid_statistics_file=fid_statistics_file,
85
+ cuda=True,
86
+ isc=True,
87
+ fid=True,
88
+ kid=False,
89
+ prc=False,
90
+ verbose=True,
91
+ )
92
+ fid = metrics_dict['frechet_inception_distance']
93
+ inception_score = metrics_dict['inception_score_mean']
94
+ print("FID: {:.4f}, Inception Score: {:.4f}".format(fid, inception_score))
95
+ # remove temporal saving folder
96
+ shutil.rmtree(save_folder)
97
+
98
+ torch.distributed.barrier()
99
+ time.sleep(10)
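
For reference, a toy sketch of the per-rank label slicing used in torch_evaluate above; world_size, batch_size, num_images, and class_num here are illustrative assumptions, and the small zero tail mirrors the np.zeros(50000) padding that keeps the final, partially-filled step in range.

import numpy as np

world_size, batch_size, num_images, class_num = 2, 4, 16, 8
labels_world = np.arange(class_num).repeat(num_images // class_num)   # two images per class
labels_world = np.hstack([labels_world, np.zeros(50, dtype=int)])     # tail padding for the last step

num_steps = num_images // (batch_size * world_size) + 1
for step in range(num_steps):
    for rank in range(world_size):
        lo = world_size * batch_size * step + rank * batch_size
        hi = world_size * batch_size * step + (rank + 1) * batch_size
        # each rank sees a disjoint slice per step; the last step reads the padded tail,
        # which is why the saving loop also checks img_id against num_images
        print(step, rank, labels_world[lo:hi])
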
src/models/mar/mar.py ADDED
@@ -0,0 +1,477 @@
1
+ from functools import partial
2
+
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ import scipy.stats as stats
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from einops import rearrange
11
+ from torch.utils.checkpoint import checkpoint
12
+ from timm.models.vision_transformer import Block
13
+
14
+ from .diffloss import DiffLoss
15
+
16
+
17
+ def mask_by_order(mask_len, order, bsz, seq_len):
18
+ masking = torch.zeros(bsz, seq_len).to(order.device)
19
+ masking = torch.scatter(masking, dim=-1, index=order[:, :mask_len.long()],
20
+ src=torch.ones(bsz, seq_len).to(order.device)).bool()
21
+ return masking
22
+
23
+
24
+ class MAR(nn.Module):
25
+ """ Masked Autoencoder with VisionTransformer backbone
26
+ """
27
+ def __init__(self, img_size=256, vae_stride=16, patch_size=1,
28
+ encoder_embed_dim=1024, encoder_depth=16, encoder_num_heads=16,
29
+ decoder_embed_dim=1024, decoder_depth=16, decoder_num_heads=16,
30
+ mlp_ratio=4., norm_layer=nn.LayerNorm,
31
+ vae_embed_dim=16,
32
+ mask_ratio_min=0.7,
33
+ label_drop_prob=0.1,
34
+ class_num=1000,
35
+ attn_dropout=0.1,
36
+ proj_dropout=0.1,
37
+ buffer_size=64,
38
+ diffloss_d=3,
39
+ diffloss_w=1024,
40
+ num_sampling_steps='100',
41
+ diffusion_batch_mul=4,
42
+ grad_checkpointing=False,
43
+ ):
44
+ super().__init__()
45
+
46
+ # --------------------------------------------------------------------------
47
+ # VAE and patchify specifics
48
+ self.vae_embed_dim = vae_embed_dim
49
+
50
+ self.img_size = img_size
51
+ self.vae_stride = vae_stride
52
+ self.patch_size = patch_size
53
+ self.seq_h = self.seq_w = img_size // vae_stride // patch_size
54
+ self.seq_len = self.seq_h * self.seq_w
55
+ self.token_embed_dim = vae_embed_dim * patch_size**2
56
+ self.grad_checkpointing = grad_checkpointing
57
+
58
+ # --------------------------------------------------------------------------
59
+ # Class Embedding
60
+ self.num_classes = class_num
61
+ self.class_emb = nn.Embedding(class_num, encoder_embed_dim)
62
+ self.label_drop_prob = label_drop_prob
63
+ # Fake class embedding for CFG's unconditional generation
64
+ self.fake_latent = nn.Parameter(torch.zeros(1, encoder_embed_dim))
65
+
66
+ # --------------------------------------------------------------------------
67
+ # MAR variant masking ratio, a left-half truncated Gaussian centered at 100% masking ratio with std 0.25
68
+ self.mask_ratio_generator = stats.truncnorm((mask_ratio_min - 1.0) / 0.25, 0, loc=1.0, scale=0.25)
69
+
70
+ # --------------------------------------------------------------------------
71
+ # MAR encoder specifics
72
+ self.encoder_embed_dim = encoder_embed_dim
73
+ self.z_proj = nn.Linear(self.token_embed_dim, encoder_embed_dim, bias=True)
74
+ self.z_proj_ln = nn.LayerNorm(encoder_embed_dim, eps=1e-6)
75
+ self.buffer_size = buffer_size
76
+ self.encoder_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len + self.buffer_size, encoder_embed_dim))
77
+
78
+ self.encoder_blocks = nn.ModuleList([
79
+ Block(encoder_embed_dim, encoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
80
+ proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(encoder_depth)])
81
+ self.encoder_norm = norm_layer(encoder_embed_dim)
82
+
83
+ # --------------------------------------------------------------------------
84
+ # MAR decoder specifics
85
+ self.decoder_embed_dim = decoder_embed_dim
86
+ self.decoder_embed = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=True)
87
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
88
+ self.decoder_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len + self.buffer_size, decoder_embed_dim))
89
+
90
+ self.decoder_blocks = nn.ModuleList([
91
+ Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
92
+ proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(decoder_depth)])
93
+
94
+ self.decoder_norm = norm_layer(decoder_embed_dim)
95
+ self.diffusion_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len, decoder_embed_dim))
96
+
97
+ self.initialize_weights()
98
+
99
+ # --------------------------------------------------------------------------
100
+ # Diffusion Loss
101
+ self.diffloss = DiffLoss(
102
+ target_channels=self.token_embed_dim,
103
+ z_channels=decoder_embed_dim,
104
+ width=diffloss_w,
105
+ depth=diffloss_d,
106
+ num_sampling_steps=num_sampling_steps,
107
+ grad_checkpointing=self.grad_checkpointing
108
+ )
109
+ self.diffusion_batch_mul = diffusion_batch_mul
110
+
111
+ def get_encoder_pos_embed(self, h, w):
112
+ if h == self.seq_h and w == self.seq_w:
113
+ return self.encoder_pos_embed_learned
114
+ buffer_pe, image_pe = self.encoder_pos_embed_learned.split(
115
+ [self.buffer_size, self.seq_len], dim=1)
116
+ image_pe = rearrange(image_pe, 'b (h w) c -> b c h w',
117
+ h=self.seq_h, w=self.seq_w)
118
+ image_pe = F.interpolate(image_pe, size=(h, w), mode='bilinear')
119
+ image_pe = rearrange(image_pe, 'b c h w -> b (h w) c')
120
+
121
+ return torch.cat([buffer_pe, image_pe], dim=1)
122
+
123
+ def get_decoder_pos_embed(self, h, w):
124
+ if h == self.seq_h and w == self.seq_w:
125
+ return self.decoder_pos_embed_learned
126
+ buffer_pe, image_pe = self.decoder_pos_embed_learned.split(
127
+ [self.buffer_size, self.seq_len], dim=1)
128
+ image_pe = rearrange(image_pe, 'b (h w) c -> b c h w',
129
+ h=self.seq_h, w=self.seq_w)
130
+ image_pe = F.interpolate(image_pe, size=(h, w), mode='bilinear')
131
+ image_pe = rearrange(image_pe, 'b c h w -> b (h w) c')
132
+
133
+ return torch.cat([buffer_pe, image_pe], dim=1)
134
+
135
+ def get_diffusion_pos_embed(self, h, w):
136
+ if h == self.seq_h and w == self.seq_w:
137
+ return self.diffusion_pos_embed_learned
138
+ image_pe = self.diffusion_pos_embed_learned
139
+ image_pe = rearrange(image_pe, 'b (h w) c -> b c h w',
140
+ h=self.seq_h, w=self.seq_w)
141
+ image_pe = F.interpolate(image_pe, size=(h, w), mode='bilinear')
142
+ image_pe = rearrange(image_pe, 'b c h w -> b (h w) c')
143
+
144
+ return image_pe
145
+
146
+ def initialize_weights(self):
147
+ # parameters
148
+ torch.nn.init.normal_(self.class_emb.weight, std=.02)
149
+ torch.nn.init.normal_(self.fake_latent, std=.02)
150
+ torch.nn.init.normal_(self.mask_token, std=.02)
151
+ torch.nn.init.normal_(self.encoder_pos_embed_learned, std=.02)
152
+ torch.nn.init.normal_(self.decoder_pos_embed_learned, std=.02)
153
+ torch.nn.init.normal_(self.diffusion_pos_embed_learned, std=.02)
154
+
155
+ # initialize nn.Linear and nn.LayerNorm
156
+ self.apply(self._init_weights)
157
+
158
+ def _init_weights(self, m):
159
+ if isinstance(m, nn.Linear):
160
+ # we use xavier_uniform following official JAX ViT:
161
+ torch.nn.init.xavier_uniform_(m.weight)
162
+ if isinstance(m, nn.Linear) and m.bias is not None:
163
+ nn.init.constant_(m.bias, 0)
164
+ elif isinstance(m, nn.LayerNorm):
165
+ if m.bias is not None:
166
+ nn.init.constant_(m.bias, 0)
167
+ if m.weight is not None:
168
+ nn.init.constant_(m.weight, 1.0)
169
+
170
+ @property
171
+ def device(self):
172
+ return self.fake_latent.data.device
173
+
174
+ @property
175
+ def dtype(self):
176
+ return self.fake_latent.data.dtype
177
+
178
+ def patchify(self, x):
179
+ bsz, c, h, w = x.shape
180
+ p = self.patch_size
181
+ h_, w_ = h // p, w // p
182
+
183
+ x = x.reshape(bsz, c, h_, p, w_, p)
184
+ x = torch.einsum('nchpwq->nhwcpq', x)
185
+ x = x.reshape(bsz, h_ * w_, c * p ** 2)
186
+ return x # [n, l, d]
187
+
188
+ def unpatchify(self, x):
189
+ bsz = x.shape[0]
190
+ p = self.patch_size
191
+ c = self.vae_embed_dim
192
+ h_, w_ = self.seq_h, self.seq_w
193
+
194
+ x = x.reshape(bsz, h_, w_, c, p, p)
195
+ x = torch.einsum('nhwcpq->nchpwq', x)
196
+ x = x.reshape(bsz, c, h_ * p, w_ * p)
197
+ return x # [n, c, h, w]
198
+
199
+ def sample_orders(self, bsz, seq_len=None):
200
+ if seq_len is None:
201
+ seq_len = self.seq_len
202
+ # generate a batch of random generation orders
203
+ orders = []
204
+ for _ in range(bsz):
205
+ order = np.array(list(range(seq_len)))
206
+ np.random.shuffle(order)
207
+ orders.append(order)
208
+ orders = torch.Tensor(np.array(orders)).to(self.device).long()
209
+ return orders
210
+
211
+ def random_masking(self, x, orders):
212
+ # generate token mask
213
+ bsz, seq_len, embed_dim = x.shape
214
+ assert seq_len == orders.shape[1]
215
+ mask_rate = self.mask_ratio_generator.rvs(1)[0]
216
+ num_masked_tokens = int(np.ceil(seq_len * mask_rate))
217
+ mask = torch.zeros(bsz, seq_len, device=x.device)
218
+ mask = torch.scatter(mask, dim=-1, index=orders[:, :num_masked_tokens],
219
+ src=torch.ones(bsz, seq_len, device=x.device))
220
+ return mask
221
+
222
+ def forward_mae_encoder(self, x, mask, class_embedding, image_shape=None):
223
+ x = x.to(self.dtype)
224
+ x = self.z_proj(x)
225
+ bsz, seq_len, embed_dim = x.shape
226
+
227
+ # concat buffer
228
+ x = torch.cat([x.new_zeros(bsz, self.buffer_size, embed_dim), x], dim=1)
229
+ mask_with_buffer = torch.cat([mask.new_zeros(x.size(0), self.buffer_size), mask], dim=1)
230
+
231
+ # random drop class embedding during training
232
+ # if self.training:
233
+ # drop_latent_mask = torch.rand(bsz) < self.label_drop_prob
234
+ # drop_latent_mask = drop_latent_mask.unsqueeze(-1).to(self.device).to(x.dtype)
235
+ # class_embedding = drop_latent_mask * self.fake_latent + (1 - drop_latent_mask) * class_embedding
236
+
237
+ x[:, :self.buffer_size] = class_embedding.view(bsz, -1, embed_dim)
238
+
239
+ # encoder position embedding
240
+ # x = x + self.encoder_pos_embed_learned
241
+ if image_shape is None:
242
+ x = x + self.encoder_pos_embed_learned
243
+ else:
244
+ h, w = image_shape
245
+ assert h * w == seq_len
246
+ x = x + self.get_encoder_pos_embed(h=h, w=w)
247
+ # import pdb; pdb.set_trace()
248
+ x = self.z_proj_ln(x)
249
+
250
+ # dropping
251
+ x = x[(1-mask_with_buffer).nonzero(as_tuple=True)].reshape(bsz, -1, embed_dim)
252
+
253
+ # apply Transformer blocks
254
+ if self.grad_checkpointing and not torch.jit.is_scripting():
255
+ for block in self.encoder_blocks:
256
+ x = checkpoint(block, x,
257
+ use_reentrant=False
258
+ )
259
+ else:
260
+ for block in self.encoder_blocks:
261
+ x = block(x)
262
+ x = self.encoder_norm(x)
263
+
264
+ return x
265
+
266
+ def forward_mae_decoder(self, x, mask, image_shape=None, x_con=None):
267
+ bsz, seq_len = mask.shape
268
+
269
+ x = self.decoder_embed(x)
270
+ mask_with_buffer = torch.cat([torch.zeros(x.size(0), self.buffer_size, device=x.device), mask], dim=1)
271
+
272
+ # pad mask tokens
273
+ mask_tokens = self.mask_token.repeat(mask_with_buffer.shape[0], mask_with_buffer.shape[1], 1).to(x.dtype)
274
+
275
+ if x_con is not None:
276
+ x_after_pad = self.decoder_embed(x_con)
277
+ else:
278
+ x_after_pad = mask_tokens.clone()
279
+ x_after_pad[(1 - mask_with_buffer).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
280
+
281
+ # decoder position embedding
282
+ # x = x_after_pad + self.decoder_pos_embed_learned
283
+ if image_shape is None:
284
+ x = x_after_pad + self.decoder_pos_embed_learned
285
+ else:
286
+ h, w = image_shape
287
+ assert h * w == seq_len
288
+ x = x_after_pad + self.get_decoder_pos_embed(h=h, w=w)
289
+
290
+ # apply Transformer blocks
291
+ if self.grad_checkpointing and not torch.jit.is_scripting():
292
+ for block in self.decoder_blocks:
293
+ x = checkpoint(block, x,
294
+ # use_reentrant=False
295
+ )
296
+ else:
297
+ for block in self.decoder_blocks:
298
+ x = block(x)
299
+ x = self.decoder_norm(x)
300
+
301
+ x = x[:, self.buffer_size:]
302
+ # x = x + self.diffusion_pos_embed_learned
303
+ if image_shape is None:
304
+ x = x + self.diffusion_pos_embed_learned
305
+ else:
306
+ h, w = image_shape
307
+ assert h * w == seq_len
308
+ x = x + self.get_diffusion_pos_embed(h=h, w=w)
309
+ return x
310
+
311
+ def mae_decoder_prepare(self, x, mask):
312
+ x = self.decoder_embed(x)
313
+ mask_with_buffer = torch.cat([torch.zeros(x.size(0), self.buffer_size, device=x.device), mask], dim=1)
314
+
315
+ # pad mask tokens
316
+ mask_tokens = self.mask_token.repeat(mask_with_buffer.shape[0], mask_with_buffer.shape[1], 1).to(x.dtype)
317
+ x_after_pad = mask_tokens.clone()
318
+ x_after_pad[(1 - mask_with_buffer).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
319
+
320
+ # decoder position embedding
321
+ x = x_after_pad + self.decoder_pos_embed_learned
322
+
323
+ return x
324
+
325
+ def mae_decoder_forward(self, x):
326
+ # apply Transformer blocks
327
+ if self.grad_checkpointing and not torch.jit.is_scripting():
328
+ for block in self.decoder_blocks:
329
+ x = checkpoint(block, x,
330
+ # use_reentrant=False
331
+ )
332
+ else:
333
+ for block in self.decoder_blocks:
334
+ x = block(x)
335
+ x = self.decoder_norm(x)
336
+
337
+ x = x[:, self.buffer_size:]
338
+ x = x + self.diffusion_pos_embed_learned
339
+ return x
340
+
341
+ def forward_loss(self, z, target, mask):
342
+ bsz, seq_len, _ = target.shape
343
+ target = target.reshape(bsz * seq_len, -1).repeat(self.diffusion_batch_mul, 1)
344
+ z = z.reshape(bsz*seq_len, -1).repeat(self.diffusion_batch_mul, 1)
345
+ mask = mask.reshape(bsz*seq_len).repeat(self.diffusion_batch_mul)
346
+ loss = self.diffloss(z=z, target=target, mask=mask)
347
+ return loss
348
+
349
+ def forward(self, imgs, labels):
350
+
351
+ # class embed
352
+ class_embedding = self.class_emb(labels)
353
+
354
+ # patchify and mask (drop) tokens
355
+ x = self.patchify(imgs)
356
+ gt_latents = x.clone().detach()
357
+ orders = self.sample_orders(bsz=x.size(0))
358
+ mask = self.random_masking(x, orders)
359
+
360
+ # mae encoder
361
+ x = self.forward_mae_encoder(x, mask, class_embedding)
362
+
363
+ # mae decoder
364
+ z = self.forward_mae_decoder(x, mask)
365
+
366
+ # diffloss
367
+ loss = self.forward_loss(z=z, target=gt_latents, mask=mask)
368
+
369
+ return loss
370
+
371
+ def sample_tokens(self, bsz, num_iter=64, cfg=1.0, cfg_schedule="linear", labels=None, temperature=1.0, progress=False):
372
+ # import pdb; pdb.set_trace()
373
+ # init and sample generation orders
374
+ mask = torch.ones(bsz, self.seq_len).to(self.device)
375
+ tokens = torch.zeros(bsz, self.seq_len, self.token_embed_dim).to(self.device)
376
+ orders = self.sample_orders(bsz)
377
+
378
+ indices = list(range(num_iter))
379
+ if progress:
380
+ indices = tqdm(indices)
381
+ # generate latents
382
+ for step in indices:
383
+ cur_tokens = tokens.clone()
384
+
385
+ # class embedding and CFG
386
+ if labels is not None:
387
+ class_embedding = self.class_emb(labels)
388
+ else:
389
+ class_embedding = self.fake_latent.repeat(bsz, 1)
390
+ if not cfg == 1.0:
391
+ tokens = torch.cat([tokens, tokens], dim=0)
392
+ class_embedding = torch.cat([class_embedding, self.fake_latent.repeat(bsz, 1)], dim=0)
393
+ mask = torch.cat([mask, mask], dim=0)
394
+
395
+ # mae encoder
396
+ x = self.forward_mae_encoder(tokens, mask.to(self.dtype), class_embedding)
397
+
398
+ # mae decoder
399
+ z = self.forward_mae_decoder(x, mask.to(self.dtype))
400
+ # import pdb; pdb.set_trace()
401
+
402
+ # mask ratio for the next round, following MaskGIT and MAGE.
403
+ mask_ratio = np.cos(math.pi / 2. * (step + 1) / num_iter)
404
+ mask_len = torch.Tensor([np.floor(self.seq_len * mask_ratio)]).to(self.device)
405
+ # import pdb; pdb.set_trace()
406
+ # masks out at least one for the next iteration
407
+ mask_len = torch.maximum(torch.Tensor([1]).to(self.device),
408
+ torch.minimum(torch.sum(mask, dim=-1, keepdims=True) - 1, mask_len))
409
+ # import pdb; pdb.set_trace()
410
+ # get masking for next iteration and locations to be predicted in this iteration
411
+ mask_next = mask_by_order(mask_len[0], orders, bsz, self.seq_len)
412
+ # import pdb; pdb.set_trace()
413
+ if step >= num_iter - 1:
414
+ mask_to_pred = mask[:bsz].bool()
415
+ else:
416
+ mask_to_pred = torch.logical_xor(mask[:bsz].bool(), mask_next.bool())
417
+ mask = mask_next
418
+ if not cfg == 1.0:
419
+ mask_to_pred = torch.cat([mask_to_pred, mask_to_pred], dim=0)
420
+ # import pdb; pdb.set_trace()
421
+ # sample token latents for this step
422
+ z = z[mask_to_pred.nonzero(as_tuple=True)]
423
+ # cfg schedule follow Muse
424
+ if cfg_schedule == "linear":
425
+ cfg_iter = 1 + (cfg - 1) * (self.seq_len - mask_len[0]) / self.seq_len
426
+ elif cfg_schedule == "constant":
427
+ cfg_iter = cfg
428
+ else:
429
+ raise NotImplementedError
430
+ sampled_token_latent = self.diffloss.sample(z, temperature, cfg_iter)
431
+ if not cfg == 1.0:
432
+ sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0) # Remove null class samples
433
+ mask_to_pred, _ = mask_to_pred.chunk(2, dim=0)
434
+ # import pdb; pdb.set_trace()
435
+ cur_tokens[mask_to_pred.nonzero(as_tuple=True)] = sampled_token_latent
436
+ tokens = cur_tokens.clone()
437
+
438
+ # unpatchify
439
+ tokens = self.unpatchify(tokens)
440
+ return tokens
441
+
442
+ def gradient_checkpointing_enable(self):
443
+ self.grad_checkpointing = True
444
+
445
+ def gradient_checkpointing_disable(self):
446
+ self.grad_checkpointing = False
447
+
448
+
449
+ def mar_base(**kwargs):
450
+ model = MAR(
451
+ encoder_embed_dim=768, encoder_depth=12, encoder_num_heads=12,
452
+ decoder_embed_dim=768, decoder_depth=12, decoder_num_heads=12,
453
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
454
+ return model
455
+
456
+
457
+ def mar_large(**kwargs):
458
+ model = MAR(
459
+ encoder_embed_dim=1024, encoder_depth=16, encoder_num_heads=16,
460
+ decoder_embed_dim=1024, decoder_depth=16, decoder_num_heads=16,
461
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
462
+ return model
463
+
464
+
465
+ def mar_huge(**kwargs):
466
+ model = MAR(
467
+ encoder_embed_dim=1280, encoder_depth=20, encoder_num_heads=16,
468
+ decoder_embed_dim=1280, decoder_depth=20, decoder_num_heads=16,
469
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
470
+ return model
471
+
472
+ def mar_max(**kwargs):
473
+ model = MAR(
474
+ encoder_embed_dim=1536, encoder_depth=24, encoder_num_heads=16,
475
+ decoder_embed_dim=1536, decoder_depth=24, decoder_num_heads=16,
476
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
477
+ return model
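
For reference, a small sketch of the MaskGIT-style cosine schedule that sample_tokens above uses to decide how many tokens remain masked after each iteration; seq_len and num_iter are illustrative assumptions.

import math
import numpy as np

seq_len, num_iter = 256, 8
remaining = seq_len
for step in range(num_iter):
    mask_ratio = np.cos(math.pi / 2.0 * (step + 1) / num_iter)
    mask_len = int(np.floor(seq_len * mask_ratio))
    mask_len = max(1, min(remaining - 1, mask_len))      # always leave at least one token to predict
    # tokens predicted this step; the final step predicts everything still masked
    predicted = remaining if step == num_iter - 1 else remaining - mask_len
    print(step, mask_len, predicted)
    remaining = mask_len
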
src/models/mar/misc.py ADDED
@@ -0,0 +1,340 @@
1
+ import builtins
2
+ import datetime
3
+ import os
4
+ import time
5
+ from collections import defaultdict, deque
6
+ from pathlib import Path
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ TORCH_MAJOR = int(torch.__version__.split('.')[0])
11
+ TORCH_MINOR = int(torch.__version__.split('.')[1])
12
+
13
+ if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
14
+ from torch._six import inf
15
+ else:
16
+ from torch import inf
17
+ import copy
18
+
19
+
20
+ class SmoothedValue(object):
21
+ """Track a series of values and provide access to smoothed values over a
22
+ window or the global series average.
23
+ """
24
+
25
+ def __init__(self, window_size=20, fmt=None):
26
+ if fmt is None:
27
+ fmt = "{median:.4f} ({global_avg:.4f})"
28
+ self.deque = deque(maxlen=window_size)
29
+ self.total = 0.0
30
+ self.count = 0
31
+ self.fmt = fmt
32
+
33
+ def update(self, value, n=1):
34
+ self.deque.append(value)
35
+ self.count += n
36
+ self.total += value * n
37
+
38
+ def synchronize_between_processes(self):
39
+ """
40
+ Warning: does not synchronize the deque!
41
+ """
42
+ if not is_dist_avail_and_initialized():
43
+ return
44
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
45
+ dist.barrier()
46
+ dist.all_reduce(t)
47
+ t = t.tolist()
48
+ self.count = int(t[0])
49
+ self.total = t[1]
50
+
51
+ @property
52
+ def median(self):
53
+ d = torch.tensor(list(self.deque))
54
+ return d.median().item()
55
+
56
+ @property
57
+ def avg(self):
58
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
59
+ return d.mean().item()
60
+
61
+ @property
62
+ def global_avg(self):
63
+ return self.total / self.count
64
+
65
+ @property
66
+ def max(self):
67
+ return max(self.deque)
68
+
69
+ @property
70
+ def value(self):
71
+ return self.deque[-1]
72
+
73
+ def __str__(self):
74
+ return self.fmt.format(
75
+ median=self.median,
76
+ avg=self.avg,
77
+ global_avg=self.global_avg,
78
+ max=self.max,
79
+ value=self.value)
80
+
81
+
82
+ class MetricLogger(object):
83
+ def __init__(self, delimiter="\t"):
84
+ self.meters = defaultdict(SmoothedValue)
85
+ self.delimiter = delimiter
86
+
87
+ def update(self, **kwargs):
88
+ for k, v in kwargs.items():
89
+ if v is None:
90
+ continue
91
+ if isinstance(v, torch.Tensor):
92
+ v = v.item()
93
+ assert isinstance(v, (float, int))
94
+ self.meters[k].update(v)
95
+
96
+ def __getattr__(self, attr):
97
+ if attr in self.meters:
98
+ return self.meters[attr]
99
+ if attr in self.__dict__:
100
+ return self.__dict__[attr]
101
+ raise AttributeError("'{}' object has no attribute '{}'".format(
102
+ type(self).__name__, attr))
103
+
104
+ def __str__(self):
105
+ loss_str = []
106
+ for name, meter in self.meters.items():
107
+ loss_str.append(
108
+ "{}: {}".format(name, str(meter))
109
+ )
110
+ return self.delimiter.join(loss_str)
111
+
112
+ def synchronize_between_processes(self):
113
+ for meter in self.meters.values():
114
+ meter.synchronize_between_processes()
115
+
116
+ def add_meter(self, name, meter):
117
+ self.meters[name] = meter
118
+
119
+ def log_every(self, iterable, print_freq, header=None):
120
+ i = 0
121
+ if not header:
122
+ header = ''
123
+ start_time = time.time()
124
+ end = time.time()
125
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
126
+ data_time = SmoothedValue(fmt='{avg:.4f}')
127
+ space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
128
+ log_msg = [
129
+ header,
130
+ '[{0' + space_fmt + '}/{1}]',
131
+ 'eta: {eta}',
132
+ '{meters}',
133
+ 'time: {time}',
134
+ 'data: {data}'
135
+ ]
136
+ if torch.cuda.is_available():
137
+ log_msg.append('max mem: {memory:.0f}')
138
+ log_msg = self.delimiter.join(log_msg)
139
+ MB = 1024.0 * 1024.0
140
+ for obj in iterable:
141
+ data_time.update(time.time() - end)
142
+ yield obj
143
+ iter_time.update(time.time() - end)
144
+ if i % print_freq == 0 or i == len(iterable) - 1:
145
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
146
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
147
+ if torch.cuda.is_available():
148
+ print(log_msg.format(
149
+ i, len(iterable), eta=eta_string,
150
+ meters=str(self),
151
+ time=str(iter_time), data=str(data_time),
152
+ memory=torch.cuda.max_memory_allocated() / MB))
153
+ else:
154
+ print(log_msg.format(
155
+ i, len(iterable), eta=eta_string,
156
+ meters=str(self),
157
+ time=str(iter_time), data=str(data_time)))
158
+ i += 1
159
+ end = time.time()
160
+ total_time = time.time() - start_time
161
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
162
+ print('{} Total time: {} ({:.4f} s / it)'.format(
163
+ header, total_time_str, total_time / len(iterable)))
164
+
165
+
166
+ def setup_for_distributed(is_master):
167
+ """
168
+ This function disables printing when not in master process
169
+ """
170
+ builtin_print = builtins.print
171
+
172
+ def print(*args, **kwargs):
173
+ force = kwargs.pop('force', False)
174
+ force = force or (get_world_size() > 8)
175
+ if is_master or force:
176
+ now = datetime.datetime.now().time()
177
+ builtin_print('[{}] '.format(now), end='') # print with time stamp
178
+ builtin_print(*args, **kwargs)
179
+
180
+ builtins.print = print
181
+
182
+
183
+ def is_dist_avail_and_initialized():
184
+ if not dist.is_available():
185
+ return False
186
+ if not dist.is_initialized():
187
+ return False
188
+ return True
189
+
190
+
191
+ def get_world_size():
192
+ if not is_dist_avail_and_initialized():
193
+ return 1
194
+ return dist.get_world_size()
195
+
196
+
197
+ def get_rank():
198
+ if not is_dist_avail_and_initialized():
199
+ return 0
200
+ return dist.get_rank()
201
+
202
+
203
+ def is_main_process():
204
+ return get_rank() == 0
205
+
206
+
207
+ def save_on_master(*args, **kwargs):
208
+ if is_main_process():
209
+ torch.save(*args, **kwargs)
210
+
211
+
212
+ def init_distributed_mode(args):
213
+ if args.dist_on_itp:
214
+ args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
215
+ args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
216
+ args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
217
+ args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
218
+ os.environ['LOCAL_RANK'] = str(args.gpu)
219
+ os.environ['RANK'] = str(args.rank)
220
+ os.environ['WORLD_SIZE'] = str(args.world_size)
221
+ # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
222
+ elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
223
+ args.rank = int(os.environ["RANK"])
224
+ args.world_size = int(os.environ['WORLD_SIZE'])
225
+ args.gpu = int(os.environ['LOCAL_RANK'])
226
+ elif 'SLURM_PROCID' in os.environ:
227
+ args.rank = int(os.environ['SLURM_PROCID'])
228
+ args.gpu = args.rank % torch.cuda.device_count()
229
+ else:
230
+ print('Not using distributed mode')
231
+ setup_for_distributed(is_master=True) # hack
232
+ args.distributed = False
233
+ return
234
+
235
+ args.distributed = True
236
+
237
+ torch.cuda.set_device(args.gpu)
238
+ args.dist_backend = 'nccl'
239
+ print('| distributed init (rank {}): {}, gpu {}'.format(
240
+ args.rank, args.dist_url, args.gpu), flush=True)
241
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
242
+ world_size=args.world_size, rank=args.rank)
243
+ torch.distributed.barrier()
244
+ setup_for_distributed(args.rank == 0)
245
+
246
+
247
+ class NativeScalerWithGradNormCount:
248
+ state_dict_key = "amp_scaler"
249
+
250
+ def __init__(self):
251
+ self._scaler = torch.cuda.amp.GradScaler()
252
+
253
+ def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
254
+ self._scaler.scale(loss).backward(create_graph=create_graph)
255
+ if update_grad:
256
+ if clip_grad is not None:
257
+ assert parameters is not None
258
+ self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place
259
+ norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
260
+ else:
261
+ self._scaler.unscale_(optimizer)
262
+ norm = get_grad_norm_(parameters)
263
+ self._scaler.step(optimizer)
264
+ self._scaler.update()
265
+ else:
266
+ norm = None
267
+ return norm
268
+
269
+ def state_dict(self):
270
+ return self._scaler.state_dict()
271
+
272
+ def load_state_dict(self, state_dict):
273
+ self._scaler.load_state_dict(state_dict)
274
+
275
+
276
+ def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
277
+ if isinstance(parameters, torch.Tensor):
278
+ parameters = [parameters]
279
+ parameters = [p for p in parameters if p.grad is not None]
280
+ norm_type = float(norm_type)
281
+ if len(parameters) == 0:
282
+ return torch.tensor(0.)
283
+ device = parameters[0].grad.device
284
+ if norm_type == inf:
285
+ total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
286
+ else:
287
+ total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
288
+ return total_norm
289
+
290
+
291
+ def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
292
+ decay = []
293
+ no_decay = []
294
+ for name, param in model.named_parameters():
295
+ if not param.requires_grad:
296
+ continue # frozen weights
297
+ if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list or 'diffloss' in name:
298
+ no_decay.append(param) # no weight decay on bias, norm and diffloss
299
+ else:
300
+ decay.append(param)
301
+ return [
302
+ {'params': no_decay, 'weight_decay': 0.},
303
+ {'params': decay, 'weight_decay': weight_decay}]
304
+
305
+
306
+ def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, ema_params=None, epoch_name=None):
307
+ if epoch_name is None:
308
+ epoch_name = str(epoch)
309
+ output_dir = Path(args.output_dir)
310
+ checkpoint_path = output_dir / ('checkpoint-%s.pth' % epoch_name)
311
+
312
+ # ema
313
+ if ema_params is not None:
314
+ ema_state_dict = copy.deepcopy(model_without_ddp.state_dict())
315
+ for i, (name, _value) in enumerate(model_without_ddp.named_parameters()):
316
+ assert name in ema_state_dict
317
+ ema_state_dict[name] = ema_params[i]
318
+ else:
319
+ ema_state_dict = None
320
+
321
+ to_save = {
322
+ 'model': model_without_ddp.state_dict(),
323
+ 'model_ema': ema_state_dict,
324
+ 'optimizer': optimizer.state_dict(),
325
+ 'epoch': epoch,
326
+ 'scaler': loss_scaler.state_dict(),
327
+ 'args': args,
328
+ }
329
+ save_on_master(to_save, checkpoint_path)
330
+
331
+
332
+ def all_reduce_mean(x):
333
+ world_size = get_world_size()
334
+ if world_size > 1:
335
+ x_reduce = torch.tensor(x).cuda()
336
+ dist.all_reduce(x_reduce)
337
+ x_reduce /= world_size
338
+ return x_reduce.item()
339
+ else:
340
+ return x
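
For reference, a usage sketch of add_weight_decay defined above; the toy model and hyper-parameters are assumptions, and the import path assumes this repo's layout.

import torch
import torch.nn as nn
from src.models.mar.misc import add_weight_decay

model = nn.Sequential(nn.Linear(8, 16), nn.LayerNorm(16), nn.Linear(16, 4))
param_groups = add_weight_decay(model, weight_decay=0.05)
optimizer = torch.optim.AdamW(param_groups, lr=1e-4)
# biases and LayerNorm parameters fall into the zero-weight-decay group
print([len(g['params']) for g in param_groups])   # [4, 2]
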
src/models/mar/vae.py ADDED
@@ -0,0 +1,525 @@
1
+ # Adopted from LDM's KL-VAE: https://github.com/CompVis/latent-diffusion
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ import numpy as np
6
+
7
+
8
+ def nonlinearity(x):
9
+ # swish
10
+ return x * torch.sigmoid(x)
11
+
12
+
13
+ def Normalize(in_channels, num_groups=32):
14
+ return torch.nn.GroupNorm(
15
+ num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
16
+ )
17
+
18
+
19
+ class Upsample(nn.Module):
20
+ def __init__(self, in_channels, with_conv):
21
+ super().__init__()
22
+ self.with_conv = with_conv
23
+ if self.with_conv:
24
+ self.conv = torch.nn.Conv2d(
25
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
26
+ )
27
+
28
+ def forward(self, x):
29
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
30
+ if self.with_conv:
31
+ x = self.conv(x)
32
+ return x
33
+
34
+
35
+ class Downsample(nn.Module):
36
+ def __init__(self, in_channels, with_conv):
37
+ super().__init__()
38
+ self.with_conv = with_conv
39
+ if self.with_conv:
40
+ # no asymmetric padding in torch conv, must do it ourselves
41
+ self.conv = torch.nn.Conv2d(
42
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
43
+ )
44
+
45
+ def forward(self, x):
46
+ if self.with_conv:
47
+ pad = (0, 1, 0, 1)
48
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
49
+ x = self.conv(x)
50
+ else:
51
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
52
+ return x
53
+
54
+
55
+ class ResnetBlock(nn.Module):
56
+ def __init__(
57
+ self,
58
+ *,
59
+ in_channels,
60
+ out_channels=None,
61
+ conv_shortcut=False,
62
+ dropout,
63
+ temb_channels=512,
64
+ ):
65
+ super().__init__()
66
+ self.in_channels = in_channels
67
+ out_channels = in_channels if out_channels is None else out_channels
68
+ self.out_channels = out_channels
69
+ self.use_conv_shortcut = conv_shortcut
70
+
71
+ self.norm1 = Normalize(in_channels)
72
+ self.conv1 = torch.nn.Conv2d(
73
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
74
+ )
75
+ if temb_channels > 0:
76
+ self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
77
+ self.norm2 = Normalize(out_channels)
78
+ self.dropout = torch.nn.Dropout(dropout)
79
+ self.conv2 = torch.nn.Conv2d(
80
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
81
+ )
82
+ if self.in_channels != self.out_channels:
83
+ if self.use_conv_shortcut:
84
+ self.conv_shortcut = torch.nn.Conv2d(
85
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
86
+ )
87
+ else:
88
+ self.nin_shortcut = torch.nn.Conv2d(
89
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
90
+ )
91
+
92
+ def forward(self, x, temb):
93
+ h = x
94
+ h = self.norm1(h)
95
+ h = nonlinearity(h)
96
+ h = self.conv1(h)
97
+
98
+ if temb is not None:
99
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
100
+
101
+ h = self.norm2(h)
102
+ h = nonlinearity(h)
103
+ h = self.dropout(h)
104
+ h = self.conv2(h)
105
+
106
+ if self.in_channels != self.out_channels:
107
+ if self.use_conv_shortcut:
108
+ x = self.conv_shortcut(x)
109
+ else:
110
+ x = self.nin_shortcut(x)
111
+
112
+ return x + h
113
+
114
+
115
+ class AttnBlock(nn.Module):
116
+ def __init__(self, in_channels):
117
+ super().__init__()
118
+ self.in_channels = in_channels
119
+
120
+ self.norm = Normalize(in_channels)
121
+ self.q = torch.nn.Conv2d(
122
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
123
+ )
124
+ self.k = torch.nn.Conv2d(
125
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
126
+ )
127
+ self.v = torch.nn.Conv2d(
128
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
129
+ )
130
+ self.proj_out = torch.nn.Conv2d(
131
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
132
+ )
133
+
134
+ def forward(self, x):
135
+ h_ = x
136
+ h_ = self.norm(h_)
137
+ q = self.q(h_)
138
+ k = self.k(h_)
139
+ v = self.v(h_)
140
+
141
+ # compute attention
142
+ b, c, h, w = q.shape
143
+ q = q.reshape(b, c, h * w)
144
+ q = q.permute(0, 2, 1) # b,hw,c
145
+ k = k.reshape(b, c, h * w) # b,c,hw
146
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
147
+ w_ = w_ * (int(c) ** (-0.5))
148
+ w_ = torch.nn.functional.softmax(w_, dim=2)
149
+
150
+ # attend to values
151
+ v = v.reshape(b, c, h * w)
152
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
153
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
154
+ h_ = h_.reshape(b, c, h, w)
155
+
156
+ h_ = self.proj_out(h_)
157
+
158
+ return x + h_
159
+
160
+
161
+ class Encoder(nn.Module):
162
+ def __init__(
163
+ self,
164
+ *,
165
+ ch=128,
166
+ out_ch=3,
167
+ ch_mult=(1, 1, 2, 2, 4),
168
+ num_res_blocks=2,
169
+ attn_resolutions=(16,),
170
+ dropout=0.0,
171
+ resamp_with_conv=True,
172
+ in_channels=3,
173
+ resolution=256,
174
+ z_channels=16,
175
+ double_z=True,
176
+ **ignore_kwargs,
177
+ ):
178
+ super().__init__()
179
+ self.ch = ch
180
+ self.temb_ch = 0
181
+ self.num_resolutions = len(ch_mult)
182
+ self.num_res_blocks = num_res_blocks
183
+ self.resolution = resolution
184
+ self.in_channels = in_channels
185
+
186
+ # downsampling
187
+ self.conv_in = torch.nn.Conv2d(
188
+ in_channels, self.ch, kernel_size=3, stride=1, padding=1
189
+ )
190
+
191
+ curr_res = resolution
192
+ in_ch_mult = (1,) + tuple(ch_mult)
193
+ self.down = nn.ModuleList()
194
+ for i_level in range(self.num_resolutions):
195
+ block = nn.ModuleList()
196
+ attn = nn.ModuleList()
197
+ block_in = ch * in_ch_mult[i_level]
198
+ block_out = ch * ch_mult[i_level]
199
+ for i_block in range(self.num_res_blocks):
200
+ block.append(
201
+ ResnetBlock(
202
+ in_channels=block_in,
203
+ out_channels=block_out,
204
+ temb_channels=self.temb_ch,
205
+ dropout=dropout,
206
+ )
207
+ )
208
+ block_in = block_out
209
+ if curr_res in attn_resolutions:
210
+ attn.append(AttnBlock(block_in))
211
+ down = nn.Module()
212
+ down.block = block
213
+ down.attn = attn
214
+ if i_level != self.num_resolutions - 1:
215
+ down.downsample = Downsample(block_in, resamp_with_conv)
216
+ curr_res = curr_res // 2
217
+ self.down.append(down)
218
+
219
+ # middle
220
+ self.mid = nn.Module()
221
+ self.mid.block_1 = ResnetBlock(
222
+ in_channels=block_in,
223
+ out_channels=block_in,
224
+ temb_channels=self.temb_ch,
225
+ dropout=dropout,
226
+ )
227
+ self.mid.attn_1 = AttnBlock(block_in)
228
+ self.mid.block_2 = ResnetBlock(
229
+ in_channels=block_in,
230
+ out_channels=block_in,
231
+ temb_channels=self.temb_ch,
232
+ dropout=dropout,
233
+ )
234
+
235
+ # end
236
+ self.norm_out = Normalize(block_in)
237
+ self.conv_out = torch.nn.Conv2d(
238
+ block_in,
239
+ 2 * z_channels if double_z else z_channels,
240
+ kernel_size=3,
241
+ stride=1,
242
+ padding=1,
243
+ )
244
+
245
+ def forward(self, x):
246
+ # assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
247
+
248
+ # timestep embedding
249
+ temb = None
250
+
251
+ # downsampling
252
+ hs = [self.conv_in(x)]
253
+ for i_level in range(self.num_resolutions):
254
+ for i_block in range(self.num_res_blocks):
255
+ h = self.down[i_level].block[i_block](hs[-1], temb)
256
+ if len(self.down[i_level].attn) > 0:
257
+ h = self.down[i_level].attn[i_block](h)
258
+ hs.append(h)
259
+ if i_level != self.num_resolutions - 1:
260
+ hs.append(self.down[i_level].downsample(hs[-1]))
261
+
262
+ # middle
263
+ h = hs[-1]
264
+ h = self.mid.block_1(h, temb)
265
+ h = self.mid.attn_1(h)
266
+ h = self.mid.block_2(h, temb)
267
+
268
+ # end
269
+ h = self.norm_out(h)
270
+ h = nonlinearity(h)
271
+ h = self.conv_out(h)
272
+ return h
273
+
274
+
275
+ class Decoder(nn.Module):
276
+ def __init__(
277
+ self,
278
+ *,
279
+ ch=128,
280
+ out_ch=3,
281
+ ch_mult=(1, 1, 2, 2, 4),
282
+ num_res_blocks=2,
283
+ attn_resolutions=(),
284
+ dropout=0.0,
285
+ resamp_with_conv=True,
286
+ in_channels=3,
287
+ resolution=256,
288
+ z_channels=16,
289
+ give_pre_end=False,
290
+ **ignore_kwargs,
291
+ ):
292
+ super().__init__()
293
+ self.ch = ch
294
+ self.temb_ch = 0
295
+ self.num_resolutions = len(ch_mult)
296
+ self.num_res_blocks = num_res_blocks
297
+ self.resolution = resolution
298
+ self.in_channels = in_channels
299
+ self.give_pre_end = give_pre_end
300
+
301
+ # compute in_ch_mult, block_in and curr_res at lowest res
302
+ in_ch_mult = (1,) + tuple(ch_mult)
303
+ block_in = ch * ch_mult[self.num_resolutions - 1]
304
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
305
+ self.z_shape = (1, z_channels, curr_res, curr_res)
306
+ print(
307
+ "Working with z of shape {} = {} dimensions.".format(
308
+ self.z_shape, np.prod(self.z_shape)
309
+ )
310
+ )
311
+
312
+ # z to block_in
313
+ self.conv_in = torch.nn.Conv2d(
314
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
315
+ )
316
+
317
+ # middle
318
+ self.mid = nn.Module()
319
+ self.mid.block_1 = ResnetBlock(
320
+ in_channels=block_in,
321
+ out_channels=block_in,
322
+ temb_channels=self.temb_ch,
323
+ dropout=dropout,
324
+ )
325
+ self.mid.attn_1 = AttnBlock(block_in)
326
+ self.mid.block_2 = ResnetBlock(
327
+ in_channels=block_in,
328
+ out_channels=block_in,
329
+ temb_channels=self.temb_ch,
330
+ dropout=dropout,
331
+ )
332
+
333
+ # upsampling
334
+ self.up = nn.ModuleList()
335
+ for i_level in reversed(range(self.num_resolutions)):
336
+ block = nn.ModuleList()
337
+ attn = nn.ModuleList()
338
+ block_out = ch * ch_mult[i_level]
339
+ for i_block in range(self.num_res_blocks + 1):
340
+ block.append(
341
+ ResnetBlock(
342
+ in_channels=block_in,
343
+ out_channels=block_out,
344
+ temb_channels=self.temb_ch,
345
+ dropout=dropout,
346
+ )
347
+ )
348
+ block_in = block_out
349
+ if curr_res in attn_resolutions:
350
+ attn.append(AttnBlock(block_in))
351
+ up = nn.Module()
352
+ up.block = block
353
+ up.attn = attn
354
+ if i_level != 0:
355
+ up.upsample = Upsample(block_in, resamp_with_conv)
356
+ curr_res = curr_res * 2
357
+ self.up.insert(0, up) # prepend to get consistent order
358
+
359
+ # end
360
+ self.norm_out = Normalize(block_in)
361
+ self.conv_out = torch.nn.Conv2d(
362
+ block_in, out_ch, kernel_size=3, stride=1, padding=1
363
+ )
364
+
365
+ def forward(self, z):
366
+ # assert z.shape[1:] == self.z_shape[1:]
367
+ self.last_z_shape = z.shape
368
+
369
+ # timestep embedding
370
+ temb = None
371
+
372
+ # z to block_in
373
+ h = self.conv_in(z)
374
+
375
+ # middle
376
+ h = self.mid.block_1(h, temb)
377
+ h = self.mid.attn_1(h)
378
+ h = self.mid.block_2(h, temb)
379
+
380
+ # upsampling
381
+ for i_level in reversed(range(self.num_resolutions)):
382
+ for i_block in range(self.num_res_blocks + 1):
383
+ h = self.up[i_level].block[i_block](h, temb)
384
+ if len(self.up[i_level].attn) > 0:
385
+ h = self.up[i_level].attn[i_block](h)
386
+ if i_level != 0:
387
+ h = self.up[i_level].upsample(h)
388
+
389
+ # end
390
+ if self.give_pre_end:
391
+ return h
392
+
393
+ h = self.norm_out(h)
394
+ h = nonlinearity(h)
395
+ h = self.conv_out(h)
396
+ return h
397
+
398
+
399
+ class DiagonalGaussianDistribution(object):
400
+ def __init__(self, parameters, deterministic=False):
401
+ self.parameters = parameters
402
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
403
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
404
+ self.deterministic = deterministic
405
+ self.std = torch.exp(0.5 * self.logvar)
406
+ self.var = torch.exp(self.logvar)
407
+ if self.deterministic:
408
+ self.var = self.std = torch.zeros_like(self.mean).to(
409
+ device=self.parameters.device
410
+ )
411
+
412
+ def sample(self):
413
+ x = self.mean + self.std * torch.randn(self.mean.shape).to(
414
+ device=self.parameters.device
415
+ )
416
+ return x
417
+
418
+ def kl(self, other=None):
419
+ if self.deterministic:
420
+ return torch.Tensor([0.0])
421
+ else:
422
+ if other is None:
423
+ return 0.5 * torch.sum(
424
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
425
+ dim=[1, 2, 3],
426
+ )
427
+ else:
428
+ return 0.5 * torch.sum(
429
+ torch.pow(self.mean - other.mean, 2) / other.var
430
+ + self.var / other.var
431
+ - 1.0
432
+ - self.logvar
433
+ + other.logvar,
434
+ dim=[1, 2, 3],
435
+ )
436
+
437
+ def nll(self, sample, dims=[1, 2, 3]):
438
+ if self.deterministic:
439
+ return torch.Tensor([0.0])
440
+ logtwopi = np.log(2.0 * np.pi)
441
+ return 0.5 * torch.sum(
442
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
443
+ dim=dims,
444
+ )
445
+
446
+ def mode(self):
447
+ return self.mean
448
+
449
+
450
+ class AutoencoderKL(nn.Module):
451
+ def __init__(self, embed_dim, ch_mult, use_variational=True, ckpt_path=None):
452
+ super().__init__()
453
+ self.encoder = Encoder(ch_mult=ch_mult, z_channels=embed_dim)
454
+ self.decoder = Decoder(ch_mult=ch_mult, z_channels=embed_dim)
455
+ self.use_variational = use_variational
456
+ mult = 2 if self.use_variational else 1
457
+ self.quant_conv = torch.nn.Conv2d(2 * embed_dim, mult * embed_dim, 1)
458
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, embed_dim, 1)
459
+ self.embed_dim = embed_dim
460
+ if ckpt_path is not None:
461
+ self.init_from_ckpt(ckpt_path)
462
+
463
+ def init_from_ckpt(self, path):
464
+ sd = torch.load(path, map_location="cpu")["model"]
465
+ msg = self.load_state_dict(sd, strict=False)
466
+ print("Loading pre-trained KL-VAE")
467
+ print("Missing keys:")
468
+ print(msg.missing_keys)
469
+ print("Unexpected keys:")
470
+ print(msg.unexpected_keys)
471
+ print(f"Restored from {path}")
472
+
473
+ def encode(self, x):
474
+ h = self.encoder(x)
475
+ moments = self.quant_conv(h)
476
+ if not self.use_variational:
477
+ moments = torch.cat((moments, torch.ones_like(moments)), 1)
478
+ posterior = DiagonalGaussianDistribution(moments)
479
+ return posterior
480
+
481
+ def decode(self, z):
482
+ z = self.post_quant_conv(z)
483
+ dec = self.decoder(z)
484
+ return dec
485
+
486
+ def forward(self, inputs, disable=True, train=True, optimizer_idx=0):
487
+ if train:
488
+ return self.training_step(inputs, disable, optimizer_idx)
489
+ else:
490
+ return self.validation_step(inputs, disable)
491
+
492
+
493
+ if __name__ == "__main__":
494
+ from PIL import Image
495
+ import numpy as np
496
+ import torch.nn.functional as F
497
+
498
+ vae = AutoencoderKL(
499
+ embed_dim=16, ch_mult=(1, 1, 2, 2, 4),
500
+ ckpt_path='checkpoints/kl16.ckpt')
501
+
502
+ image = Image.open('data/ILSVRC2012_val_00023344.JPEG')
503
+ image = torch.from_numpy(np.array(image))
504
+ image = image.permute(2, 0, 1).float() / 255
505
+ image = 2 * image - 1
506
+
507
+ x = F.interpolate(image[None], size=(256, 256), mode='bilinear', align_corners=True)
508
+
509
+ print(x.shape)
510
+
511
+ with torch.no_grad():
512
+ z = vae.encode(x).sample()
513
+ print(z.shape)
514
+ x_rec = vae.decode(z)[0]
515
+
516
+ x_rec = (x_rec + 1.0) * 255 / 2
517
+ x_rec = torch.clamp(x_rec, min=0, max=255)
518
+ x_rec = x_rec.to(torch.uint8)
519
+
520
+ x_rec = x_rec.permute(1, 2, 0)
521
+
522
+ x_rec = Image.fromarray(x_rec.numpy())
523
+
524
+ x_rec.show()
525
+
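
The closed-form expressions in DiagonalGaussianDistribution.kl() and nll() are the standard diagonal-Gaussian KL divergence and negative log-likelihood. Below is a minimal, self-contained sanity check of the KL term against torch.distributions; the tensor shapes are illustrative only and the snippet is not part of the uploaded module.

import torch
from torch.distributions import Normal, kl_divergence

mean = torch.randn(2, 4, 8, 8)
logvar = torch.randn(2, 4, 8, 8).clamp(-30.0, 20.0)
var = logvar.exp()

# Closed form used by kl(): 0.5 * sum(mu^2 + var - 1 - logvar) over C, H, W
kl_closed = 0.5 * torch.sum(mean.pow(2) + var - 1.0 - logvar, dim=[1, 2, 3])

# Same quantity via torch.distributions, reduced over the same dims
p = Normal(mean, (0.5 * logvar).exp())
q = Normal(torch.zeros_like(mean), torch.ones_like(mean))
kl_ref = kl_divergence(p, q).sum(dim=[1, 2, 3])

print(torch.allclose(kl_closed, kl_ref, atol=1e-5))  # expected: True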
src/models/skywork_unipic_dev.py ADDED
@@ -0,0 +1,645 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch.nn.modules.module import T
4
+ from mmengine.model import BaseModel
5
+ from torch.autograd.function import Function
6
+ from mmengine.logging import print_log
7
+ from xtuner.model.utils import guess_load_checkpoint
8
+ import os
9
+ #from .skywork_unipic import SkyworkUnipic
10
+ from .skywork_unipic_siglip import SkyworkUnipic
11
+ from xtuner.utils import IMAGE_TOKEN_INDEX
12
+ import torch.distributed as dist
13
+ import json
14
+ from einops import rearrange
15
+
16
+
17
+ def _load_state_dict_with_ds(module_to_load, state_dict, start_prefix="", strict=True):
18
+ try:
19
+ import deepspeed
20
+ except ImportError:
21
+ raise ImportError("deepspeed is not installed. Please install deepspeed to use this feature.")
22
+
23
+ # copy state_dict so _load_from_state_dict can modify it
24
+ metadata = getattr(state_dict, "_metadata", None)
25
+ state_dict = state_dict.copy()
26
+ if metadata is not None:
27
+ state_dict._metadata = metadata
28
+
29
+ error_msgs = []
30
+ missing_keys = []
31
+ unexpected_keys = []
32
+
33
+ def load(module: torch.nn.Module, state_dict, prefix=""):
34
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
35
+ args = (state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
36
+ # Parameters of module and children will start with prefix. We can exit early if there are none in this
37
+ # state_dict
38
+ if len([key for key in state_dict if key.startswith(prefix)]) > 0:
39
+ # In sharded models, each shard has only part of the full state_dict, so only gather
40
+ # parameters that are in the current state_dict.
41
+ named_parameters = dict(
42
+ module.named_parameters(prefix=prefix[:-1], recurse=False)
43
+ )
44
+ params_to_gather = [
45
+ named_parameters[k]
46
+ for k in state_dict.keys()
47
+ if k in named_parameters
48
+ ]
49
+ if len(params_to_gather) > 0:
50
+ # because zero3 puts placeholders in model params, this context
51
+ # manager gathers (unpartitions) the params of the current layer, then loads from
52
+ # the state dict and then re-partitions them again
53
+ with deepspeed.zero.GatheredParameters(
54
+ params_to_gather, modifier_rank=0
55
+ ):
56
+ if deepspeed.comm.get_rank() == 0:
57
+ module._load_from_state_dict(*args)
58
+ else:
59
+ module._load_from_state_dict(*args)
60
+
61
+ for name, child in module._modules.items():
62
+ if child is not None:
63
+ load(child, state_dict, prefix + name + ".")
64
+
65
+ load(module_to_load, state_dict, start_prefix)
66
+ if len(missing_keys) > 0:
67
+ print_log(f"[WARNING] Missing keys: {missing_keys}")
68
+ if len(unexpected_keys) > 0:
69
+ print_log(f"[WARNING] Unexpected keys: {unexpected_keys}")
70
+ if error_msgs:
71
+ raise RuntimeError(
72
+ "Error(s) in loading state_dict for {}:\n\t{}".format(
73
+ module_to_load.__class__.__name__, "\n\t".join(error_msgs)
74
+ )
75
+ )
76
+
77
+
78
+ class _ScaleGradient(Function):
79
+ @staticmethod
80
+ def forward(ctx, input, scale):
81
+ ctx.scale = scale
82
+ return input
83
+
84
+ @staticmethod
85
+ def backward(ctx, grad_output):
86
+ return grad_output * ctx.scale, None
87
+
88
+
89
+ class SkyworkUnipicDev(SkyworkUnipic, BaseModel):
90
+ def __init__(
91
+ self,
92
+ grad_scale=0.1,
93
+ loss_weights=None,
94
+ pretrained_pth=None,
95
+ mar_path=None,
96
+ siglip_proj_path=None,
97
+ freeze_llm=False,
98
+ freeze_mar=False,
99
+ freeze_mar_decoder=False,
100
+ freeze_siglip_proj=False,
101
+ gradient_checkpointing=True,
102
+ **kwargs,
103
+ ):
104
+ if loss_weights is None:
105
+ loss_weights = {
106
+ "image2text": 0.01,
107
+ "text2image": 1.0,
108
+ "image_edit": 1.0,
109
+ "contrastive": 0.1,
110
+ }
111
+ super().__init__(**kwargs)
112
+
113
+ self.grad_scale = grad_scale
114
+ self.loss_weights = loss_weights
115
+ self.pretrained_pth = pretrained_pth
116
+ self.mar_path = mar_path
117
+ self.siglip_proj_path = siglip_proj_path
118
+
119
+ # Determine the distributed rank
120
+ rank = dist.get_rank() if dist.is_initialized() else 0
121
+
122
+ # === Load pretrained weights ===
123
+ if pretrained_pth:
124
+ self.load_hf_weights(
125
+ skywork_unipic_ckpt=pretrained_pth,
126
+ siglip_proj_path=siglip_proj_path,
127
+ mar_path=mar_path
128
+ )
129
+
130
+ # === Freeze modules ===
131
+ if freeze_llm:
132
+ self.llm.requires_grad_(False)
133
+ if freeze_mar:
134
+ self.mar.requires_grad_(False)
135
+ if freeze_mar_decoder:
136
+ # Freeze only the MAR decoder components
137
+ for param in self.mar.decoder_embed.parameters():
138
+ param.requires_grad = False
139
+ for block in self.mar.decoder_blocks:
140
+ for param in block.parameters():
141
+ param.requires_grad = False
142
+ for param in self.mar.decoder_norm.parameters():
143
+ param.requires_grad = False
144
+ if isinstance(self.mar.decoder_pos_embed_learned, torch.nn.Parameter):
145
+ self.mar.decoder_pos_embed_learned.requires_grad = False
146
+ if isinstance(self.mar.diffusion_pos_embed_learned, torch.nn.Parameter):
147
+ self.mar.diffusion_pos_embed_learned.requires_grad = False
148
+ if freeze_siglip_proj:
149
+ self.siglip2_proj.requires_grad_(False)
150
+
151
+ # === Gradient checkpointing ===
152
+ if gradient_checkpointing:
153
+ self.gradient_checkpointing_enable()
154
+ else:
155
+ self.gradient_checkpointing_disable()
156
+
157
+
158
+ def load_hf_weights(self,
159
+ skywork_unipic_ckpt: str = None,
160
+ siglip_proj_path: str = None,
161
+ mar_path: str = None):
162
+ """Load SkyworkUnipic (optional), SigLIP2, and MAR weights in one place."""
163
+ device = "cpu"
164
+ state_dict = {}
165
+
166
+ def _print_load_result(module_name, missing, unexpected):
167
+ print_log(f"[INFO] Loaded {module_name}. missing={len(missing)}, unexpected={len(unexpected)}")
168
+
169
+ # === SkyworkUnipic main model (optional) ===
170
+ if skywork_unipic_ckpt:
171
+ print_log(f"[INFO] Loading SkyworkUnipic checkpoint from: {skywork_unipic_ckpt}")
172
+ # Load the checkpoint (supports a single file or a sharded directory)
173
+ if os.path.isfile(skywork_unipic_ckpt):
174
+ skywork_unipic_state = torch.load(skywork_unipic_ckpt, map_location=device)
175
+ else:
176
+ idx = os.path.join(skywork_unipic_ckpt, "pytorch_model.bin.index.json")
177
+ if os.path.exists(idx):
178
+ with open(idx, 'r') as f:
179
+ index = json.load(f)
180
+ skywork_unipic_state = {}
181
+ for shard in sorted(set(index["weight_map"].values())):
182
+ shard_path = os.path.join(skywork_unipic_ckpt, shard)
183
+ skywork_unipic_state.update(torch.load(shard_path, map_location=device))
184
+ else:
185
+ bin_path = os.path.join(skywork_unipic_ckpt, "pytorch_model.bin")
186
+ skywork_unipic_state = torch.load(bin_path, map_location=device)
187
+
188
+ # Drop MAR pos_embed keys that the SkyworkUnipic checkpoint may carry, to avoid overwriting them
189
+
190
+ # for key in [
191
+ # "mar.encoder_pos_embed_learned",
192
+ # "mar.decoder_pos_embed_learned",
193
+ # "mar.diffusion_pos_embed_learned"
194
+ # ]:
195
+ # if key in skywork_unipic_state:
196
+ # print_log(f"[INFO] Dropping `{key}` from SkyworkUnipic checkpoint")
197
+ # del skywork_unipic_state[key]
198
+ model_dict = self.state_dict()
199
+
200
+ filtered_checkpoint = {}
201
+ shape_mismatch_keys = []
202
+
203
+ for k, v in skywork_unipic_state.items():
204
+ if k in model_dict:
205
+ if v.shape == model_dict[k].shape:
206
+ filtered_checkpoint[k] = v
207
+ else:
208
+ shape_mismatch_keys.append((k, v.shape, model_dict[k].shape))
209
+
210
+ missing, unexpected = self.load_state_dict(filtered_checkpoint, strict=False)
211
+ # Print the mismatched keys and their shapes
212
+ if shape_mismatch_keys:
213
+ print("The following keys were skipped due to shape mismatches:")
214
+ for k, checkpoint_shape, model_shape in shape_mismatch_keys:
215
+ print(f" - {k}:")
216
+ print(f" shape in checkpoint: {checkpoint_shape}")
217
+ print(f" shape in current model: {model_shape}")
218
+ else:
219
+ print("All key shapes matched; no parameters were skipped")
220
+
221
+ # missing, unexpected = self.load_state_dict(skywork_unipic_state, strict=False)
222
+ _print_load_result("SkyworkUnipic", missing, unexpected)
223
+ else:
224
+ print_log("[INFO] Skipping SkyworkUnipic checkpoint loading")
225
+
226
+ # === SigLIP2 weights ===
227
+ if siglip_proj_path:
228
+ print_log(f"[INFO] Loading SigLIP2 weights from: {siglip_proj_path}")
229
+ siglip_state = torch.load(
230
+ siglip_proj_path, map_location="cpu", weights_only=False
231
+ )
232
+ # Handle checkpoints wrapped as {"model": {...}}
233
+ if isinstance(siglip_state, dict) and "model" in siglip_state:
234
+ siglip_state = siglip_state["model"]
235
+ missing, unexpected = self.siglip2_proj.load_state_dict(
236
+ siglip_state, strict=False
237
+ )
238
+ _print_load_result("SigLIP2", missing, unexpected)
239
+ else:
240
+ print_log("[INFO] No SigLIP2 checkpoint provided, skipping")
241
+
242
+ # === MAR weights ===
243
+ if mar_path:
244
+ print_log(f"[INFO] Loading MAR weights from: {mar_path}")
245
+ mar_state = torch.load(mar_path, map_location="cpu", weights_only=False)
246
+ # Support either a "model_ema" or a "model" dict
247
+
248
+ if isinstance(mar_state, dict) and "model_ema" in mar_state:
249
+ mar_state = mar_state["model_ema"]
250
+
251
+ elif isinstance(mar_state, dict) and "model" in mar_state:
252
+ mar_state = mar_state["model"]
253
+
254
+
255
+ # If keys carry a "mar." prefix, strip it from all of them
256
+ if any(k.startswith("mar.") for k in mar_state):
257
+ filtered_mar = {
258
+ k.replace("mar.", "", 1): v
259
+ for k, v in mar_state.items()
260
+ if k.startswith("mar.")
261
+ }
262
+ else:
263
+ filtered_mar = mar_state
264
+
265
+ missing, unexpected = self.mar.load_state_dict(
266
+ filtered_mar, strict=False
267
+ )
268
+ _print_load_result("MAR", missing, unexpected)
269
+ else:
270
+ print_log("[INFO] No MAR checkpoint provided, skipping")
271
+
272
+ return state_dict
273
+
274
+
275
+
276
+ def gradient_checkpointing_disable(self):
277
+ self.llm.gradient_checkpointing_disable()
278
+ self.mar.gradient_checkpointing_disable()
279
+
280
+ def gradient_checkpointing_enable(self):
281
+ self.llm.gradient_checkpointing_enable()
282
+ self.mar.gradient_checkpointing_enable()
283
+
284
+ def state_dict(self, *args, **kwargs):
285
+ state_dict = super().state_dict(*args, **kwargs)
286
+ state_dict = {k: v for k, v in state_dict.items()
287
+ if 'vae.' not in k}
288
+
289
+ return state_dict
290
+
291
+ def train(self: T, mode: bool = True) -> T:
292
+ super().train(mode=mode)
293
+ self.vae.train(mode=False)
294
+ return self
295
+
296
+ def text2image_loss(self, data_dict):
297
+ x = data_dict['pixel_values'].to(dtype=self.dtype, device=self.device)
298
+ x = self.encode(x) # b m n c
299
+ b, m, n, _ = x.shape
300
+ gt_latents = x.clone().detach().view(b, m*n, -1)
301
+
302
+ orders = self.mar.sample_orders(bsz=b, seq_len=m*n)
303
+ mask = self.mar.random_masking(x.flatten(1, 2), orders)
304
+
305
+ input_ids = data_dict['input_ids'].to(self.device)
306
+ attention_mask = data_dict['attention_mask'].to(self.device)
307
+ x_enc = self.forward_mae_encoder(x, mask, input_ids=input_ids,
308
+ attention_mask=attention_mask)
309
+ z = self.mar.forward_mae_decoder(x_enc, mask, image_shape=(m, n))
310
+
311
+ loss = self.mar.forward_loss(z=z, target=gt_latents, mask=mask)
312
+
313
+ return loss
314
+
315
+ def image2text_loss(self, data_dict):
316
+ input_ids = data_dict['input_ids'].to(self.device)
317
+ attention_mask = data_dict['attention_mask'].to(self.device)
318
+ labels = data_dict['labels'].to(self.device)
319
+
320
+ pixel_values = data_dict.get('pixel_values', None)
321
+ # print("pixel_values batch:", pixel_values.shape)
322
+ # print("input_ids batch:", input_ids.shape)
323
+ if pixel_values is None:
324
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
325
+ _, z_null = self.extract_visual_feature(
326
+ torch.zeros(1, 16, 16, self.token_embed_dim,
327
+ dtype=self.dtype, device=self.device)
328
+ )
329
+ loss_null = z_null.mean() * 0.0
330
+ print(f"No image found in this batch!", flush=True)
331
+ else:
332
+ x = pixel_values.to(dtype=self.dtype, device=self.device)
333
+ x = self.encode(x) # b m n c
334
+ _, z_enc = self.extract_visual_feature(x)
335
+
336
+ if self.grad_scale is not None:
337
+ z_enc = _ScaleGradient.apply(z_enc, self.grad_scale)
338
+
339
+ inputs_embeds = z_enc.new_zeros(*input_ids.shape, self.llm.config.hidden_size)
340
+
341
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
342
+ IMAGE_TOKEN_INDEX = self.tokenizer.convert_tokens_to_ids("<image>")
343
+ # print(f"IMAGE_TOKEN_INDEX: {IMAGE_TOKEN_INDEX}")
344
+ img_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()  # input_ids is already a tensor
345
+ # print(f"[sanity check] input_ids length: {input_ids.shape[1]}, image token count: {img_tokens}\n")
346
+
347
+ inputs_embeds[input_ids == IMAGE_TOKEN_INDEX] = z_enc.flatten(0, 1)
348
+ inputs_embeds[input_ids != IMAGE_TOKEN_INDEX] = self.llm.get_input_embeddings()(
349
+ input_ids[input_ids != IMAGE_TOKEN_INDEX])
350
+ loss_null = 0.0
351
+
352
+ output = self.llm_model(inputs_embeds=inputs_embeds,
353
+ attention_mask=attention_mask,
354
+ return_dict=True)
355
+
356
+ last_hidden_state = output.last_hidden_state[:, :-1]
357
+ labels = labels[:, 1:]
358
+ last_hidden_state = last_hidden_state[labels >= 0]
359
+ labels = labels[labels >= 0]
360
+ logits = self.llm.get_output_embeddings()(last_hidden_state)
361
+
362
+ loss_i2t = F.cross_entropy(input=logits, target=labels)
363
+
364
+ return loss_i2t + loss_null
365
+
366
+ # def image_edit_loss(self, data_dict):
367
+ # # 1. 图像前向:拼 batch 并编码到视觉特征
368
+ # x_src = data_dict['pixel_values_src'].to(dtype=self.dtype, device=self.device) # 源图像批次,shape=[b_src, C, H, W]
369
+ # x = data_dict['pixel_values'].to(dtype=self.dtype, device=self.device) # 编辑图像批次,shape=[b_edit, C, H, W]
370
+ # print_log(f"[DEBUG image_edit_loss] x_src.shape = {x_src.shape}, x.shape = {x.shape}", level="WARNING")
371
+
372
+ # # b_edit 应该 >= b_src
373
+ # assert x.shape[0] >= x_src.shape[0], \
374
+ # f"编辑批次大小 ({x.shape[0]}) 必须 >= 源图像批次大小 ({x_src.shape[0]})"
375
+
376
+ # # 拼接并一次性编码
377
+ # x_all = torch.cat([x_src, x], dim=0) # shape=[b_src + b_edit, C, H, W]
378
+ # x_all = self.encode(x_all) # shape=[b_src + b_edit, m, n, c]
379
+ # # 分割回源/编辑两部分
380
+ # x_src_enc, x_enc = x_all.split([x_src.shape[0], x.shape[0]], dim=0)
381
+ # # x_src_enc.shape=[b_src, m, n, c], x_enc.shape=[b_edit, m, n, c]
382
+
383
+ # # 2. 提取视觉特征:x_con 用于 decoder 条件,z_src 用于填充文本中的 <image> token
384
+ # x_con, z_src = self.extract_visual_feature(x_src_enc)
385
+ # if self.grad_scale is not None:
386
+ # x_con = _ScaleGradient.apply(x_con, self.grad_scale)
387
+ # z_src = _ScaleGradient.apply(z_src, self.grad_scale)
388
+ # # z_src.shape = [b_src, m*n, C]
389
+
390
+ # # 3. 文本条件分支:构造 inputs_embeds
391
+ # attention_mask = data_dict['attention_mask'].to(self.device) # shape=[b_edit, seq_len]
392
+ # input_ids = data_dict['input_ids'].to(self.device) # shape=[b_edit, seq_len]
393
+ # b_edit, seq_len = input_ids.shape
394
+ # hidden_size = self.llm.config.hidden_size
395
+
396
+ # # 先准备一个全 0 的 inputs_embeds
397
+ # inputs_embeds = z_src.new_zeros(b_edit, seq_len, hidden_size) # shape=[b_edit, seq_len, hidden_size]
398
+
399
+ # # 找到所有 <image> token 位置的 mask
400
+ # mask_imgpos = (input_ids == IMAGE_TOKEN_INDEX) # bool tensor [b_edit, seq_len]
401
+
402
+ # # 需要将单个 z_src 展开成 b_edit 份,再按 mask_imgpos 填入
403
+ # # 1) expand:把 z_src 从 [b_src, m*n, C] → [b_edit, m*n, C]
404
+ # # (一般 b_src=1,所以就是复制那一份)
405
+ # z_src_rep = z_src.expand(b_edit, -1, -1) # [b_edit, m*n, C]
406
+ # # 2) flatten:将二维展开到一维,对应 mask_imgpos.sum() 个位置
407
+ # flat_z = z_src_rep.flatten(0, 1) # [b_edit*m*n, C]
408
+
409
+ # # **重要检查**:保证 mask_imgpos 中 True 的数量 == flat_z.shape[0]
410
+ # img_tokens_count = mask_imgpos.sum().item()
411
+ # assert img_tokens_count == flat_z.shape[0], \
412
+ # f"<image> token 数 ({img_tokens_count}) 不等于视觉特征数 ({flat_z.shape[0]})"
413
+
414
+ # # 填充视觉 token 对应位置
415
+ # inputs_embeds[mask_imgpos] = flat_z
416
+
417
+ # # 剩下的位置用文本 embedding
418
+ # txt_pos = ~mask_imgpos
419
+ # txt_embeddings = self.llm.get_input_embeddings()(input_ids[txt_pos])
420
+ # inputs_embeds[txt_pos] = txt_embeddings
421
+
422
+ # # 4. MAE-style 重建分支:在 decoder 前注入 inputs_embeds 与 attention_mask
423
+ # b, m, n, c = x_enc.shape
424
+ # gt = x_enc.view(b, m*n, c) # 作为重建目标
425
+ # orders = self.mar.sample_orders(bsz=b, seq_len=m*n)
426
+ # mask = self.mar.random_masking(x_enc.flatten(1, 2), orders)
427
+
428
+ # # 带条件的 encoder forward
429
+ # x_enc_out = self.forward_mae_encoder(
430
+ # x_enc,
431
+ # mask,
432
+ # inputs_embeds=inputs_embeds,
433
+ # attention_mask=attention_mask
434
+ # )
435
+ # # decoder 重建
436
+ # z_dec = self.mar.forward_mae_decoder(
437
+ # x_enc_out,
438
+ # mask,
439
+ # image_shape=(m, n),
440
+ # x_con=x_con
441
+ # )
442
+ # # 计算损失
443
+ # loss = self.mar.forward_loss(z=z_dec, target=gt, mask=mask)
444
+ # return loss
445
+
446
+
447
+ # def image_edit_loss_vae(self, data_dict):
448
+ # """
449
+ # 计算图像编辑任务的损失。
450
+ # 参考图(x_src)的特征直接作为条件(x_con)送入解码器,不参与编码器重建。
451
+ # 编码器(encoder)仅在目标图(x_tgt)上进行掩码重建,并接收文本和参考图的上下文信息。
452
+ # """
453
+ # # === 步骤 1: 读入数据 ===
454
+ # x_src = data_dict['pixel_values_src'].to(self.device).to(self.dtype)
455
+ # x_tgt = data_dict['pixel_values'].to(self.device).to(self.dtype)
456
+ # attention_mask = data_dict['attention_mask'].to(self.device)
457
+ # input_ids = data_dict['input_ids'].to(self.device)
458
+ # # IMG_TOKEN_INDEX = self.tokenizer.convert_tokens_to_ids("<image>")
459
+ # B = x_tgt.shape[0]
460
+
461
+ # # === 步骤 2: 处理参考图 (Reference Image) ===
462
+ # # VAE编码,不计算梯度
463
+ # with torch.no_grad():
464
+ # z_src_latent = self.encode(x_src) # [B, m, n, token_dim]
465
+
466
+ # # 将VAE潜变量转换为解码器条件(x_con)和LLM输入(z_src_buf)
467
+ # # 这一步实现了 "参考图潜变量 -> 解码器" 的直接通路
468
+ # x_con, z_src_buf = self.vae_latent_to_decoder_feature(z_src_latent)
469
+ # # x_con: [B, 4096, enc_dim] -> 用于解码器
470
+ # # z_src_buf: [B, 4160, llm_dim] -> 用于LLM
471
+
472
+ # # === 步骤 3: 构建LLM的输入 (inputs_embeds) ===
473
+ # # 结合文本指令(input_ids)和参考图特征(z_src_buf)
474
+ # _, T = input_ids.shape
475
+ # H_llm = self.llm.config.hidden_size
476
+ # inputs_embeds = torch.zeros(B, T, H_llm, device=self.device, dtype=z_src_buf.dtype)
477
+
478
+ # # 填充<image> token和文本token的嵌入
479
+ # inputs_embeds[input_ids == IMG_TOKEN_INDEX] = z_src_buf.flatten(0, 1)
480
+ # # input_ids 为33280
481
+ # # z_src_buf.flatten(0, 1) 为33792 为什么 会比input_ids 多512个呢?
482
+ # inputs_embeds[input_ids != IMG_TOKEN_INDEX] = self.llm.get_input_embeddings()(
483
+ # input_ids[input_ids != IMG_TOKEN_INDEX]
484
+ # )
485
+
486
+ # # === 步骤 4: 处理目标图 (Target Image) 并进行编码器前向传播 ===
487
+ # # VAE编码目标图,不计算梯度
488
+ # with torch.no_grad():
489
+ # z_tgt_latent = self.encode(x_tgt) # [B, m, n, token_dim]
490
+
491
+ # # 为目标图潜变量创建掩码(mask)以进行MAE重建
492
+ # B, m, n, token_dim = z_tgt_latent.shape
493
+ # patch_tokens_tgt = z_tgt_latent.view(B, m * n, token_dim) # 作为重建的目标
494
+ # orders = self.mar.sample_orders(bsz=B, seq_len=m * n)
495
+ # mask = self.mar.random_masking(patch_tokens_tgt, orders)
496
+
497
+ # # **核心**: 编码器只处理目标图(z_tgt_latent)的可见部分,并接收LLM的上下文
498
+ # x_enc = self.forward_mae_encoder(
499
+ # z_tgt_latent, # 目标图潜变量
500
+ # mask,
501
+ # detach=False,
502
+ # inputs_embeds=inputs_embeds, # 包含文本和参考图信息的上下文
503
+ # attention_mask=attention_mask
504
+ # )
505
+
506
+ # # === 步骤 5: 解码器重建 ===
507
+ # # 解码器使用编码器的输出(x_enc)和参考图的特征(x_con)来重建完整的潜在表示
508
+ # z_pred = self.mar.forward_mae_decoder(
509
+ # x_enc,
510
+ # mask,
511
+ # image_shape=(m, n),
512
+ # x_con=x_con # ★ 参考图特征直接作用于此
513
+ # )
514
+
515
+ # # === 步骤 6: 计算损失 ===
516
+ # loss = self.mar.forward_loss(
517
+ # z=z_pred,
518
+ # target=patch_tokens_tgt,
519
+ # mask=mask
520
+ # )
521
+ # return loss
522
+
523
+ def image_edit_loss_contrastive(self, data_dict):
524
+ # Step 1: obtain the image features
525
+ x_src = data_dict['pixel_values_src'].to(dtype=self.dtype, device=self.device)
526
+ x = data_dict['pixel_values'].to(dtype=self.dtype, device=self.device)
527
+ assert len(x_src) >= len(x)
528
+ x_src, x = self.encode(torch.cat([x_src, x])).split([len(x_src), len(x)], dim=0)
529
+
530
+ # Step 2: text inputs
531
+ attention_mask = data_dict['attention_mask'].to(self.device)
532
+ input_ids = data_dict['input_ids'].to(self.device)
533
+
534
+ x_con, z_src = self.extract_visual_feature(x_src)
535
+ if self.grad_scale is not None:
536
+ z_src = _ScaleGradient.apply(z_src, self.grad_scale)
537
+ x_con = _ScaleGradient.apply(x_con, self.grad_scale)
538
+
539
+ inputs_embeds = z_src.new_zeros(*input_ids.shape, self.llm.config.hidden_size)
540
+ # IMAGE_TOKEN_INDEX = self.tokenizer.convert_tokens_to_ids("<image>")
541
+ inputs_embeds[input_ids == IMAGE_TOKEN_INDEX] = z_src.flatten(0, 1)
542
+ inputs_embeds[input_ids != IMAGE_TOKEN_INDEX] = self.llm.get_input_embeddings()(
543
+ input_ids[input_ids != IMAGE_TOKEN_INDEX]
544
+ )
545
+
546
+ # Step 3: compute the reconstruction loss
547
+ b, m, n, _ = x.shape
548
+ gt_latents = x.clone().detach().view(b, m * n, -1)
549
+ orders = self.mar.sample_orders(bsz=b, seq_len=m*n)
550
+ mask = self.mar.random_masking(x.flatten(1, 2), orders)
551
+ x_enc = self.forward_mae_encoder(x, mask,
552
+ inputs_embeds=inputs_embeds,
553
+ attention_mask=attention_mask)
554
+ z = self.mar.forward_mae_decoder(x_enc, mask, image_shape=(m, n), x_con=x_con)
555
+ rec_loss = self.mar.forward_loss(z=z, target=gt_latents, mask=mask)
556
+
557
+ # Step 4: Contrastive loss between repeat and edit
558
+ # Assumes an even batch size arranged as (repeat, edit) pairs
559
+ z_src_flat = z_src.mean(dim=1) # [B, D] global pooling
560
+ z_src_flat = F.normalize(z_src_flat, dim=-1)
561
+
562
+ repeat_z = z_src_flat[::2] # even index
563
+ edit_z = z_src_flat[1::2] # odd index
564
+
565
+ logits = torch.matmul(edit_z, repeat_z.T) / 0.07 # [B, B]
566
+ labels = torch.arange(logits.size(0), device=logits.device)
567
+ contrastive_loss = F.cross_entropy(logits, labels)
568
+
569
+ return rec_loss + self.loss_weights.get("contrastive") * contrastive_loss
570
+
571
+ def image_edit_loss(self, data_dict):
572
+ # Multi-turn editing is also supported
573
+ x_src = data_dict['pixel_values_src'].to(dtype=self.dtype, device=self.device)
574
+ x = data_dict['pixel_values'].to(dtype=self.dtype, device=self.device)
575
+ # print_log(f"[DEBUG] x_src.shape = {x_src.shape}, x.shape = {x.shape}")
576
+
577
+ # assert len(x_src) >= len(x)
578
+ x_cat = torch.cat([x_src, x], dim=0)
579
+ x_src, x = self.encode(x_cat).split([len(x_src), len(x)], dim=0)
580
+
581
+ # Prepare context, including source images and instructions
582
+ attention_mask = data_dict['attention_mask'].to(self.device)
583
+ input_ids = data_dict['input_ids'].to(self.device)
584
+
585
+ x_con, z_src = self.extract_visual_feature(x_src)
586
+ if self.grad_scale is not None:
587
+ z_src = _ScaleGradient.apply(z_src, self.grad_scale)
588
+ x_con = _ScaleGradient.apply(x_con, self.grad_scale)
589
+
590
+ inputs_embeds = z_src.new_zeros(*input_ids.shape, self.llm.config.hidden_size)
591
+
592
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
593
+
594
+ IMAGE_TOKEN_INDEX = self.tokenizer.convert_tokens_to_ids("<image>")
595
+ # print("tokenizer idx in skywork_unipic_dev=", self.tokenizer.convert_tokens_to_ids("<image>"))
596
+
597
+ inputs_embeds[input_ids == IMAGE_TOKEN_INDEX] = z_src.flatten(0, 1)
598
+ inputs_embeds[input_ids != IMAGE_TOKEN_INDEX] = self.llm.get_input_embeddings()(
599
+ input_ids[input_ids != IMAGE_TOKEN_INDEX]
600
+ )
601
+
602
+ # --------------------------------------------------
603
+ # 3. MAE-style reconstruction
604
+ # --------------------------------------------------
605
+
606
+ b, m, n, _ = x.shape
607
+ gt_latents = x.clone().detach().view(b, m * n, -1)
608
+ orders = self.mar.sample_orders(bsz=b, seq_len=m*n)
609
+ mask = self.mar.random_masking(x.flatten(1, 2), orders)
610
+ x_enc = self.forward_mae_encoder(x, mask,
611
+ inputs_embeds=inputs_embeds,
612
+ attention_mask=attention_mask)
613
+ z = self.mar.forward_mae_decoder(x_enc, mask, image_shape=(m, n), x_con=x_con)
614
+
615
+ loss = self.mar.forward_loss(z=z, target=gt_latents, mask=mask)
616
+ return loss
617
+
618
+
619
+
620
+ def forward(self, data, data_samples=None, mode='loss'):
621
+ if mode == 'loss':
622
+ return self.compute_loss(data_dict=data)
623
+ else:
624
+ raise NotImplementedError
625
+
626
+ def compute_loss(self, data_dict):
627
+ losses = {}
628
+ for data_type, batch_data in data_dict.items():
629
+ if 'text2image' in data_type:
630
+ loss = self.text2image_loss(batch_data)
631
+ elif 'image2text' in data_type:
632
+ loss = self.image2text_loss(batch_data)
633
+ elif 'image_edit' in data_type:
634
+ loss = self.image_edit_loss(batch_data)
635
+ else:
636
+ raise NotImplementedError(f"Unknown data_type: {data_type}")
637
+ weight = self.loss_weights.get(data_type, 1.0)
638
+ losses[f'loss_{data_type}'] = loss * weight
639
+ return losses
640
+
641
+
642
+
643
+
644
+
645
+
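
_ScaleGradient above is an identity in the forward pass and multiplies the incoming gradient by a constant in the backward pass; grad_scale uses it to damp the gradients flowing from the text-side losses back into the visual features. A self-contained sketch of that behaviour, with a toy tensor that is purely illustrative:

import torch
from torch.autograd.function import Function

class ScaleGradient(Function):
    # identity forward, gradient scaled by `scale` in backward
    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        return x

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * ctx.scale, None  # no gradient w.r.t. scale

x = torch.ones(3, requires_grad=True)
ScaleGradient.apply(x, 0.1).sum().backward()
print(x.grad)  # tensor([0.1000, 0.1000, 0.1000]) rather than all ones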
src/models/skywork_unipic_ori.py ADDED
@@ -0,0 +1,350 @@
1
+ import torch
2
+ import math
3
+ import numpy as np
4
+ import torch.nn as nn
5
+ import contextlib
6
+ from einops import rearrange
7
+ from transformers.cache_utils import DynamicCache
8
+ from src.builder import BUILDER
9
+ from tqdm import tqdm
10
+ from torch.nn.utils.rnn import pad_sequence
11
+ from transformers.integrations.deepspeed import (
12
+ is_deepspeed_zero3_enabled,
13
+ set_hf_deepspeed_config,
14
+ unset_hf_deepspeed_config,
15
+ deepspeed_config
16
+ )
17
+
18
+ @contextlib.contextmanager
19
+ def temporarily_disable_deepspeed_zero3():
20
+ if is_deepspeed_zero3_enabled():
21
+ config = deepspeed_config()
22
+ print(f'[DEBUG] ds config={config}')
23
+ unset_hf_deepspeed_config()
24
+ yield
25
+ set_hf_deepspeed_config(config)
26
+ else:
27
+ yield
28
+
29
+
30
+
31
+ def build_mlp(hidden_size, projector_dim, z_dim):
32
+ return nn.Sequential(
33
+ nn.Linear(hidden_size, projector_dim),
34
+ nn.SiLU(),
35
+ nn.Linear(projector_dim, z_dim),)
36
+
37
+
38
+ def mask_by_order(mask_len, order, bsz, seq_len):
39
+ masking = torch.zeros(bsz, seq_len, device=order.device)
40
+ masking = torch.scatter(masking, dim=-1, index=order[:, :mask_len.long()],
41
+ src=torch.ones(bsz, seq_len, device=order.device)).bool()
42
+ return masking
43
+
44
+
45
+ class SkyworkUnipic(nn.Module):
46
+ def __init__(self,
47
+ vae,
48
+ vae_scale,
49
+ llm,
50
+ mar,
51
+ tokenizer,
52
+ prompt_template):
53
+ super().__init__()
54
+ with temporarily_disable_deepspeed_zero3():
55
+ # VAE
56
+ self.vae = BUILDER.build(vae)
57
+ self.vae.requires_grad_(False)
58
+ self.vae_scale = vae_scale
59
+
60
+ # LLM
61
+ self.llm = BUILDER.build(llm)
62
+ self.tokenizer = BUILDER.build(tokenizer)
63
+ self.prompt_template = prompt_template
64
+
65
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
66
+ image_token_idx = self.tokenizer.convert_tokens_to_ids("<image>")
67
+ print(f"Registered <image> token at index {image_token_idx}")
68
+
69
+ # MAR
70
+ self.mar = BUILDER.build(mar)
71
+ # projection layers
72
+ self.proj_in = build_mlp(hidden_size=self.mar.encoder_embed_dim,
73
+ projector_dim=self.llm.config.hidden_size,
74
+ z_dim=self.llm.config.hidden_size)
75
+ self.proj_out = build_mlp(hidden_size=self.llm.config.hidden_size,
76
+ projector_dim=self.llm.config.hidden_size,
77
+ z_dim=self.mar.encoder_embed_dim)
78
+
79
+
80
+ @property
81
+ def llm_model(self):
82
+ return self.llm.model
83
+
84
+ @property
85
+ def device(self):
86
+ return self.llm.device
87
+
88
+ @property
89
+ def dtype(self):
90
+ return self.llm.dtype
91
+
92
+ @property
93
+ def gen_seq_len(self):
94
+ return self.mar.seq_len
95
+
96
+ @property
97
+ def token_embed_dim(self):
98
+ return self.vae.embed_dim * (self.mar.patch_size ** 2)
99
+
100
+ @torch.no_grad()
101
+ def encode(self, x):
102
+ posterior = self.vae.encode(x)
103
+ z = posterior.mode().mul_(self.vae_scale)
104
+ z = rearrange(z, 'b c (m p) (n q) -> b m n (c p q)',
105
+ p=self.mar.patch_size, q=self.mar.patch_size)
106
+
107
+ return z
108
+
109
+ @torch.no_grad()
110
+ def decode(self, z):
111
+ z /= self.vae_scale
112
+ z = rearrange(z, 'b m n (c p q) -> b c (m p) (n q)',
113
+ p=self.mar.patch_size, q=self.mar.patch_size)
114
+
115
+ x = self.vae.decode(z)
116
+ return x
117
+
118
+ def prepare_forward_input(self,
119
+ x,
120
+ inputs_embeds=None,
121
+ input_ids=None,
122
+ attention_mask=None,
123
+ past_key_values=None):
124
+ b, l, _ = x.shape
125
+ attention_mask = attention_mask.to(device=self.device, dtype=torch.bool)
126
+ attention_mask = torch.cat([
127
+ attention_mask, attention_mask.new_ones(b, l)
128
+ ], dim=1)
129
+ position_ids = torch.cumsum(attention_mask, dim=1) - 1
130
+ position_ids[position_ids < 0] = 0
131
+
132
+ # import pdb; pdb.set_trace()
133
+
134
+ # prepare context
135
+ if past_key_values is not None:
136
+ inputs_embeds = x
137
+ position_ids = position_ids[:, -l:]
138
+ else:
139
+ if inputs_embeds is None:
140
+ input_ids = input_ids.to(self.device)
141
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
142
+ inputs_embeds = torch.cat([inputs_embeds, x], dim=1)
143
+
144
+ return dict(inputs_embeds=inputs_embeds,
145
+ attention_mask=attention_mask,
146
+ position_ids=position_ids,
147
+ past_key_values=past_key_values)
148
+
149
+ def extract_visual_feature(self, x, mask=None, detach=False):
150
+ b, m, n, _ = x.shape
151
+ x = x.view(b, m*n, -1)
152
+ # x: b mn c
153
+ if mask is None:
154
+ mask = torch.zeros_like(x[..., 0])
155
+ null_embeds = self.mar.fake_latent.expand(x.shape[0], -1)
156
+ x_enc = self.mar.forward_mae_encoder(x, mask, null_embeds, image_shape=(m, n))
157
+
158
+ z_enc = self.proj_in(x_enc)
159
+ # Move buffers to the end of the image sequence
160
+ z_enc = torch.cat([
161
+ z_enc[:, self.mar.buffer_size:],
162
+ z_enc[:, :self.mar.buffer_size]], dim=1)
163
+
164
+ if detach:
165
+ x_enc = x_enc.detach()
166
+ z_enc = z_enc.detach()
167
+
168
+ return x_enc, z_enc
169
+
170
+ def vae_latent_to_decoder_feature(self, z_src_latent):
171
+ """
172
+ Returns:
173
+ x_con [B, buf_sz + m*n, enc_dim] for the MAE decoder
174
+ z_src_buf [B, buf_sz + m*n, llm_dim] to scatter into <image> tokens
175
+ """
176
+ B, m, n, token_dim = z_src_latent.shape
177
+ num_patches = m * n
178
+ enc_dim = self.mar.encoder_embed_dim # e.g. 1280
179
+ llm_dim = self.llm.config.hidden_size # e.g. 1536
180
+ buf_sz = self.mar.buffer_size # e.g. 64
181
+
182
+ # 1) flatten patches → [B,4096,token_dim]
183
+ patch_tokens = z_src_latent.view(B, num_patches, token_dim)
184
+
185
+ # 2) project to encoder dim → [B,4096,enc_dim]
186
+ z_enc = self.mar.z_proj(patch_tokens)
187
+ z_enc = self.mar.z_proj_ln(z_enc)
188
+
189
+ # (optional) add encoder pos embed for image part only
190
+ full_pos = self.mar.get_encoder_pos_embed(h=m, w=n) # [1, buf_sz+4096, enc_dim]
191
+ pos_img = full_pos[:, buf_sz:] # [1,4096,enc_dim]
192
+ z_enc = z_enc + pos_img
193
+
194
+ # 3) build x_con for MAE decoder: **one** buffer pad + image tokens
195
+ buf_enc = torch.zeros(B, buf_sz, enc_dim,
196
+ device=z_enc.device, dtype=z_enc.dtype)
197
+ x_con = torch.cat([buf_enc, z_enc], dim=1) # [B,4160,enc_dim]
198
+
199
+ # 4) build z_src_buf for LLM: **project the exact same** x_con, then rotate buffer→end
200
+ z_proj_llm = self.proj_in(x_con) # [B,4160,llm_dim]
201
+ # rotate: take image portion then buffer portion
202
+ z_src_buf = torch.cat([
203
+ z_proj_llm[:, buf_sz:], # [B,4096,llm_dim]
204
+ z_proj_llm[:, :buf_sz] # [B, 64,llm_dim]
205
+ ], dim=1) # [B,4160,llm_dim]
206
+
207
+ return x_con, z_src_buf
208
+
209
+ def forward_mae_encoder(self, x, mask, detach=False, **context):
210
+ b, m, n, _ = x.shape
211
+ x_enc, z_enc = self.extract_visual_feature(x, mask=mask, detach=detach)
212
+ inputs = self.prepare_forward_input(x=z_enc, **context)
213
+ output = self.llm_model(**inputs, return_dict=True)
214
+
215
+ z_llm = output.last_hidden_state[:, -z_enc.shape[1]:]
216
+
217
+ # move buffers back to the start of the image sequence
218
+ z_llm = torch.cat([
219
+ z_llm[:, -self.mar.buffer_size:],
220
+ z_llm[:, :-self.mar.buffer_size]], dim=1)
221
+
222
+ # residual learning
223
+ x_enc = x_enc + self.proj_out(z_llm)
224
+
225
+ return x_enc
226
+
227
+ @staticmethod
228
+ def curtail_cache(past_key_values, cur_len):
229
+ for past_key_values_ in past_key_values:
230
+ keys, values = past_key_values_
231
+ keys.data = keys.data[:, :, :cur_len]
232
+ values.data = values.data[:, :, :cur_len]
233
+
234
+ @torch.no_grad()
235
+ def prepare_text_conditions(self, prompt, cfg_prompt='Generate an image.'):
236
+ all_prompts = [self.prompt_template['INSTRUCTION'].format(input=prompt),
237
+ self.prompt_template['INSTRUCTION'].format(input=cfg_prompt)]
238
+
239
+
240
+ input_ids = [self.tokenizer.encode(p, add_special_tokens=True, return_tensors='pt')[0]
241
+ for p in all_prompts]
242
+ valid_lens = [len(input_ids_) for input_ids_ in input_ids]
243
+ input_ids = pad_sequence(input_ids, batch_first=True,
244
+ padding_value=self.tokenizer.eos_token_id)
245
+ attention_mask = torch.zeros_like(input_ids).bool()
246
+ for i in range(len(input_ids)):
247
+ attention_mask[i, :valid_lens[i]] = True
248
+
249
+ return dict(input_ids=input_ids.to(self.device),
250
+ attention_mask=attention_mask.to(self.device))
251
+
252
+ @torch.no_grad()
253
+ def sample(self,
254
+ input_ids=None, inputs_embeds=None,
255
+ attention_mask=None, num_iter=64, cfg=1.0, cfg_schedule="constant", temperature=1.0,
256
+ progress=False, mask=None, past_key_values=None, image_shape=None, x_con=None, **kwargs):
257
+ if inputs_embeds is None and input_ids is not None:
258
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
259
+
260
+ bsz = attention_mask.shape[0]
261
+ if cfg != 1.0:
262
+ assert bsz % 2 == 0
263
+
264
+ if image_shape is None:
265
+ m = n = int(self.gen_seq_len ** 0.5)
266
+ else:
267
+ m, n = image_shape
268
+
269
+ if mask is None:
270
+ mask = torch.ones(bsz, m*n, device=self.device, dtype=self.dtype)
271
+ else:
272
+ mask = mask.view(bsz, m*n)
273
+ tokens = torch.zeros(bsz, m*n, self.token_embed_dim,
274
+ device=self.device, dtype=self.dtype)
275
+ orders = self.mar.sample_orders(bsz, seq_len=m*n)
276
+ if cfg != 1.0:
277
+ orders[bsz//2:] = orders[:bsz//2]
278
+
279
+ indices = list(range(num_iter))
280
+ if progress:
281
+ indices = tqdm(indices)
282
+
283
+ # past key values can be prepared outside (usually in multi-turn editing)
284
+ if past_key_values is None:
285
+ output = self.llm_model(inputs_embeds=inputs_embeds,
286
+ attention_mask=None,
287
+ position_ids=None,
288
+ past_key_values=DynamicCache.from_legacy_cache(),
289
+ return_dict=True,
290
+ use_cache=True)
291
+ past_key_values = output.past_key_values
292
+
293
+ # generate latents
294
+ for step in indices:
295
+ cur_tokens = tokens.clone()
296
+ x_enc = self.forward_mae_encoder(tokens.view(bsz, m, n, -1),
297
+ mask.to(self.dtype),
298
+ past_key_values=past_key_values,
299
+ # inputs_embeds=inputs_embeds,
300
+ attention_mask=attention_mask)
301
+ # import pdb; pdb.set_trace()
302
+ self.curtail_cache(past_key_values, inputs_embeds.shape[1])
303
+ # import pdb; pdb.set_trace()
304
+
305
+ z = self.mar.forward_mae_decoder(x_enc, mask.to(self.dtype), image_shape=(m, n), x_con=x_con)
306
+
307
+ # mask ratio for the next round, following MaskGIT and MAGE.
308
+ mask_ratio = np.cos(math.pi / 2. * (step + 1) / num_iter)
309
+ mask_len = torch.Tensor([np.floor(m*n * mask_ratio)]).to(self.device)
310
+
311
+ # masks out at least one for the next iteration
312
+ mask_len = torch.maximum(torch.Tensor([1]).to(self.device),
313
+ torch.minimum(torch.sum(mask, dim=-1, keepdims=True) - 1, mask_len))
314
+
315
+ # get masking for next iteration and locations to be predicted in this iteration
316
+ mask_next = mask_by_order(mask_len[0], orders, bsz, m*n).to(self.device)
317
+ if cfg != 1.0:
318
+ mask_next[bsz//2:] = mask_next[:bsz//2]
319
+ if step >= num_iter - 1:
320
+ mask_to_pred = mask[:bsz].bool()
321
+ else:
322
+ mask_to_pred = torch.logical_xor(mask[:bsz].bool(), mask_next.bool())
323
+ mask = mask_next
324
+ # if not cfg == 1.0:
325
+ # mask_to_pred = torch.cat([mask_to_pred, mask_to_pred], dim=0)
326
+
327
+ # sample token latents for this step
328
+ z = z[mask_to_pred.nonzero(as_tuple=True)]
329
+ # cfg schedule follow Muse
330
+ if cfg_schedule == "linear":
331
+ cfg_iter = 1 + (cfg - 1) * (m*n - mask_len[0]) / (m*n)
332
+ elif cfg_schedule == "constant":
333
+ cfg_iter = cfg
334
+ else:
335
+ raise NotImplementedError
336
+ sampled_token_latent = self.mar.diffloss.sample(z, temperature, cfg_iter).to(self.dtype)
337
+ # if not cfg == 1.0:
338
+ # sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0) # Remove null class samples
339
+ # mask_to_pred, _ = mask_to_pred.chunk(2, dim=0)
340
+
341
+ cur_tokens[mask_to_pred.nonzero(as_tuple=True)] = sampled_token_latent
342
+ if cfg != 1.0:
343
+ cur_tokens[bsz//2:] = cur_tokens[:bsz//2]
344
+ tokens = cur_tokens.clone()
345
+
346
+ pred = self.decode(tokens.view(bsz, m, n, -1))
347
+
348
+ if cfg != 1.0:
349
+ pred = pred[:bsz//2]
350
+ return pred
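
The sampling loop above follows the MaskGIT/MAGE cosine schedule: after step s of num_iter, roughly cos(pi/2 * (s+1)/num_iter) of the m*n positions stay masked for the next round. A standalone sketch of how many tokens get predicted per step; the 16x16 grid size is an illustrative assumption, no model is involved.

import math
import numpy as np

num_iter, seq_len = 8, 256   # e.g. a 16x16 latent grid
masked = seq_len             # everything starts masked

for step in range(num_iter):
    mask_ratio = np.cos(math.pi / 2.0 * (step + 1) / num_iter)
    # clamp as in sample(): keep at least 1 masked and reveal at least 1 per step
    mask_len = max(1, min(masked - 1, int(np.floor(seq_len * mask_ratio))))
    if step == num_iter - 1:
        predicted, masked = masked, 0        # final step predicts everything left
    else:
        predicted, masked = masked - mask_len, mask_len
    print(f"step {step}: predict {predicted:3d} tokens, {masked:3d} still masked")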
src/models/skywork_unipic_siglip.py ADDED
@@ -0,0 +1,342 @@
1
+ import torch
2
+ import math
3
+ import numpy as np
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+ from transformers.cache_utils import DynamicCache
7
+ from src.builder import BUILDER
8
+ from tqdm import tqdm
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+
12
+ def build_mlp(hidden_size, projector_dim, z_dim):
13
+ return nn.Sequential(
14
+ nn.Linear(hidden_size, projector_dim),
15
+ nn.SiLU(),
16
+ nn.Linear(projector_dim, z_dim),
17
+ )
18
+
19
+
20
+ def mask_by_order(mask_len, order, bsz, seq_len):
21
+ masking = torch.zeros(bsz, seq_len, device=order.device)
22
+ masking = torch.scatter(
23
+ masking,
24
+ dim=-1,
25
+ index=order[:, : mask_len.long()],
26
+ src=torch.ones(bsz, seq_len, device=order.device),
27
+ ).bool()
28
+ return masking
29
+
30
+
31
+ class SkyworkUnipic(nn.Module):
32
+ def __init__(self, vae, vae_scale, llm, mar, tokenizer, prompt_template, siglip2):
33
+ super().__init__()
34
+ # VAE
35
+ self.vae = BUILDER.build(vae)
36
+ self.vae.requires_grad_(False)
37
+ self.vae_scale = vae_scale
38
+
39
+ # LLM
40
+ self.llm = BUILDER.build(llm)
41
+ self.tokenizer = BUILDER.build(tokenizer)
42
+ self.tokenizer.add_tokens(["<image>"], special_tokens=True)
43
+ self.image_token_idx = self.tokenizer.convert_tokens_to_ids("<image>")
44
+
45
+ self.prompt_template = prompt_template
46
+
47
+ # MAR
48
+ self.mar = BUILDER.build(mar)
49
+ # projection layers
50
+ self.proj_in = build_mlp(
51
+ hidden_size=self.mar.encoder_embed_dim,
52
+ projector_dim=self.llm.config.hidden_size,
53
+ z_dim=self.llm.config.hidden_size,
54
+ )
55
+ self.proj_out = build_mlp(
56
+ hidden_size=self.llm.config.hidden_size,
57
+ projector_dim=self.llm.config.hidden_size,
58
+ z_dim=self.mar.encoder_embed_dim,
59
+ )
60
+
61
+ # siglip
62
+ self.siglip2 = BUILDER.build(siglip2)
63
+ self.siglip2_proj = build_mlp(
64
+ hidden_size=1152,
65
+ projector_dim=self.llm.config.hidden_size,
66
+ z_dim=self.llm.config.hidden_size,
67
+ )
68
+
69
+ @property
70
+ def llm_model(self):
71
+ return self.llm.model
72
+
73
+ @property
74
+ def device(self):
75
+ return self.llm.device
76
+
77
+ @property
78
+ def dtype(self):
79
+ return self.llm.dtype
80
+
81
+ @property
82
+ def gen_seq_len(self):
83
+ return self.mar.seq_len
84
+
85
+ @property
86
+ def token_embed_dim(self):
87
+ return self.vae.embed_dim * (self.mar.patch_size**2)
88
+
89
+ @torch.no_grad()
90
+ def encode(self, x):
91
+ posterior = self.vae.encode(x)
92
+ z = posterior.mode().mul_(self.vae_scale)
93
+ z = rearrange(
94
+ z,
95
+ "b c (m p) (n q) -> b m n (c p q)",
96
+ p=self.mar.patch_size,
97
+ q=self.mar.patch_size,
98
+ )
99
+
100
+ return z
101
+
102
+ @torch.no_grad()
103
+ def decode(self, z):
104
+ z /= self.vae_scale
105
+ z = rearrange(
106
+ z,
107
+ "b m n (c p q) -> b c (m p) (n q)",
108
+ p=self.mar.patch_size,
109
+ q=self.mar.patch_size,
110
+ )
111
+
112
+ x = self.vae.decode(z)
113
+ return x
114
+
115
+ def prepare_forward_input(
116
+ self,
117
+ x,
118
+ inputs_embeds=None,
119
+ input_ids=None,
120
+ attention_mask=None,
121
+ past_key_values=None,
122
+ ):
123
+ b, l, _ = x.shape
124
+ attention_mask = attention_mask.to(device=self.device, dtype=torch.bool)
125
+ attention_mask = torch.cat(
126
+ [attention_mask, attention_mask.new_ones(b, l)], dim=1
127
+ )
128
+ position_ids = torch.cumsum(attention_mask, dim=1) - 1
129
+ position_ids[position_ids < 0] = 0
130
+
131
+ # import pdb; pdb.set_trace()
132
+
133
+ # prepare context
134
+ if past_key_values is not None:
135
+ inputs_embeds = x
136
+ position_ids = position_ids[:, -l:]
137
+ else:
138
+ if inputs_embeds is None:
139
+ input_ids = input_ids.to(self.device)
140
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
141
+ inputs_embeds = torch.cat([inputs_embeds, x], dim=1)
142
+
143
+ return dict(
144
+ inputs_embeds=inputs_embeds,
145
+ attention_mask=attention_mask,
146
+ position_ids=position_ids,
147
+ past_key_values=past_key_values,
148
+ )
149
+
150
+ def extract_visual_feature(self, x, mask=None, detach=False):
151
+ b, m, n, _ = x.shape
152
+ x = x.view(b, m * n, -1)
153
+ # x: b mn c
154
+ if mask is None:
155
+ mask = torch.zeros_like(x[..., 0])
156
+ null_embeds = self.mar.fake_latent.expand(x.shape[0], -1)
157
+ x_enc = self.mar.forward_mae_encoder(x, mask, null_embeds, image_shape=(m, n))
158
+
159
+ z_enc = self.proj_in(x_enc)
160
+ # Move buffers to the end of the image sequence
161
+ z_enc = torch.cat(
162
+ [z_enc[:, self.mar.buffer_size :], z_enc[:, : self.mar.buffer_size]], dim=1
163
+ )
164
+
165
+ if detach:
166
+ x_enc = x_enc.detach()
167
+ z_enc = z_enc.detach()
168
+
169
+ return x_enc, z_enc
170
+
171
+ def forward_mae_encoder(self, x, mask, detach=False, **context):
172
+ b, m, n, _ = x.shape
173
+ x_enc, z_enc = self.extract_visual_feature(x, mask=mask, detach=detach)
174
+ inputs = self.prepare_forward_input(x=z_enc, **context)
175
+ output = self.llm_model(**inputs, return_dict=True)
176
+
177
+ z_llm = output.last_hidden_state[:, -z_enc.shape[1] :]
178
+
179
+ # move buffers back to the start of the image sequence
180
+ z_llm = torch.cat(
181
+ [z_llm[:, -self.mar.buffer_size :], z_llm[:, : -self.mar.buffer_size]],
182
+ dim=1,
183
+ )
184
+
185
+ # residual learning
186
+ x_enc = x_enc + self.proj_out(z_llm)
187
+
188
+ return x_enc
189
+
190
+ @staticmethod
191
+ def curtail_cache(past_key_values, cur_len):
192
+ for past_key_values_ in past_key_values:
193
+ keys, values = past_key_values_
194
+ keys.data = keys.data[:, :, :cur_len]
195
+ values.data = values.data[:, :, :cur_len]
196
+
197
+ @torch.no_grad()
198
+ def prepare_text_conditions(self, prompt, cfg_prompt="Generate an image."):
199
+ all_prompts = [
200
+ self.prompt_template["INSTRUCTION"].format(input=prompt),
201
+ self.prompt_template["INSTRUCTION"].format(input=cfg_prompt),
202
+ ]
203
+
204
+ input_ids = [
205
+ self.tokenizer.encode(p, add_special_tokens=True, return_tensors="pt")[0]
206
+ for p in all_prompts
207
+ ]
208
+ valid_lens = [len(input_ids_) for input_ids_ in input_ids]
209
+ input_ids = pad_sequence(
210
+ input_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id
211
+ )
212
+ attention_mask = torch.zeros_like(input_ids).bool()
213
+ for i in range(len(input_ids)):
214
+ attention_mask[i, : valid_lens[i]] = True
215
+
216
+ return dict(
217
+ input_ids=input_ids.to(self.device),
218
+ attention_mask=attention_mask.to(self.device),
219
+ )
220
+
221
+ @torch.no_grad()
222
+ def sample(
223
+ self,
224
+ input_ids=None,
225
+ inputs_embeds=None,
226
+ attention_mask=None,
227
+ num_iter=64,
228
+ cfg=1.0,
229
+ cfg_schedule="constant",
230
+ temperature=1.0,
231
+ progress=False,
232
+ mask=None,
233
+ past_key_values=None,
234
+ image_shape=None,
235
+ x_con=None,
236
+ **kwargs,
237
+ ):
238
+ if inputs_embeds is None and input_ids is not None:
239
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
240
+
241
+ bsz = attention_mask.shape[0]
242
+ if cfg != 1.0:
243
+ assert bsz % 2 == 0
244
+
245
+ if image_shape is None:
246
+ m = n = int(self.gen_seq_len**0.5)
247
+ else:
248
+ m, n = image_shape
249
+
250
+ if mask is None:
251
+ mask = torch.ones(bsz, m * n, device=self.device, dtype=self.dtype)
252
+ else:
253
+ mask = mask.view(bsz, m * n)
254
+ tokens = torch.zeros(
255
+ bsz, m * n, self.token_embed_dim, device=self.device, dtype=self.dtype
256
+ )
257
+ orders = self.mar.sample_orders(bsz, seq_len=m * n)
258
+ if cfg != 1.0:
259
+ orders[bsz // 2 :] = orders[: bsz // 2]
260
+
261
+ indices = list(range(num_iter))
262
+ if progress:
263
+ indices = tqdm(indices)
264
+
265
+ # past key values can be prepared outside (usually in multi-turn editing)
266
+ if past_key_values is None:
267
+ output = self.llm_model(
268
+ inputs_embeds=inputs_embeds,
269
+ attention_mask=None,
270
+ position_ids=None,
271
+ past_key_values=DynamicCache.from_legacy_cache(),
272
+ return_dict=True,
273
+ use_cache=True,
274
+ )
275
+ past_key_values = output.past_key_values
276
+
277
+ # generate latents
278
+ for step in indices:
279
+ cur_tokens = tokens.clone()
280
+ x_enc = self.forward_mae_encoder(
281
+ tokens.view(bsz, m, n, -1),
282
+ mask.to(self.dtype),
283
+ past_key_values=past_key_values,
284
+ # inputs_embeds=inputs_embeds,
285
+ attention_mask=attention_mask,
286
+ )
287
+ # import pdb; pdb.set_trace()
288
+ self.curtail_cache(past_key_values, inputs_embeds.shape[1])
289
+ # import pdb; pdb.set_trace()
290
+
291
+ z = self.mar.forward_mae_decoder(
292
+ x_enc, mask.to(self.dtype), image_shape=(m, n), x_con=x_con
293
+ )
294
+
295
+ # mask ratio for the next round, following MaskGIT and MAGE.
296
+ mask_ratio = np.cos(math.pi / 2.0 * (step + 1) / num_iter)
297
+ mask_len = torch.Tensor([np.floor(m * n * mask_ratio)]).to(self.device)
298
+
299
+ # masks out at least one for the next iteration
300
+ mask_len = torch.maximum(
301
+ torch.Tensor([1]).to(self.device),
302
+ torch.minimum(torch.sum(mask, dim=-1, keepdims=True) - 1, mask_len),
303
+ )
304
+
305
+ # get masking for next iteration and locations to be predicted in this iteration
306
+ mask_next = mask_by_order(mask_len[0], orders, bsz, m * n).to(self.device)
307
+ if cfg != 1.0:
308
+ mask_next[bsz // 2 :] = mask_next[: bsz // 2]
309
+ if step >= num_iter - 1:
310
+ mask_to_pred = mask[:bsz].bool()
311
+ else:
312
+ mask_to_pred = torch.logical_xor(mask[:bsz].bool(), mask_next.bool())
313
+ mask = mask_next
314
+ # if not cfg == 1.0:
315
+ # mask_to_pred = torch.cat([mask_to_pred, mask_to_pred], dim=0)
316
+
317
+ # sample token latents for this step
318
+ z = z[mask_to_pred.nonzero(as_tuple=True)]
319
+ # cfg schedule follow Muse
320
+ if cfg_schedule == "linear":
321
+ cfg_iter = 1 + (cfg - 1) * (m * n - mask_len[0]) / (m * n)
322
+ elif cfg_schedule == "constant":
323
+ cfg_iter = cfg
324
+ else:
325
+ raise NotImplementedError
326
+ sampled_token_latent = self.mar.diffloss.sample(
327
+ z, temperature, cfg_iter
328
+ ).to(self.dtype)
329
+ # if not cfg == 1.0:
330
+ # sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0) # Remove null class samples
331
+ # mask_to_pred, _ = mask_to_pred.chunk(2, dim=0)
332
+
333
+ cur_tokens[mask_to_pred.nonzero(as_tuple=True)] = sampled_token_latent
334
+ if cfg != 1.0:
335
+ cur_tokens[bsz // 2 :] = cur_tokens[: bsz // 2]
336
+ tokens = cur_tokens.clone()
337
+
338
+ pred = self.decode(tokens.view(bsz, m, n, -1))
339
+
340
+ if cfg != 1.0:
341
+ pred = pred[: bsz // 2]
342
+ return pred
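
Both extract_visual_feature and forward_mae_encoder rotate the MAR buffer tokens: the buffer is moved to the end of the sequence before the LLM pass and back to the front afterwards. A tiny standalone check that the two rotations are inverses; the sizes are illustrative only.

import torch

buffer_size, seq = 4, 12
z = torch.arange(seq).view(1, seq, 1)  # stand-in for a [B, buffer+tokens, C] sequence

# move the buffer tokens to the end (as in extract_visual_feature)
rotated = torch.cat([z[:, buffer_size:], z[:, :buffer_size]], dim=1)

# move them back to the front (as in forward_mae_encoder)
restored = torch.cat([rotated[:, -buffer_size:], rotated[:, :-buffer_size]], dim=1)

print(torch.equal(z, restored))  # True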
src/optimisers/constructor.py ADDED
@@ -0,0 +1,64 @@
1
+ import inspect
2
+ import torch.nn as nn
3
+ from typing import List, Optional, Union
4
+ from mmengine.optim import DefaultOptimWrapperConstructor, OptimWrapper
5
+ from mmengine.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS,
6
+ OPTIMIZERS)
7
+
8
+
9
+ def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
10
+ decay = []
11
+ no_decay = []
12
+ for name, param in model.named_parameters():
13
+ if not param.requires_grad:
14
+ continue # frozen weights
15
+ if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list or 'diffloss' in name:
16
+ no_decay.append(param) # no weight decay on bias, norm and diffloss
17
+ else:
18
+ decay.append(param)
19
+
20
+ num_decay_params = sum(p.numel() for p in decay)
21
+ num_nodecay_params = sum(p.numel() for p in no_decay)
22
+ print(f"num decayed parameter tensors: {len(decay)}, with {num_decay_params:,} parameters")
23
+ print(f"num non-decayed parameter tensors: {len(no_decay)}, with {num_nodecay_params:,} parameters")
24
+
25
+ return [
26
+ {'params': no_decay, 'weight_decay': 0.},
27
+ {'params': decay, 'weight_decay': weight_decay}]
28
+
29
+
30
+ class MAROptimWrapperConstructor(DefaultOptimWrapperConstructor):
31
+ def __call__(self, model: nn.Module) -> OptimWrapper:
32
+ if hasattr(model, 'module'):
33
+ model = model.module
34
+
35
+ optim_wrapper_cfg = self.optim_wrapper_cfg.copy()
36
+ optim_wrapper_cfg.setdefault('type', 'OptimWrapper')
37
+ optimizer_cfg = self.optimizer_cfg.copy()
38
+ optimizer_cls = self.optimizer_cfg['type']
39
+ # Optimizer like HybridAdam in colossalai requires the argument name
40
+ # `model_params` rather than `params`. Here we get the first argument
41
+ # name and fill it with the model parameters.
42
+ if isinstance(optimizer_cls, str):
43
+ with OPTIMIZERS.switch_scope_and_registry(None) as registry:
44
+ optimizer_cls = registry.get(self.optimizer_cfg['type'])
45
+ first_arg_name = next(
46
+ iter(inspect.signature(optimizer_cls).parameters))
47
+ # import pdb; pdb.set_trace()
48
+ param_groups = add_weight_decay(model, optimizer_cfg.pop('weight_decay', 0))
49
+ optimizer_cfg[first_arg_name] = param_groups
50
+ optimizer = OPTIMIZERS.build(optimizer_cfg)
51
+
52
+ # # if no paramwise option is specified, just use the global setting
53
+ # if not self.paramwise_cfg:
54
+ # optimizer_cfg[first_arg_name] = model.parameters()
55
+ # optimizer = OPTIMIZERS.build(optimizer_cfg)
56
+ # else:
57
+ # # set param-wise lr and weight decay recursively
58
+ # params: List = []
59
+ # self.add_params(params, model)
60
+ # optimizer_cfg[first_arg_name] = params
61
+ # optimizer = OPTIMIZERS.build(optimizer_cfg)
62
+ optim_wrapper = OPTIM_WRAPPERS.build(
63
+ optim_wrapper_cfg, default_args=dict(optimizer=optimizer))
64
+ return optim_wrapper
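
add_weight_decay above puts biases, 1-D parameters, and anything whose name contains 'diffloss' into a zero-decay group and everything else into the decayed group. A usage sketch on a toy module, assuming the function is importable from this file; the toy architecture is illustrative only.

import torch
import torch.nn as nn

toy = nn.Sequential(nn.Linear(8, 16), nn.LayerNorm(16), nn.Linear(16, 4))

param_groups = add_weight_decay(toy, weight_decay=0.05)
optimizer = torch.optim.AdamW(param_groups, lr=1e-4)

# group 0: 4 tensors (two biases + LayerNorm weight/bias), weight_decay=0.0
# group 1: 2 tensors (the 2-D Linear weights), weight_decay=0.05
print([len(g['params']) for g in param_groups], [g['weight_decay'] for g in param_groups])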
src/optimisers/custom_adamw.py ADDED
@@ -0,0 +1,32 @@
+ import inspect
+ from torch.optim import AdamW
+
+
+ class CustomAdamW(AdamW):
+     def __init__(self, params, weight_decay, *args, **kwargs):
+         if isinstance(params, dict):
+             params = [p for p in params.values() if p.requires_grad]
+         else:
+             params = [p for p in params if p.requires_grad]
+
+         # Create optim groups. Any parameter that is 2-D will be weight decayed,
+         # otherwise not; i.e. all weight tensors in matmuls + embeddings decay,
+         # all biases and layernorms don't.
+         decay_params = [p for p in params if p.dim() >= 2]
+         nodecay_params = [p for p in params if p.dim() < 2]
+         optim_groups = [
+             {'params': decay_params, 'weight_decay': weight_decay},
+             {'params': nodecay_params, 'weight_decay': 0.0}
+         ]
+         num_decay_params = sum(p.numel() for p in decay_params)
+         num_nodecay_params = sum(p.numel() for p in nodecay_params)
+         print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+         print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+
+         # Create the AdamW optimizer, optionally using the fused version if available.
+         # fused_available = 'fused' in inspect.signature(AdamW).parameters
+         # extra_args = dict(fused=True) if fused_available else dict()
+         # print(f"using fused AdamW: {fused_available}")
+         # kwargs.update(extra_args)
+
+         # Pass the groups positionally so extra positional args (e.g. lr) do not
+         # collide with the `params` argument of AdamW.__init__.
+         super().__init__(optim_groups, *args, **kwargs)
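A minimal usage sketch of `CustomAdamW` (the model and hyperparameters are illustrative, and the import path is an assumption based on this commit's layout): callers pass raw parameters rather than pre-built groups, since the class does the grouping itself.

import torch.nn as nn
# from src.optimisers.custom_adamw import CustomAdamW  # assumed path

model = nn.Sequential(nn.Linear(16, 32), nn.LayerNorm(32), nn.Linear(32, 4))
optimizer = CustomAdamW(model.parameters(), weight_decay=0.02, lr=1e-4, betas=(0.9, 0.95))
# The two Linear weight matrices land in the decayed group; the biases and the
# LayerNorm affine parameters land in the zero-decay group.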
src/runners/custom_runner.py ADDED
@@ -0,0 +1,202 @@
+ import copy
+ import logging
+ import inspect
+
+ from torch.utils.data import DataLoader
+ from functools import partial
+ from typing import Callable, Dict, List, Optional, Union
+
+ from mmengine.logging import print_log
+ from mmengine.dist import get_rank
+ from mmengine.dataset import worker_init_fn as default_worker_init_fn
+ from mmengine.utils import digit_version
+ from mmengine.utils.dl_utils import TORCH_VERSION
+ from mmengine.runner import FlexibleRunner
+ from mmengine.registry import (
+     DATA_SAMPLERS,
+     DATASETS,
+     FUNCTIONS,
+ )
+ from xtuner.registry import BUILDER
+
+
+ def clean_concatdataset_fields(cfg):
+     """Recursively strip invalid fields (e.g. ``image_size``) from every
+     ConcatDataset config."""
+     if isinstance(cfg, dict):
+         # If this level is a ConcatDataset config, drop the invalid fields.
+         if cfg.get('type') == "ConcatDataset":
+             for key in ['image_size']:
+                 if key in cfg:
+                     del cfg[key]
+
+         # Recurse into child fields.
+         for k, v in cfg.items():
+             clean_concatdataset_fields(v)
+
+     elif isinstance(cfg, list):
+         for item in cfg:
+             clean_concatdataset_fields(item)
+
+     return cfg
+
+
+ class CustomRunner(FlexibleRunner):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     @staticmethod
+     def build_dataloader(
+         dataloader: Union[DataLoader, Dict],
+         seed: Optional[int] = None,
+         diff_rank_seed: bool = False,
+     ) -> DataLoader:
+         """Build dataloader.
+
+         The method builds three components:
+
+         - Dataset
+         - Sampler
+         - Dataloader
+
+         An example of ``dataloader``::
+
+             dataloader = dict(
+                 dataset=dict(type='ToyDataset'),
+                 sampler=dict(type='DefaultSampler', shuffle=True),
+                 batch_size=1,
+                 num_workers=9
+             )
+
+         Args:
+             dataloader (DataLoader or dict): A Dataloader object or a dict
+                 used to build a Dataloader object. If ``dataloader`` is a
+                 Dataloader object, it is returned as-is.
+             seed (int, optional): Random seed. Defaults to None.
+             diff_rank_seed (bool): Whether to set different seeds for
+                 different ranks. If True, the seed passed to the sampler is
+                 set to None, in order to synchronize the seeds used in
+                 samplers across different ranks. Defaults to False.
+
+         Returns:
+             Dataloader: DataLoader built from ``dataloader``.
+         """
+         if isinstance(dataloader, DataLoader):
+             return dataloader
+
+         dataloader_cfg = copy.deepcopy(dataloader)
+
+         clean_concatdataset_fields(dataloader_cfg)
+
+         # build dataset
+         dataset_cfg = dataloader_cfg.pop('dataset')
+         if isinstance(dataset_cfg, dict):
+             dataset = DATASETS.build(dataset_cfg)
+             if hasattr(dataset, 'full_init'):
+                 dataset.full_init()
+         else:
+             # fallback to raise error in dataloader
+             # if `dataset_cfg` is not a valid type
+             dataset = dataset_cfg
+
+         # build sampler
+         sampler_cfg = dataloader_cfg.pop('sampler')
+         if isinstance(sampler_cfg, dict):
+             sampler_seed = None if diff_rank_seed else seed
+             sampler = DATA_SAMPLERS.build(
+                 sampler_cfg,
+                 default_args=dict(dataset=dataset, seed=sampler_seed))
+         else:
+             # fallback to raise error in dataloader
+             # if `sampler_cfg` is not a valid type
+             sampler = sampler_cfg
+
+         # build batch sampler
+         batch_sampler_cfg = dataloader_cfg.pop('batch_sampler', None)
+         if batch_sampler_cfg is None:
+             batch_sampler = None
+         elif isinstance(batch_sampler_cfg, dict):
+             batch_sampler = DATA_SAMPLERS.build(
+                 batch_sampler_cfg,
+                 default_args=dict(
+                     dataset=dataset,
+                     sampler=sampler,
+                     batch_size=dataloader_cfg.pop('batch_size')))
+         else:
+             # fallback to raise error in dataloader
+             # if `batch_sampler_cfg` is not a valid type
+             batch_sampler = batch_sampler_cfg
+
+         # build dataloader
+         init_fn: Optional[partial]
+         if 'worker_init_fn' in dataloader_cfg:
+             worker_init_fn_cfg = dataloader_cfg.pop('worker_init_fn')
+             worker_init_fn_type = worker_init_fn_cfg.pop('type')
+             worker_init_fn = FUNCTIONS.get(worker_init_fn_type)
+             assert callable(worker_init_fn)
+             init_fn = partial(worker_init_fn,
+                               **worker_init_fn_cfg)  # type: ignore
+         else:
+             if seed is not None:
+                 disable_subprocess_warning = dataloader_cfg.pop(
+                     'disable_subprocess_warning', False)
+                 assert isinstance(disable_subprocess_warning, bool), (
+                     'disable_subprocess_warning should be a bool, but got '
+                     f'{type(disable_subprocess_warning)}')
+                 init_fn = partial(
+                     default_worker_init_fn,
+                     num_workers=dataloader_cfg.get('num_workers'),
+                     rank=get_rank(),
+                     seed=seed,
+                     disable_subprocess_warning=disable_subprocess_warning)
+             else:
+                 init_fn = None
+
+         # `persistent_workers` requires pytorch version >= 1.7
+         if ('persistent_workers' in dataloader_cfg
+                 and digit_version(TORCH_VERSION) < digit_version('1.7.0')):
+             print_log(
+                 '`persistent_workers` is only available when '
+                 'pytorch version >= 1.7',
+                 logger='current',
+                 level=logging.WARNING)
+             dataloader_cfg.pop('persistent_workers')
+
+         # The default behavior of `collate_fn` in DataLoader is to merge a
+         # list of samples into a mini-batch of Tensor(s). However, in
+         # mmengine, if `collate_fn` is not defined in dataloader_cfg,
+         # `pseudo_collate` will only convert the list of samples into a dict
+         # without stacking the batch tensor.
+         collate_fn_cfg = dataloader_cfg.pop('collate_fn',
+                                             dict(type='pseudo_collate'))
+         if isinstance(collate_fn_cfg, dict):
+             collate_fn_type = collate_fn_cfg.pop('type')
+             if isinstance(collate_fn_type, str):
+                 collate_fn = FUNCTIONS.get(collate_fn_type)
+             elif inspect.isclass(collate_fn_type):
+                 # class-type collate_fn is instantiated via the BUILDER registry
+                 collate_fn_cfg['type'] = collate_fn_type
+                 collate_fn = BUILDER.build(collate_fn_cfg)
+             else:
+                 collate_fn = collate_fn_type
+             if not inspect.isclass(collate_fn_type):
+                 collate_fn = partial(collate_fn, **collate_fn_cfg)  # type: ignore
+         elif callable(collate_fn_cfg):
+             collate_fn = collate_fn_cfg
+         else:
+             raise TypeError(
+                 'collate_fn should be a dict or callable object, but got '
+                 f'{collate_fn_cfg}')
+         data_loader = DataLoader(
+             dataset=dataset,
+             sampler=sampler if batch_sampler is None else None,
+             batch_sampler=batch_sampler,
+             collate_fn=collate_fn,
+             worker_init_fn=init_fn,
+             **dataloader_cfg)
+
+         return data_loader
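The main deviation from mmengine's stock build_dataloader is the class branch for `collate_fn`: when `type` is a class object, the collate function is instantiated through xtuner's `BUILDER` registry instead of being looked up in `FUNCTIONS`. A hedged sketch of a dataloader config that exercises that branch; `MyCollate`, `ToyDataset` and the numeric values are hypothetical stand-ins, not names from this commit.

class MyCollate:  # hypothetical stand-in for a configurable collate class
    def __init__(self, pad_index=0):
        self.pad_index = pad_index

    def __call__(self, samples):
        return {'data': samples, 'data_samples': None}

train_dataloader = dict(
    dataset=dict(type='ToyDataset'),                     # placeholder dataset config
    sampler=dict(type='DefaultSampler', shuffle=True),
    collate_fn=dict(type=MyCollate, pad_index=151645),   # class object, not a string
    batch_size=4,
    num_workers=2,
)
# CustomRunner.build_dataloader(train_dataloader) would route this through
# BUILDER.build(...) because `type` is a class, then pass the instance to DataLoader.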