initial commit
- .gitignore +11 -0
- conr.py +292 -0
- data_loader.py +273 -0
- infer.sh +14 -0
- model/__init__.py +1 -0
- model/backbone.py +285 -0
- model/decoder_small.py +43 -0
- model/shader.py +290 -0
- model/warplayer.py +56 -0
- streamlit.py +52 -0
- train.py +229 -0
.gitignore
ADDED
@@ -0,0 +1,11 @@
results/
test_data/
test_data_pre/
weights/
x264/
*.mp3
*.mp4
*.txt
*.png
complex_infer.sh
__pycache__/
conr.py
ADDED
@@ -0,0 +1,292 @@
import os
import torch

from model.backbone import ResEncUnet
from model.shader import CINN
from model.decoder_small import RGBADecoderNet

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def UDPClip(x):
    return torch.clamp(x, min=0, max=1)  # NCHW


class CoNR():
    def __init__(self, args):
        self.args = args

        self.udpparsernet = ResEncUnet(
            backbone_name='resnet50_danbo',
            classes=4,
            pretrained=(args.local_rank == 0),
            parametric_upsampling=True,
            decoder_filters=(512, 384, 256, 128, 32),
            map_location=device
        )
        self.target_pose_encoder = ResEncUnet(
            backbone_name='resnet18_danbo-4',
            classes=1,
            pretrained=(args.local_rank == 0),
            parametric_upsampling=True,
            decoder_filters=(512, 384, 256, 128, 32),
            map_location=device
        )
        self.DIM_SHADER_REFERENCE = 4
        self.shader = CINN(self.DIM_SHADER_REFERENCE)
        self.rgbadecodernet = RGBADecoderNet()
        self.device()
        self.parser_ckpt = None

    def dist(self):
        args = self.args
        if args.distributed:
            self.udpparsernet = torch.nn.parallel.DistributedDataParallel(
                self.udpparsernet,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                broadcast_buffers=False,
                find_unused_parameters=True
            )
            self.target_pose_encoder = torch.nn.parallel.DistributedDataParallel(
                self.target_pose_encoder,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                broadcast_buffers=False,
                find_unused_parameters=True
            )
            self.shader = torch.nn.parallel.DistributedDataParallel(
                self.shader,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                broadcast_buffers=True
            )
            self.rgbadecodernet = torch.nn.parallel.DistributedDataParallel(
                self.rgbadecodernet,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                broadcast_buffers=True
            )

    def load_model(self, path):
        self.udpparsernet.load_state_dict(
            torch.load('{}/udpparsernet.pth'.format(path), map_location=device))
        self.target_pose_encoder.load_state_dict(
            torch.load('{}/target_pose_encoder.pth'.format(path), map_location=device))
        self.shader.load_state_dict(
            torch.load('{}/shader.pth'.format(path), map_location=device))
        self.rgbadecodernet.load_state_dict(
            torch.load('{}/rgbadecodernet.pth'.format(path), map_location=device))

    def save_model(self, ite_num):
        self._save_pth(self.udpparsernet,
                       model_name="udpparsernet", ite_num=ite_num)
        self._save_pth(self.target_pose_encoder,
                       model_name="target_pose_encoder", ite_num=ite_num)
        self._save_pth(self.shader,
                       model_name="shader", ite_num=ite_num)
        self._save_pth(self.rgbadecodernet,
                       model_name="rgbadecodernet", ite_num=ite_num)

    def _save_pth(self, net, model_name, ite_num):
        args = self.args
        to_save = None
        if args.distributed:
            if args.local_rank == 0:
                to_save = net.module.state_dict()
        else:
            to_save = net.state_dict()
        if to_save:
            model_dir = os.path.join(
                os.getcwd(), 'saved_models',
                args.model_name + os.sep + "checkpoints" + os.sep + "itr_%d" % (ite_num) + os.sep)
            os.makedirs(model_dir, exist_ok=True)
            torch.save(to_save, model_dir + model_name + ".pth")

    def train(self):
        self.udpparsernet.train()
        self.target_pose_encoder.train()
        self.shader.train()
        self.rgbadecodernet.train()

    def eval(self):
        self.udpparsernet.eval()
        self.target_pose_encoder.eval()
        self.shader.eval()
        self.rgbadecodernet.eval()

    def device(self):
        self.udpparsernet.to(device)
        self.target_pose_encoder.to(device)
        self.shader.to(device)
        self.rgbadecodernet.to(device)

    def data_norm_image(self, data):
        with torch.cuda.amp.autocast(enabled=False):
            for name in ["character_labels", "pose_label"]:
                if name in data:
                    data[name] = data[name].to(
                        device, non_blocking=True).float()
            for name in ["pose_images", "pose_mask", "character_images", "character_masks"]:
                if name in data:
                    data[name] = data[name].to(
                        device, non_blocking=True).float() / 255.0
            if "pose_images" in data:
                data["num_pose_images"] = data["pose_images"].shape[1]
                data["num_samples"] = data["pose_images"].shape[0]
            if "character_images" in data:
                data["num_character_images"] = data["character_images"].shape[1]
                data["num_samples"] = data["character_images"].shape[0]
            if "pose_images" in data and "character_images" in data:
                assert (data["pose_images"].shape[0] ==
                        data["character_images"].shape[0])
        return data

    def reset_charactersheet(self):
        self.parser_ckpt = None

    def model_step(self, data, training=False):
        self.eval()
        with torch.cuda.amp.autocast(enabled=False):
            pred = {}
            if self.parser_ckpt:
                pred["parser"] = self.parser_ckpt
            else:
                pred = self.character_parser_forward(data, pred)
                self.parser_ckpt = pred["parser"]
            pred = self.pose_parser_sc_forward(data, pred)
            pred = self.shader_pose_encoder_forward(data, pred)
            pred = self.shader_forward(data, pred)
        return pred

    def shader_forward(self, data, pred={}):
        assert ("num_character_images" in data), "ERROR: No Character Sheet input."

        character_images_rgb_nmchw, num_character_images = data[
            "character_images"], data["num_character_images"]
        # build x_reference_rgb_a_sudp in the draw call
        shader_character_a_nmchw = data["character_masks"]
        assert torch.any(torch.mean(shader_character_a_nmchw, (0, 2, 3, 4)) >= 0.95) == False, "ERROR: \
No transparent area found in the image, PLEASE separate the foreground of input character sheets.\
The website waifucutout.com is recommended to automatically cut out the foreground."

        if shader_character_a_nmchw is None:
            shader_character_a_nmchw = pred["parser"]["pred"][:, :, 3:4, :, :]
        x_reference_rgb_a = torch.cat([shader_character_a_nmchw[:, :, :, :, :] * character_images_rgb_nmchw[:, :, :, :, :],
                                       shader_character_a_nmchw[:, :, :, :, :],
                                       ], 2)
        assert (x_reference_rgb_a.shape[2] == self.DIM_SHADER_REFERENCE)
        # build x_reference_features in the draw call
        x_reference_features = pred["parser"]["features"]
        # run cinn shader
        retdic = self.shader(
            pred["shader"]["target_pose_features"], x_reference_rgb_a, x_reference_features)
        pred["shader"].update(retdic)

        # decode rgba
        if True:
            dec_out = self.rgbadecodernet(
                retdic["y_last_remote_features"])
            y_weighted_x_reference_RGB = dec_out[:, 0:3, :, :]
            y_weighted_mask_A = dec_out[:, 3:4, :, :]
            y_weighted_warp_decoded_rgba = torch.cat(
                (y_weighted_x_reference_RGB*y_weighted_mask_A, y_weighted_mask_A), dim=1
            )
            assert (y_weighted_warp_decoded_rgba.shape[1] == 4)
            assert (
                y_weighted_warp_decoded_rgba.shape[-1] == character_images_rgb_nmchw.shape[-1])
            # apply decoded mask to decoded rgb, finishing the draw call
            pred["shader"]["y_weighted_warp_decoded_rgba"] = y_weighted_warp_decoded_rgba
        return pred

    def character_parser_forward(self, data, pred={}):
        if not ("num_character_images" in data and "character_images" in data):
            return pred
        pred["parser"] = {"pred": None}  # create output

        inputs_rgb_nmchw, num_samples, num_character_images = data[
            "character_images"], data["num_samples"], data["num_character_images"]
        inputs_rgb_fchw = inputs_rgb_nmchw.view(
            (num_samples * num_character_images, inputs_rgb_nmchw.shape[2], inputs_rgb_nmchw.shape[3], inputs_rgb_nmchw.shape[4]))

        encoder_out, features = self.udpparsernet(
            (inputs_rgb_fchw-0.6)/0.2970)

        pred["parser"]["features"] = [features_out.view(
            (num_samples, num_character_images, features_out.shape[1], features_out.shape[2], features_out.shape[3])) for features_out in features]

        if (encoder_out is not None):
            pred["parser"]["pred"] = UDPClip(encoder_out.view(
                (num_samples, num_character_images, encoder_out.shape[1], encoder_out.shape[2], encoder_out.shape[3])))

        return pred

    def pose_parser_sc_forward(self, data, pred={}):
        if not ("num_pose_images" in data and "pose_images" in data):
            return pred
        inputs_aug_rgb_nmchw, num_samples, num_pose_images = data[
            "pose_images"], data["num_samples"], data["num_pose_images"]
        inputs_aug_rgb_fchw = inputs_aug_rgb_nmchw.view(
            (num_samples * num_pose_images, inputs_aug_rgb_nmchw.shape[2], inputs_aug_rgb_nmchw.shape[3], inputs_aug_rgb_nmchw.shape[4]))

        encoder_out, _ = self.udpparsernet(
            (inputs_aug_rgb_fchw-0.6)/0.2970)

        encoder_out = encoder_out.view(
            (num_samples, num_pose_images, encoder_out.shape[1], encoder_out.shape[2], encoder_out.shape[3]))

        # apply sigmoid after eval loss
        pred["pose_parser"] = {"pred": UDPClip(encoder_out)}

        return pred

    def shader_pose_encoder_forward(self, data, pred={}):
        pred["shader"] = {}  # create output
        if "pose_images" in data:
            pose_images_rgb_nmchw = data["pose_images"]
            target_gt_rgb = pose_images_rgb_nmchw[:, 0, :, :, :]
            pred["shader"]["target_gt_rgb"] = target_gt_rgb

        shader_target_a = None
        if "pose_mask" in data:
            pred["shader"]["target_gt_a"] = data["pose_mask"]
            shader_target_a = data["pose_mask"]

        shader_target_sudp = None
        if "pose_label" in data:
            shader_target_sudp = data["pose_label"][:, :3, :, :]

        if self.args.test_pose_use_parser_udp:
            shader_target_sudp = None
        if shader_target_sudp is None:
            shader_target_sudp = pred["pose_parser"]["pred"][:, 0:3, :, :]

        if shader_target_a is None:
            shader_target_a = pred["pose_parser"]["pred"][:, 3:4, :, :]

        # build x_target_sudp_a in the draw call
        x_target_sudp_a = torch.cat((
            shader_target_sudp*shader_target_a,
            shader_target_a
        ), 1)
        pred["shader"].update({
            "x_target_sudp_a": x_target_sudp_a
        })
        _, features = self.target_pose_encoder(
            (x_target_sudp_a-0.6)/0.2970, ret_parser_out=False)

        pred["shader"]["target_pose_features"] = features
        return pred
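A rough usage sketch for the wrapper above (not part of the commit): an args object needs only the attributes CoNR reads, the four checkpoints written by save_model are restored with load_model, and a normalized batch goes through model_step. The argument values, paths, and the assumption that pose and character images share one resolution are all illustrative.

import torch
from torch.utils.data import DataLoader
from types import SimpleNamespace

from conr import CoNR
from data_loader import FileDataset

# Only the attributes CoNR actually reads are set; values and paths are illustrative.
args = SimpleNamespace(local_rank=0, distributed=False,
                       test_pose_use_parser_udp=False, model_name="conr_demo")

model = CoNR(args)
model.load_model("./weights")   # expects udpparsernet.pth, target_pose_encoder.pth, shader.pth, rgbadecodernet.pth
model.eval()

# One sample: target pose UDP file first, then any number of character-sheet views.
files = [["./test_data_pre/pose_0001.npz",
          "./character_sheet/front.png", "./character_sheet/back.png"]]
loader = DataLoader(FileDataset(files, shader_pose_use_gt_udp_test=True), batch_size=1)

with torch.no_grad():
    for batch in loader:
        batch = model.data_norm_image(batch)
        pred = model.model_step(batch)
        rgba = pred["shader"]["y_weighted_warp_decoded_rgba"]   # (N, 4, H, W) in [0, 1]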
data_loader.py
ADDED
@@ -0,0 +1,273 @@
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset
import os
cv2.setNumThreads(1)
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"


class RandomResizedCropWithAutoCenteringAndZeroPadding(object):
    def __init__(self, output_size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), center_jitter=(0.1, 0.1), size_from_alpha_mask=True):
        assert isinstance(output_size, (int, tuple))
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size
        assert isinstance(scale, tuple)
        assert isinstance(ratio, tuple)
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            raise ValueError("Scale and ratio should be of kind (min, max)")
        self.size_from_alpha_mask = size_from_alpha_mask
        self.scale = scale
        self.ratio = ratio
        assert isinstance(center_jitter, tuple)
        self.center_jitter = center_jitter

    def __call__(self, sample):
        imidx, image = sample['imidx'], sample["image_np"]
        if "labels" in sample:
            label = sample["labels"]
        else:
            label = None

        im_h, im_w = image.shape[:2]
        if self.size_from_alpha_mask and image.shape[2] == 4:
            # compute bbox from alpha mask
            bbox_left, bbox_top, bbox_w, bbox_h = cv2.boundingRect(
                (image[:, :, 3] > 0).astype(np.uint8))
        else:
            bbox_left, bbox_top = 0, 0
            bbox_h, bbox_w = image.shape[:2]
        if bbox_h <= 1 and bbox_w <= 1:
            sample["bad"] = 0
        else:
            # detect too small image here
            alpha_varea = np.sum((image[:, :, 3] > 0).astype(np.uint8))
            image_area = image.shape[0]*image.shape[1]
            if alpha_varea/image_area < 0.001:
                sample["bad"] = alpha_varea
        # detect bad image
        if "bad" in sample:
            # baddata_dir = os.path.join(os.getcwd(), 'test_data', "baddata" + os.sep)
            # save_output(str(imidx)+".png",image,label,baddata_dir)
            bbox_h, bbox_w = image.shape[:2]
            sample["image_np"] = np.zeros(
                [self.output_size[0], self.output_size[1], image.shape[2]], dtype=image.dtype)
            if label is not None:
                sample["labels"] = np.zeros(
                    [self.output_size[0], self.output_size[1], 4], dtype=label.dtype)
            return sample

        # compute default area by making sure output_size contains bbox_w * bbox_h

        jitter_h = np.random.uniform(-bbox_h *
                                     self.center_jitter[0], bbox_h*self.center_jitter[0])
        jitter_w = np.random.uniform(-bbox_w *
                                     self.center_jitter[1], bbox_w*self.center_jitter[1])

        # h/w
        target_aspect_ratio = np.exp(
            np.log(self.output_size[0]/self.output_size[1]) +
            np.random.uniform(np.log(self.ratio[0]), np.log(self.ratio[1]))
        )

        source_aspect_ratio = bbox_h/bbox_w

        if target_aspect_ratio < source_aspect_ratio:
            # same w, target has larger h, use h to align
            target_height = bbox_h * \
                np.random.uniform(self.scale[0], self.scale[1])
            virtual_h = int(
                round(target_height))
            virtual_w = int(
                round(target_height / target_aspect_ratio))  # h/w
        else:
            # same w, source has larger h, use w to align
            target_width = bbox_w * \
                np.random.uniform(self.scale[0], self.scale[1])
            virtual_h = int(
                round(target_width * target_aspect_ratio))  # h/w
            virtual_w = int(
                round(target_width))

        # print("required aspect ratio:", target_aspect_ratio)

        virtual_top = int(round(bbox_top + jitter_h - (virtual_h-bbox_h)/2))
        virutal_left = int(round(bbox_left + jitter_w - (virtual_w-bbox_w)/2))

        if virtual_top < 0:
            top_padding = abs(virtual_top)
            crop_top = 0
        else:
            top_padding = 0
            crop_top = virtual_top
        if virutal_left < 0:
            left_padding = abs(virutal_left)
            crop_left = 0
        else:
            left_padding = 0
            crop_left = virutal_left
        if virtual_top+virtual_h > im_h:
            bottom_padding = abs(im_h-(virtual_top+virtual_h))
            crop_bottom = im_h
        else:
            bottom_padding = 0
            crop_bottom = virtual_top+virtual_h
        if virutal_left+virtual_w > im_w:
            right_padding = abs(im_w-(virutal_left+virtual_w))
            crop_right = im_w
        else:
            right_padding = 0
            crop_right = virutal_left+virtual_w
        # crop

        image = image[crop_top:crop_bottom, crop_left: crop_right]
        if label is not None:
            label = label[crop_top:crop_bottom, crop_left: crop_right]

        # pad
        if top_padding + bottom_padding + left_padding + right_padding > 0:
            padding = ((top_padding, bottom_padding),
                       (left_padding, right_padding), (0, 0))
            # print("padding", padding)
            image = np.pad(image, padding, mode='constant')
            if label is not None:
                label = np.pad(label, padding, mode='constant')

        if image.shape[0]/image.shape[1] - virtual_h/virtual_w > 0.001:
            print("virtual aspect ratio:", virtual_h/virtual_w)
            print("image aspect ratio:", image.shape[0]/image.shape[1])
        assert (image.shape[0]/image.shape[1] - virtual_h/virtual_w < 0.001)
        sample["crop"] = np.array(
            [im_h, im_w, crop_top, crop_bottom, crop_left, crop_right, top_padding, bottom_padding, left_padding, right_padding, image.shape[0], image.shape[1]])

        # resize
        if self.output_size[1] != image.shape[1] or self.output_size[0] != image.shape[0]:
            if self.output_size[1] > image.shape[1] and self.output_size[0] > image.shape[0]:
                # enlarging
                image = cv2.resize(
                    image, (self.output_size[1], self.output_size[0]), interpolation=cv2.INTER_LINEAR)
            else:
                # shrinking
                image = cv2.resize(
                    image, (self.output_size[1], self.output_size[0]), interpolation=cv2.INTER_AREA)

            if label is not None:
                label = cv2.resize(label, (self.output_size[1], self.output_size[0]),
                                   interpolation=cv2.INTER_NEAREST_EXACT)

        assert image.shape[0] == self.output_size[0] and image.shape[1] == self.output_size[1]
        sample['imidx'], sample["image_np"] = imidx, image
        if label is not None:
            assert label.shape[0] == self.output_size[0] and label.shape[1] == self.output_size[1]
            sample["labels"] = label

        return sample


class FileDataset(Dataset):
    def __init__(self, image_names_list, fg_img_lbl_transform=None, shader_pose_use_gt_udp_test=True, shader_target_use_gt_rgb_debug=False):
        self.image_name_list = image_names_list
        self.fg_img_lbl_transform = fg_img_lbl_transform
        self.shader_pose_use_gt_udp_test = shader_pose_use_gt_udp_test
        self.shader_target_use_gt_rgb_debug = shader_target_use_gt_rgb_debug

    def __len__(self):
        return len(self.image_name_list)

    def get_gt_from_disk(self, idx, imname, read_label):
        if read_label:
            # read label
            with open(imname, mode="rb") as bio:
                if imname.find(".npz") > 0:
                    label_np = np.load(bio, allow_pickle=True)[
                        'i'].astype(np.float32, copy=False)
                else:
                    label_np = cv2.cvtColor(cv2.imdecode(np.frombuffer(bio.read(
                    ), np.uint8), cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH | cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA)
            assert (4 == label_np.shape[2])
            # fake image out of valid label
            image_np = (label_np*255).clip(0, 255).astype(np.uint8, copy=False)
            # assemble sample
            sample = {'imidx': np.array(
                [idx]), "image_np": image_np, "labels": label_np}

        else:
            # read image as uint8
            with open(imname, mode="rb") as bio:
                image_np = cv2.cvtColor(cv2.imdecode(np.frombuffer(
                    bio.read(), np.uint8), cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA)
                # image_np = Image.open(bio)
                # image_np = np.array(image_np)
            assert (3 == len(image_np.shape))
            if (image_np.shape[2] == 4):
                mask_np = image_np[:, :, 3:4]
                image_np = (image_np[:, :, :3] *
                            (image_np[:, :, 3][:, :, np.newaxis]/255.0)).clip(0, 255).astype(np.uint8, copy=False)
            elif (image_np.shape[2] == 3):
                # generate a fake mask
                # Fool-proofing
                mask_np = np.ones(
                    (image_np.shape[0], image_np.shape[1], 1), dtype=np.uint8)*255
                print("WARN: transparent background is preferred for image ", imname)
            else:
                raise ValueError("weird shape of image ", imname, image_np)
            image_np = np.concatenate((image_np, mask_np), axis=2)
            sample = {'imidx': np.array(
                [idx]), "image_np": image_np}

        # apply fg_img_lbl_transform
        if self.fg_img_lbl_transform:
            sample = self.fg_img_lbl_transform(sample)

        if "labels" in sample:
            # return UDP as 4chn XYZV float tensor
            sample["labels"] = torch.from_numpy(
                sample["labels"].transpose((2, 0, 1)))
            assert (sample["labels"].dtype == torch.float32)

        if "image_np" in sample:
            # return image as 3chn RGB uint8 tensor and 1chn A uint8 tensor
            sample["mask"] = torch.from_numpy(
                sample["image_np"][:, :, 3:4].transpose((2, 0, 1)))
            assert (sample["mask"].dtype == torch.uint8)
            sample["image"] = torch.from_numpy(
                sample["image_np"][:, :, :3].transpose((2, 0, 1)))
            assert (sample["image"].dtype == torch.uint8)
            del sample["image_np"]
        return sample

    def __getitem__(self, idx):
        sample = {
            'imidx': np.array([idx])}
        target = self.get_gt_from_disk(
            idx, imname=self.image_name_list[idx][0], read_label=self.shader_pose_use_gt_udp_test)
        if self.shader_target_use_gt_rgb_debug:
            sample["pose_images"] = torch.stack([target["image"]])
            sample["pose_mask"] = target["mask"]
        elif self.shader_pose_use_gt_udp_test:
            sample["pose_label"] = target["labels"]
            sample["pose_mask"] = target["mask"]
        else:
            sample["pose_images"] = torch.stack([target["image"]])
        if "crop" in target:
            sample["pose_crop"] = target["crop"]
        character_images = []
        character_masks = []
        for i in range(1, len(self.image_name_list[idx])):
            source = self.get_gt_from_disk(
                idx, self.image_name_list[idx][i], read_label=False)
            character_images.append(source["image"])
            character_masks.append(source["mask"])
        character_images = torch.stack(character_images)
        character_masks = torch.stack(character_masks)
        sample.update({
            "character_images": character_images,
            "character_masks": character_masks
        })
        # do not make fake labels in inference
        return sample
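A minimal sketch of how a FileDataset entry is laid out (paths are hypothetical): each item of image_names_list lists the target pose file first and the character-sheet views after it, and the shapes below follow from __getitem__ above.

from torch.utils.data import DataLoader

files = [
    ["./test_data_pre/pose_0001.npz",     # target UDP label (read_label=True when shader_pose_use_gt_udp_test)
     "./character_sheet/front.png",       # character sheet view 1
     "./character_sheet/back.png"],       # character sheet view 2
]
ds = FileDataset(files, fg_img_lbl_transform=None, shader_pose_use_gt_udp_test=True)
loader = DataLoader(ds, batch_size=1, num_workers=0)

batch = next(iter(loader))
print(batch["pose_label"].shape)          # (1, 4, H, W) float32 XYZV
print(batch["character_images"].shape)    # (1, 2, 3, H, W) uint8 RGB
print(batch["character_masks"].shape)     # (1, 2, 1, H, W) uint8 alpha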
infer.sh
ADDED
@@ -0,0 +1,14 @@
rm -r "./results"
mkdir "./results"

rlaunch --gpu=1 --cpu=4 --memory=25600 -- python3 -m torch.distributed.launch \
    --nproc_per_node=1 train.py --mode=test \
    --world_size=1 --dataloaders=2 \
    --test_input_poses_images=./test_data/ \
    --test_input_person_images=./character_sheet/ \
    --test_output_dir=./results/ \
    --test_checkpoint_dir=./weights/

echo Generating video...
ffmpeg -r 30 -y -i ./results/%d.png -r 30 -c:v libx264 output.mp4 -r 30
echo DONE.
model/__init__.py
ADDED
@@ -0,0 +1 @@
model/backbone.py
ADDED
@@ -0,0 +1,285 @@
import torch
import torch.nn as nn
from torchvision import models
from torch.nn import functional as F


class AdaptiveConcatPool2d(nn.Module):
    """
    Layer that concats `AdaptiveAvgPool2d` and `AdaptiveMaxPool2d`.
    Source: Fastai. This code was taken from the fastai library at url
    https://github.com/fastai/fastai/blob/master/fastai/layers.py#L176
    """

    def __init__(self, sz=None):
        "Output will be 2*sz or 2 if sz is None"
        super().__init__()
        self.output_size = sz or 1
        self.ap = nn.AdaptiveAvgPool2d(self.output_size)
        self.mp = nn.AdaptiveMaxPool2d(self.output_size)

    def forward(self, x): return torch.cat([self.mp(x), self.ap(x)], 1)


class MyNorm(nn.Module):
    def __init__(self, num_channels):
        super(MyNorm, self).__init__()
        self.norm = nn.InstanceNorm2d(
            num_channels, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)

    def forward(self, x):
        x = self.norm(x)
        return x


def resnet_fastai(model, pretrained, url, replace_first_layer=None, replace_maxpool_layer=None, progress=True, map_location=None, **kwargs):
    cut = -2
    s = model(pretrained=False, **kwargs)
    if replace_maxpool_layer is not None:
        s.maxpool = replace_maxpool_layer
    if replace_first_layer is not None:
        body = nn.Sequential(replace_first_layer, *list(s.children())[1:cut])
    else:
        body = nn.Sequential(*list(s.children())[:cut])

    if pretrained:
        state = torch.hub.load_state_dict_from_url(url,
                                                   progress=progress, map_location=map_location)
        if replace_first_layer is not None:
            for each in list(state.keys()).copy():
                if each.find("0.0.") == 0:
                    del state[each]
        body_tail = nn.Sequential(body)
        ret = body_tail.load_state_dict(state, strict=False)
    return body


def get_backbone(name, pretrained=True, map_location=None):
    """ Loading backbone, defining names for skip-connections and encoder output. """

    first_layer_for_4chn = nn.Conv2d(
        4, 64, kernel_size=7, stride=2, padding=3, bias=False)
    max_pool_layer_replace = nn.Conv2d(
        64, 64, kernel_size=3, stride=2, padding=1, bias=False)
    # loading backbone model
    if name == 'resnet18':
        backbone = models.resnet18(pretrained=pretrained)
    elif name == 'resnet18-4':
        backbone = models.resnet18(pretrained=pretrained)
        backbone.conv1 = first_layer_for_4chn
    elif name == 'resnet34':
        backbone = models.resnet34(pretrained=pretrained)
    elif name == 'resnet50':
        backbone = models.resnet50(pretrained=False, norm_layer=MyNorm)
        backbone.maxpool = max_pool_layer_replace
    elif name == 'resnet101':
        backbone = models.resnet101(pretrained=pretrained)
    elif name == 'resnet152':
        backbone = models.resnet152(pretrained=pretrained)
    elif name == 'vgg16':
        backbone = models.vgg16_bn(pretrained=pretrained).features
    elif name == 'vgg19':
        backbone = models.vgg19_bn(pretrained=pretrained).features
    elif name == 'resnet18_danbo-4':
        backbone = resnet_fastai(models.resnet18, url="https://github.com/RF5/danbooru-pretrained/releases/download/v0.1/resnet18-3f77756f.pth",
                                 pretrained=pretrained, map_location=map_location, norm_layer=MyNorm, replace_first_layer=first_layer_for_4chn)
    elif name == 'resnet50_danbo':
        backbone = resnet_fastai(models.resnet50, url="https://github.com/RF5/danbooru-pretrained/releases/download/v0.1/resnet50-13306192.pth",
                                 pretrained=pretrained, map_location=map_location, norm_layer=MyNorm, replace_maxpool_layer=max_pool_layer_replace)
    elif name == 'densenet121':
        backbone = models.densenet121(pretrained=True).features
    elif name == 'densenet161':
        backbone = models.densenet161(pretrained=True).features
    elif name == 'densenet169':
        backbone = models.densenet169(pretrained=True).features
    elif name == 'densenet201':
        backbone = models.densenet201(pretrained=True).features
    else:
        raise NotImplementedError(
            '{} backbone model is not implemented so far.'.format(name))
    # print(backbone)
    # specifying skip feature and output names
    if name.startswith('resnet'):
        feature_names = [None, 'relu', 'layer1', 'layer2', 'layer3']
        backbone_output = 'layer4'
    elif name == 'vgg16':
        # TODO: consider using a 'bridge' for VGG models, there is just a MaxPool between last skip and backbone output
        feature_names = ['5', '12', '22', '32', '42']
        backbone_output = '43'
    elif name == 'vgg19':
        feature_names = ['5', '12', '25', '38', '51']
        backbone_output = '52'
    elif name.startswith('densenet'):
        feature_names = [None, 'relu0', 'denseblock1',
                         'denseblock2', 'denseblock3']
        backbone_output = 'denseblock4'
    elif name == 'unet_encoder':
        feature_names = ['module1', 'module2', 'module3', 'module4']
        backbone_output = 'module5'
    else:
        raise NotImplementedError(
            '{} backbone model is not implemented so far.'.format(name))
    if name.find('_danbo') > 0:
        feature_names = [None, '2', '4', '5', '6']
        backbone_output = '7'
    return backbone, feature_names, backbone_output


class UpsampleBlock(nn.Module):

    # TODO: separate parametric and non-parametric classes?
    # TODO: skip connection concatenated OR added

    def __init__(self, ch_in, ch_out=None, skip_in=0, use_bn=True, parametric=False):
        super(UpsampleBlock, self).__init__()

        self.parametric = parametric
        ch_out = ch_in/2 if ch_out is None else ch_out

        # first convolution: either transposed conv, or conv following the skip connection
        if parametric:
            # versions: kernel=4 padding=1, kernel=2 padding=0
            self.up = nn.ConvTranspose2d(in_channels=ch_in, out_channels=ch_out, kernel_size=(4, 4),
                                         stride=2, padding=1, output_padding=0, bias=(not use_bn))
            self.bn1 = MyNorm(ch_out) if use_bn else None
        else:
            self.up = None
            ch_in = ch_in + skip_in
            self.conv1 = nn.Conv2d(in_channels=ch_in, out_channels=ch_out, kernel_size=(3, 3),
                                   stride=1, padding=1, bias=(not use_bn))
            self.bn1 = MyNorm(ch_out) if use_bn else None

        self.relu = nn.ReLU(inplace=True)

        # second convolution
        conv2_in = ch_out if not parametric else ch_out + skip_in
        self.conv2 = nn.Conv2d(in_channels=conv2_in, out_channels=ch_out, kernel_size=(3, 3),
                               stride=1, padding=1, bias=(not use_bn))
        self.bn2 = MyNorm(ch_out) if use_bn else None

    def forward(self, x, skip_connection=None):

        x = self.up(x) if self.parametric else F.interpolate(x, size=None, scale_factor=2, mode='bilinear',
                                                             align_corners=None)
        if self.parametric:
            x = self.bn1(x) if self.bn1 is not None else x
            x = self.relu(x)

        if skip_connection is not None:
            x = torch.cat([x, skip_connection], dim=1)

        if not self.parametric:
            x = self.conv1(x)
            x = self.bn1(x) if self.bn1 is not None else x
            x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x) if self.bn2 is not None else x
        x = self.relu(x)

        return x


class ResEncUnet(nn.Module):

    """ U-Net (https://arxiv.org/pdf/1505.04597.pdf) implementation with pre-trained torchvision backbones."""

    def __init__(self,
                 backbone_name,
                 pretrained=True,
                 encoder_freeze=False,
                 classes=21,
                 decoder_filters=(512, 256, 128, 64, 32),
                 parametric_upsampling=True,
                 shortcut_features='default',
                 decoder_use_instancenorm=True,
                 map_location=None
                 ):
        super(ResEncUnet, self).__init__()

        self.backbone_name = backbone_name

        self.backbone, self.shortcut_features, self.bb_out_name = get_backbone(
            backbone_name, pretrained=pretrained, map_location=map_location)
        shortcut_chs, bb_out_chs = self.infer_skip_channels()
        if shortcut_features != 'default':
            self.shortcut_features = shortcut_features

        # build decoder part
        self.upsample_blocks = nn.ModuleList()
        # avoiding having more blocks than skip connections
        decoder_filters = decoder_filters[:len(self.shortcut_features)]
        decoder_filters_in = [bb_out_chs] + list(decoder_filters[:-1])
        num_blocks = len(self.shortcut_features)
        for i, [filters_in, filters_out] in enumerate(zip(decoder_filters_in, decoder_filters)):
            self.upsample_blocks.append(UpsampleBlock(filters_in, filters_out,
                                                      skip_in=shortcut_chs[num_blocks-i-1],
                                                      parametric=parametric_upsampling,
                                                      use_bn=decoder_use_instancenorm))
        self.final_conv = nn.Conv2d(
            decoder_filters[-1], classes, kernel_size=(1, 1))

        if encoder_freeze:
            self.freeze_encoder()

    def freeze_encoder(self):
        """ Freezing encoder parameters, the newly initialized decoder parameters are remaining trainable. """

        for param in self.backbone.parameters():
            param.requires_grad = False

    def forward(self, *input, ret_parser_out=True):
        """ Forward propagation in U-Net. """

        x, features = self.forward_backbone(*input)
        output_feature = [x]
        for skip_name, upsample_block in zip(self.shortcut_features[::-1], self.upsample_blocks):
            skip_features = features[skip_name]
            if skip_features is not None:
                output_feature.append(skip_features)
            if ret_parser_out:
                x = upsample_block(x, skip_features)
        if ret_parser_out:
            x = self.final_conv(x)
            # apply sigmoid later
        else:
            x = None

        return x, output_feature

    def forward_backbone(self, x):
        """ Forward propagation in backbone encoder network. """

        features = {None: None} if None in self.shortcut_features else dict()
        for name, child in self.backbone.named_children():
            x = child(x)
            if name in self.shortcut_features:
                features[name] = x
            if name == self.bb_out_name:
                break

        return x, features

    def infer_skip_channels(self):
        """ Getting the number of channels at skip connections and at the output of the encoder. """
        if self.backbone_name.find("-4") > 0:
            x = torch.zeros(1, 4, 224, 224)
        else:
            x = torch.zeros(1, 3, 224, 224)
        has_fullres_features = self.backbone_name.startswith(
            'vgg') or self.backbone_name == 'unet_encoder'
        # only VGG has features at full resolution
        channels = [] if has_fullres_features else [0]

        # forward run in backbone to count channels (dirty solution but works for *any* Module)
        for name, child in self.backbone.named_children():
            x = child(x)
            if name in self.shortcut_features:
                channels.append(x.shape[1])
            if name == self.bb_out_name:
                out_channels = x.shape[1]
                break
        return channels, out_channels
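An illustrative shape check for ResEncUnet (not part of the commit), mirroring the instantiation used in conr.py; passing pretrained=False skips the danbooru-pretrained weight download, and the 256x256 input size is arbitrary.

import torch

net = ResEncUnet(backbone_name='resnet50_danbo', classes=4, pretrained=False,
                 parametric_upsampling=True, decoder_filters=(512, 384, 256, 128, 32))
x = torch.zeros(1, 3, 256, 256)
udp, features = net(x)
print(udp.shape)                       # (1, 4, 256, 256): XYZV map, sigmoid/clip applied later
print([f.shape[1] for f in features])  # [2048, 1024, 512, 256, 64]: encoder output plus skips, coarse to fine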
model/decoder_small.py
ADDED
@@ -0,0 +1,43 @@
from torch import nn
import torch.nn.functional as F
import torch


class ResBlock2d(nn.Module):
    def __init__(self, in_features, kernel_size, padding):
        super(ResBlock2d, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size,
                               padding=padding)
        self.conv2 = nn.Conv2d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size,
                               padding=padding)

        self.norm1 = nn.Conv2d(
            in_channels=in_features, out_channels=in_features, kernel_size=1)
        self.norm2 = nn.Conv2d(
            in_channels=in_features, out_channels=in_features, kernel_size=1)

    def forward(self, x):
        out = self.norm1(x)
        out = F.relu(out, inplace=True)
        out = self.conv1(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        out = self.conv2(out)
        out += x
        return out


class RGBADecoderNet(nn.Module):
    def __init__(self, c=64, out_planes=4, num_bottleneck_blocks=1):
        super(RGBADecoderNet, self).__init__()
        self.conv_rgba = nn.Sequential(nn.Conv2d(c, out_planes, kernel_size=3, stride=1,
                                                 padding=1, dilation=1, bias=True))
        self.bottleneck = torch.nn.Sequential()
        for i in range(num_bottleneck_blocks):
            self.bottleneck.add_module(
                'r' + str(i), ResBlock2d(c, kernel_size=(3, 3), padding=(1, 1)))

    def forward(self, features_weighted_mask_atfeaturesscale_list=[]):
        return torch.sigmoid(self.conv_rgba(self.bottleneck(features_weighted_mask_atfeaturesscale_list.pop(0))))
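A quick shape sketch for RGBADecoderNet (illustrative, not part of the commit): it pops the first (finest) entry of a feature list with c=64 channels in place and emits a sigmoid RGBA map at the same resolution.

import torch

dec = RGBADecoderNet(c=64, out_planes=4)
feats = [torch.zeros(1, 64, 256, 256)]  # dummy 64-channel feature map
rgba = dec(feats)                       # note: forward pops feats[0] in place
print(rgba.shape)                       # (1, 4, 256, 256), values in (0, 1)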
model/shader.py
ADDED
@@ -0,0 +1,290 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .warplayer import warp_features
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DecoderBlock(nn.Module):
    def __init__(self, in_planes, c=224, out_msgs=0, out_locals=0, block_nums=1, out_masks=1, out_local_flows=32, out_msgs_flows=32, out_feat_flows=0):

        super(DecoderBlock, self).__init__()
        self.conv0 = nn.Sequential(
            nn.Conv2d(in_planes, c, 3, 2, 1),
            nn.PReLU(c),
            nn.Conv2d(c, c, 3, 2, 1),
            nn.PReLU(c),
        )

        self.convblocks = nn.ModuleList()
        for i in range(block_nums):
            self.convblocks.append(nn.Sequential(
                nn.Conv2d(c, c, 3, 1, 1),
                nn.PReLU(c),
                nn.Conv2d(c, c, 3, 1, 1),
                nn.PReLU(c),
                nn.Conv2d(c, c, 3, 1, 1),
                nn.PReLU(c),
                nn.Conv2d(c, c, 3, 1, 1),
                nn.PReLU(c),
                nn.Conv2d(c, c, 3, 1, 1),
                nn.PReLU(c),
                nn.Conv2d(c, c, 3, 1, 1),
                nn.PReLU(c),
            ))
        self.out_flows = 2
        self.out_msgs = out_msgs
        self.out_msgs_flows = out_msgs_flows if out_msgs > 0 else 0
        self.out_locals = out_locals
        self.out_local_flows = out_local_flows if out_locals > 0 else 0
        self.out_masks = out_masks
        self.out_feat_flows = out_feat_flows

        self.conv_last = nn.Sequential(
            nn.ConvTranspose2d(c, c, 4, 2, 1),
            nn.PReLU(c),
            nn.ConvTranspose2d(c, self.out_flows+self.out_msgs+self.out_msgs_flows +
                               self.out_locals+self.out_local_flows+self.out_masks+self.out_feat_flows, 4, 2, 1),
        )

    def forward(self, accumulated_flow, *other):
        x = [accumulated_flow]
        for each in other:
            if each is not None:
                assert (accumulated_flow.shape[-1] == each.shape[-1]), "decoder want {}, but get {}".format(
                    accumulated_flow.shape, each.shape)
                x.append(each)
        feat = self.conv0(torch.cat(x, dim=1))
        for convblock1 in self.convblocks:
            feat = convblock1(feat) + feat
        feat = self.conv_last(feat)
        prev = 0
        flow = feat[:, prev:prev+self.out_flows, :, :]
        prev += self.out_flows
        message = feat[:, prev:prev+self.out_msgs,
                       :, :] if self.out_msgs > 0 else None
        prev += self.out_msgs
        message_flow = feat[:, prev:prev + self.out_msgs_flows,
                            :, :] if self.out_msgs_flows > 0 else None
        prev += self.out_msgs_flows
        local_message = feat[:, prev:prev + self.out_locals,
                             :, :] if self.out_locals > 0 else None
        prev += self.out_locals
        local_message_flow = feat[:, prev:prev+self.out_local_flows,
                                  :, :] if self.out_local_flows > 0 else None
        prev += self.out_local_flows
        mask = torch.sigmoid(
            feat[:, prev:prev+self.out_masks, :, :]) if self.out_masks > 0 else None
        prev += self.out_masks
        feat_flow = feat[:, prev:prev+self.out_feat_flows,
                         :, :] if self.out_feat_flows > 0 else None
        prev += self.out_feat_flows
        return flow, mask, message, message_flow, local_message, local_message_flow, feat_flow


class CINN(nn.Module):
    def __init__(self, DIM_SHADER_REFERENCE, target_feature_chns=[512, 256, 128, 64, 64], feature_chns=[2048, 1024, 512, 256, 64], out_msgs_chn=[2048, 1024, 512, 256, 64, 64], out_locals_chn=[2048, 1024, 512, 256, 64, 0], block_num=[1, 1, 1, 1, 1, 2], block_chn_num=[224, 224, 224, 224, 224, 224]):
        super(CINN, self).__init__()

        self.in_msgs_chn = [0, *out_msgs_chn[:-1]]
        self.in_locals_chn = [0, *out_locals_chn[:-1]]

        self.decoder_blocks = nn.ModuleList()
        self.feed_weighted = True
        if self.feed_weighted:
            in_planes = 2+2+DIM_SHADER_REFERENCE*2
        else:
            in_planes = 2+DIM_SHADER_REFERENCE
        for each_target_feature_chns, each_feature_chns, each_out_msgs_chn, each_out_locals_chn, each_in_msgs_chn, each_in_locals_chn, each_block_num, each_block_chn_num in zip(target_feature_chns, feature_chns, out_msgs_chn, out_locals_chn, self.in_msgs_chn, self.in_locals_chn, block_num, block_chn_num):
            self.decoder_blocks.append(
                DecoderBlock(in_planes+each_target_feature_chns+each_feature_chns+each_in_locals_chn+each_in_msgs_chn, c=each_block_chn_num, block_nums=each_block_num, out_msgs=each_out_msgs_chn, out_locals=each_out_locals_chn, out_masks=2+each_out_locals_chn))
        for i in range(len(feature_chns), len(out_locals_chn)):
            # print("append extra block", i, "msg",
            #       out_msgs_chn[i], "local", out_locals_chn[i], "block", block_num[i])
            self.decoder_blocks.append(
                DecoderBlock(in_planes+self.in_msgs_chn[i]+self.in_locals_chn[i], c=block_chn_num[i], block_nums=block_num[i], out_msgs=out_msgs_chn[i], out_locals=out_locals_chn[i], out_masks=2+out_msgs_chn[i], out_feat_flows=0))

    def apply_flow(self, mask, message, message_flow, local_message, local_message_flow, x_reference, accumulated_flow, each_x_reference_features=None, each_x_reference_features_flow=None):
        if each_x_reference_features is not None:
            size_from = each_x_reference_features
        else:
            size_from = x_reference
        f_size = (size_from.shape[2], size_from.shape[3])
        accumulated_flow = self.flow_rescale(
            accumulated_flow, size_from)
        # mask = warp_features(F.interpolate(
        #     mask, size=f_size, mode="bilinear"), accumulated_flow) if mask is not None else None
        mask = F.interpolate(
            mask, size=f_size, mode="bilinear") if mask is not None else None
        message = F.interpolate(
            message, size=f_size, mode="bilinear") if message is not None else None
        message_flow = self.flow_rescale(
            message_flow, size_from) if message_flow is not None else None
        message = warp_features(
            message, message_flow) if message_flow is not None else message

        local_message = F.interpolate(
            local_message, size=f_size, mode="bilinear") if local_message is not None else None
        local_message_flow = self.flow_rescale(
            local_message_flow, size_from) if local_message_flow is not None else None
        local_message = warp_features(
            local_message, local_message_flow) if local_message_flow is not None else local_message

        warp_x_reference = warp_features(F.interpolate(
            x_reference, size=f_size, mode="bilinear"), accumulated_flow)

        each_x_reference_features_flow = self.flow_rescale(
            each_x_reference_features_flow, size_from) if (each_x_reference_features is not None and each_x_reference_features_flow is not None) else None
        warp_each_x_reference_features = warp_features(
            each_x_reference_features, each_x_reference_features_flow) if each_x_reference_features_flow is not None else each_x_reference_features

        return mask, message, local_message, warp_x_reference, accumulated_flow, warp_each_x_reference_features, each_x_reference_features_flow

    def forward(self, x_target_features=[], x_reference=None, x_reference_features=[]):
        y_flow = []
        y_feat_flow = []

        y_local_message = []
        y_warp_x_reference = []
        y_warp_x_reference_features = []

        y_weighted_flow = []
        y_weighted_mask = []
        y_weighted_message = []
        y_weighted_x_reference = []
        y_weighted_x_reference_features = []

        for pyrlevel, ifblock in enumerate(self.decoder_blocks):
            stacked_wref = []
            stacked_feat = []
            stacked_anci = []
            stacked_flow = []
            stacked_mask = []
            stacked_mesg = []
            stacked_locm = []
            stacked_feat_flow = []
            for view_id in range(x_reference.shape[1]):  # NMCHW

                if pyrlevel == 0:
                    # create from zero flow
                    feat_ev = x_reference_features[pyrlevel][:,
                                                             view_id, :, :, :] if pyrlevel < len(x_reference_features) else None

                    accumulated_flow = torch.zeros_like(
                        feat_ev[:, :2, :, :]).to(device)
                    accumulated_feat_flow = torch.zeros_like(
                        feat_ev[:, :32, :, :]).to(device)
                    # domestic inputs
                    warp_x_reference = F.interpolate(x_reference[:, view_id, :, :, :], size=(
                        feat_ev.shape[-2], feat_ev.shape[-1]), mode="bilinear")
                    warp_x_reference_features = feat_ev

                    local_message = None
                    # federated inputs
                    weighted_flow = accumulated_flow if self.feed_weighted else None
                    weighted_wref = warp_x_reference if self.feed_weighted else None
                    weighted_message = None
                else:
                    # resume from last layer
                    accumulated_flow = y_flow[-1][:, view_id, :, :, :]
                    accumulated_feat_flow = y_feat_flow[-1][:,
                                                            view_id, :, :, :] if y_feat_flow[-1] is not None else None
                    # domestic inputs
                    warp_x_reference = y_warp_x_reference[-1][:,
                                                              view_id, :, :, :]
                    warp_x_reference_features = y_warp_x_reference_features[-1][:,
                                                                                view_id, :, :, :] if y_warp_x_reference_features[-1] is not None else None
                    local_message = y_local_message[-1][:, view_id, :,
                                                        :, :] if len(y_local_message) > 0 else None

                    # federated inputs
                    weighted_flow = y_weighted_flow[-1] if self.feed_weighted else None
                    weighted_wref = y_weighted_x_reference[-1] if self.feed_weighted else None
                    weighted_message = y_weighted_message[-1] if len(
                        y_weighted_message) > 0 else None
                scaled_x_target = x_target_features[pyrlevel][:, :, :, :].detach() if pyrlevel < len(
                    x_target_features) else None
                # compute flow
                residual_flow, mask, message, message_flow, local_message, local_message_flow, residual_feat_flow = ifblock(
                    accumulated_flow, scaled_x_target, warp_x_reference, warp_x_reference_features, weighted_flow, weighted_wref, weighted_message, local_message)
                accumulated_flow = residual_flow + accumulated_flow
                accumulated_feat_flow = accumulated_flow

                feat_ev = x_reference_features[pyrlevel+1][:,
                                                           view_id, :, :, :] if pyrlevel+1 < len(x_reference_features) else None
                mask, message, local_message, warp_x_reference, accumulated_flow, warp_x_reference_features, accumulated_feat_flow = self.apply_flow(
                    mask, message, message_flow, local_message, local_message_flow, x_reference[:, view_id, :, :, :], accumulated_flow, feat_ev, accumulated_feat_flow)
                stacked_flow.append(accumulated_flow)
                if accumulated_feat_flow is not None:
                    stacked_feat_flow.append(accumulated_feat_flow)
                stacked_mask.append(mask)
                if message is not None:
                    stacked_mesg.append(message)
                if local_message is not None:
                    stacked_locm.append(local_message)
                stacked_wref.append(warp_x_reference)
                if warp_x_reference_features is not None:
                    stacked_feat.append(warp_x_reference_features)

            stacked_flow = torch.stack(stacked_flow, dim=1)  # M*NCHW -> NMCHW
            stacked_feat_flow = torch.stack(stacked_feat_flow, dim=1) if len(
                stacked_feat_flow) > 0 else None
            stacked_mask = torch.stack(
                stacked_mask, dim=1)

            stacked_mesg = torch.stack(stacked_mesg, dim=1) if len(
|
| 236 |
+
stacked_mesg) > 0 else None
|
| 237 |
+
stacked_locm = torch.stack(stacked_locm, dim=1) if len(
|
| 238 |
+
stacked_locm) > 0 else None
|
| 239 |
+
|
| 240 |
+
stacked_wref = torch.stack(stacked_wref, dim=1)
|
| 241 |
+
stacked_feat = torch.stack(stacked_feat, dim=1) if len(
|
| 242 |
+
stacked_feat) > 0 else None
|
| 243 |
+
stacked_anci = torch.stack(stacked_anci, dim=1) if len(
|
| 244 |
+
stacked_anci) > 0 else None
|
| 245 |
+
y_flow.append(stacked_flow)
|
| 246 |
+
y_feat_flow.append(stacked_feat_flow)
|
| 247 |
+
|
| 248 |
+
y_warp_x_reference.append(stacked_wref)
|
| 249 |
+
y_warp_x_reference_features.append(stacked_feat)
|
| 250 |
+
# compute normalized confidence
|
| 251 |
+
stacked_contrib = torch.nn.functional.softmax(stacked_mask, dim=1)
|
| 252 |
+
|
| 253 |
+
# torch.sum to remove temp dimension M from NMCHW --> NCHW
|
| 254 |
+
weighted_flow = torch.sum(
|
| 255 |
+
stacked_mask[:, :, 0:1, :, :] * stacked_contrib[:, :, 0:1, :, :] * stacked_flow, dim=1)
|
| 256 |
+
weighted_mask = torch.sum(
|
| 257 |
+
stacked_contrib[:, :, 0:1, :, :] * stacked_mask[:, :, 0:1, :, :], dim=1)
|
| 258 |
+
weighted_wref = torch.sum(
|
| 259 |
+
stacked_mask[:, :, 0:1, :, :] * stacked_contrib[:, :, 0:1, :, :] * stacked_wref, dim=1) if stacked_wref is not None else None
|
| 260 |
+
weighted_feat = torch.sum(
|
| 261 |
+
stacked_mask[:, :, 1:2, :, :] * stacked_contrib[:, :, 1:2, :, :] * stacked_feat, dim=1) if stacked_feat is not None else None
|
| 262 |
+
weighted_mesg = torch.sum(
|
| 263 |
+
stacked_mask[:, :, 2:, :, :] * stacked_contrib[:, :, 2:, :, :] * stacked_mesg, dim=1) if stacked_mesg is not None else None
|
| 264 |
+
y_weighted_flow.append(weighted_flow)
|
| 265 |
+
y_weighted_mask.append(weighted_mask)
|
| 266 |
+
if weighted_mesg is not None:
|
| 267 |
+
y_weighted_message.append(weighted_mesg)
|
| 268 |
+
if stacked_locm is not None:
|
| 269 |
+
y_local_message.append(stacked_locm)
|
| 270 |
+
y_weighted_message.append(weighted_mesg)
|
| 271 |
+
y_weighted_x_reference.append(weighted_wref)
|
| 272 |
+
y_weighted_x_reference_features.append(weighted_feat)
|
| 273 |
+
|
| 274 |
+
if weighted_feat is not None:
|
| 275 |
+
y_weighted_x_reference_features.append(weighted_feat)
|
| 276 |
+
return {
|
| 277 |
+
"y_last_remote_features": [weighted_mesg],
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
def flow_rescale(self, prev_flow, each_x_reference_features):
|
| 281 |
+
if prev_flow is None:
|
| 282 |
+
prev_flow = torch.zeros_like(
|
| 283 |
+
each_x_reference_features[:, :2]).to(device)
|
| 284 |
+
else:
|
| 285 |
+
up_scale_factor = each_x_reference_features.shape[-1] / \
|
| 286 |
+
prev_flow.shape[-1]
|
| 287 |
+
if up_scale_factor != 1:
|
| 288 |
+
prev_flow = F.interpolate(prev_flow, scale_factor=up_scale_factor, mode="bilinear",
|
| 289 |
+
align_corners=False, recompute_scale_factor=False) * up_scale_factor
|
| 290 |
+
return prev_flow
|
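For orientation: at the end of each pyramid level, forward() normalizes the per-view confidence channel with a softmax over the view dimension M and uses it to average the per-view flows, warped colors and features back down to a single NCHW tensor. The snippet below is a minimal standalone sketch of that weighting, not part of the repository; fuse_views and the toy shapes are illustrative only.

# Minimal sketch (assumed helper, not the CoNR API) of the view-fusion step above:
# confidence maps are softmax-normalized across the M views, then used to
# average per-view predictions (flow, RGB or features) into one NCHW tensor.
import torch

def fuse_views(stacked_mask, stacked_value):
    # stacked_mask:  N x M x 1 x H x W  (one confidence channel per view)
    # stacked_value: N x M x C x H x W  (per-view flow, colors or features)
    contrib = torch.softmax(stacked_mask, dim=1)                       # normalize across views
    return torch.sum(stacked_mask * contrib * stacked_value, dim=1)    # N x C x H x W

if __name__ == "__main__":
    n, m, c, h, w = 2, 4, 3, 8, 8
    mask = torch.randn(n, m, 1, h, w)
    value = torch.randn(n, m, c, h, w)
    print(fuse_views(mask, value).shape)  # torch.Size([2, 3, 8, 8])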
model/warplayer.py
ADDED
@@ -0,0 +1,56 @@
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
backwarp_tenGrid = {}


def warp(tenInput, tenFlow):
    with torch.cuda.amp.autocast(enabled=False):
        k = (str(tenFlow.device), str(tenFlow.size()))
        if k not in backwarp_tenGrid:
            tenHorizontal = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=device).view(
                1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
            tenVertical = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=device).view(
                1, 1, tenFlow.shape[2], 1).expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
            backwarp_tenGrid[k] = torch.cat(
                [tenHorizontal, tenVertical], 1).to(device)

        tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
                             tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)], 1)

        g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1)
        if tenInput.dtype != g.dtype:
            g = g.to(tenInput.dtype)
        return torch.nn.functional.grid_sample(input=tenInput, grid=g, mode='bilinear', padding_mode='border', align_corners=True)
        # "zeros" "border"


def warp_features(inp, flow):
    groups = flow.shape[1]//2  # NCHW
    samples = inp.shape[0]
    h = inp.shape[2]
    w = inp.shape[3]
    assert(flow.shape[0] == samples and flow.shape[2]
           == h and flow.shape[3] == w)
    chns = inp.shape[1]
    chns_per_group = chns // groups
    assert(flow.shape[1] % 2 == 0)
    assert(chns % groups == 0)
    inp = inp.contiguous().view(samples*groups, chns_per_group, h, w)
    flow = flow.contiguous().view(samples*groups, 2, h, w)
    feat = warp(inp, flow)
    feat = feat.view(samples, chns, h, w)
    return feat


def flow2rgb(flow_map_np):
    h, w, _ = flow_map_np.shape
    rgb_map = np.ones((h, w, 3)).astype(np.float32)/2.0
    normalized_flow_map = np.concatenate(
        (flow_map_np[:, :, 0:1]/h/2.0, flow_map_np[:, :, 1:2]/w/2.0), axis=2)
    rgb_map[:, :, 0] += normalized_flow_map[:, :, 0]
    rgb_map[:, :, 1] -= 0.5 * \
        (normalized_flow_map[:, :, 0] + normalized_flow_map[:, :, 1])
    rgb_map[:, :, 2] += normalized_flow_map[:, :, 1]
    return (rgb_map.clip(0, 1)*255.0).astype(np.uint8)
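A brief usage sketch for the helpers above: warp backward-warps an NCHW tensor by a 2-channel flow through grid_sample, and warp_features accepts a flow with 2·k channels so that the input channels can be split into k groups, each warped by its own flow. Only the module path model/warplayer.py is taken from this commit; the tensors, shapes and the identity-warp claim below are illustrative assumptions.

# Hedged usage sketch, not part of the repo files above.
import torch
from model.warplayer import warp_features

features = torch.randn(1, 64, 32, 32)      # N, C, H, W
flow = torch.zeros(1, 2, 32, 32)            # one flow group of 2 channels -> identity warp
same = warp_features(features, flow)         # matches `features` (zero displacement)
grouped_flow = torch.zeros(1, 8, 32, 32)     # 4 groups of 2 channels; 64 channels / 4 groups = 16 per group
out = warp_features(features, grouped_flow)  # each 16-channel group warped by its own flow
print(same.shape, out.shape)                 # torch.Size([1, 64, 32, 32]) twice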
streamlit.py
ADDED
@@ -0,0 +1,52 @@
import cv2
import numpy as np
import streamlit as st
import os
import base64

st.set_page_config(layout="wide", page_title='CoNR demo', page_icon="🪐")

st.title('CoNR demo')
st.markdown(""" <style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style> """, unsafe_allow_html=True)

def get_base64(bin_file):
    with open(bin_file, 'rb') as f:
        data = f.read()
    return base64.b64encode(data).decode()

# def set_background(png_file):
#     bin_str = get_base64(png_file)
#     page_bg_img = '''
#     <style>
#     .stApp {
#     background-image: url("data:image/png;base64,%s");
#     background-size: 1920px 1080px;
#     background-attachment:fixed;
#     background-position:center;
#     background-repeat:no-repeat;
#     }
#     </style>
#     ''' % bin_str
#     st.markdown(page_bg_img, unsafe_allow_html=True)

# set_background('ipad_bg.png')

upload_img = st.file_uploader("Input character sheet", "png", accept_multiple_files=True)

if st.button('RUN!'):
    if upload_img is not None:
        for i in range(len(upload_img)):
            with open('character_sheet/{}.png'.format(i), 'wb') as f:
                f.write(upload_img[i].read())

        st.info('Running inference...')
        os.system('sh infer.sh')
        st.info('Done!')
        video_file = open('output.mp4', 'rb')
        video_bytes = video_file.read()
        st.video(video_bytes, start_time=0)
    else:
        st.info('No images uploaded yet > <')
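The app expects infer.sh to leave an output.mp4 next to it; that script is not shown in this section, so the following is only a hypothetical sketch of how the PNG frames that train.py writes to ./results/ could be packed into such a video with OpenCV. The frame naming (plain integer indices, as written by save_output), the frame rate and the codec availability of the local OpenCV build are all assumptions.

# Hypothetical helper, not part of the repo: pack ./results/<index>.png into output.mp4
# so that st.video() above has something to play.
import os
import cv2

def frames_to_video(frame_dir="results", out_path="output.mp4", fps=30):
    # keep only integer-named frames and sort them numerically
    names = sorted((f for f in os.listdir(frame_dir)
                    if f.endswith(".png") and f[:-4].isdigit()),
                   key=lambda f: int(f[:-4]))
    if not names:
        raise RuntimeError("no frames found in " + frame_dir)
    first = cv2.imread(os.path.join(frame_dir, names[0]))
    h, w = first.shape[:2]
    writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
    for name in names:
        writer.write(cv2.imread(os.path.join(frame_dir, name)))  # imread drops the alpha channel
    writer.release()

if __name__ == "__main__":
    frames_to_video()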
train.py
ADDED
@@ -0,0 +1,229 @@
import argparse
import os
import time
from datetime import datetime
from distutils.util import strtobool

import numpy as np
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from data_loader import (FileDataset,
                         RandomResizedCropWithAutoCenteringAndZeroPadding)
from torch.utils.data.distributed import DistributedSampler
from conr import CoNR

def data_sampler(dataset, shuffle, distributed):

    if distributed:
        return torch.utils.data.distributed.DistributedSampler(dataset, shuffle=shuffle)

    if shuffle:
        return torch.utils.data.RandomSampler(dataset)

    else:
        return torch.utils.data.SequentialSampler(dataset)

def save_output(image_name, inputs_v, d_dir=".", crop=None):
    import cv2

    inputs_v = inputs_v.detach().squeeze()
    input_np = torch.clamp(inputs_v*255, 0, 255).byte().cpu().numpy().transpose(
        (1, 2, 0))
    # cv2.setNumThreads(1)
    out_render_scale = cv2.cvtColor(input_np, cv2.COLOR_RGBA2BGRA)
    if crop is not None:
        crop = crop.cpu().numpy()[0]
        output_img = np.zeros((crop[0], crop[1], 4), dtype=np.uint8)
        before_resize_scale = cv2.resize(
            out_render_scale, (crop[5]-crop[4]+crop[8]+crop[9], crop[3]-crop[2]+crop[6]+crop[7]), interpolation=cv2.INTER_AREA)  # w,h
        output_img[crop[2]:crop[3], crop[4]:crop[5]] = before_resize_scale[crop[6]:before_resize_scale.shape[0] -
                                                                           crop[7], crop[8]:before_resize_scale.shape[1]-crop[9]]
    else:
        output_img = out_render_scale
    cv2.imwrite(d_dir+"/"+image_name.split(os.sep)[-1]+'.png',
                output_img
                )


def test():
    source_names_list = []
    for name in sorted(os.listdir(args.test_input_person_images)):
        thissource = os.path.join(args.test_input_person_images, name)
        if os.path.isfile(thissource):
            source_names_list.append(thissource)
        if os.path.isdir(thissource):
            print("skipping empty folder :"+thissource)

    image_names_list = []
    for name in sorted(os.listdir(args.test_input_poses_images)):
        thistarget = os.path.join(args.test_input_poses_images, name)
        if os.path.isfile(thistarget):
            image_names_list.append([thistarget, *source_names_list])
        if os.path.isdir(thistarget):
            print("skipping folder :"+thistarget)
    print(image_names_list)

    print("---building models")
    conrmodel = CoNR(args)
    conrmodel.load_model(path=args.test_checkpoint_dir)
    conrmodel.dist()
    infer(args, conrmodel, image_names_list)

# def test():
#     source_names_list = []
#     for name in os.listdir(args.test_input_person_images):
#         thissource = os.path.join(args.test_input_person_images, name)
#         if os.path.isfile(thissource):
#             source_names_list.append([thissource])
#         if os.path.isdir(thissource):
#             toadd = [os.path.join(thissource, this_file)
#                      for this_file in os.listdir(thissource)]
#             if (toadd != []):
#                 source_names_list.append(toadd)
#             else:
#                 print("skipping empty folder :"+thissource)
#     image_names_list = []
#     for eachlist in source_names_list:
#         for name in sorted(os.listdir(args.test_input_poses_images)):
#             thistarget = os.path.join(args.test_input_poses_images, name)
#             if os.path.isfile(thistarget):
#                 image_names_list.append([thistarget, *eachlist])
#             if os.path.isdir(thistarget):
#                 print("skipping folder :"+thistarget)

#     print(image_names_list)
#     print("---building models...")
#     conrmodel = CoNR(args)
#     conrmodel.load_model(path=args.test_checkpoint_dir)
#     conrmodel.dist()
#     infer(args, conrmodel, image_names_list)


def infer(args, humanflowmodel, image_names_list):
    print("---test images: ", len(image_names_list))
    test_salobj_dataset = FileDataset(image_names_list=image_names_list,
                                      fg_img_lbl_transform=transforms.Compose([
                                          RandomResizedCropWithAutoCenteringAndZeroPadding(
                                              (args.dataloader_imgsize, args.dataloader_imgsize), scale=(1, 1), ratio=(1.0, 1.0), center_jitter=(0.0, 0.0)
                                          )]),
                                      shader_pose_use_gt_udp_test=not args.test_pose_use_parser_udp,
                                      shader_target_use_gt_rgb_debug=False
                                      )
    sampler = data_sampler(test_salobj_dataset, shuffle=False,
                           distributed=args.distributed)
    train_data = DataLoader(test_salobj_dataset,
                            batch_size=1,
                            shuffle=False, sampler=sampler,
                            num_workers=args.dataloaders)

    # start testing

    train_num = train_data.__len__()
    time_stamp = time.time()
    prev_frame_rgb = []
    prev_frame_a = []
    for i, data in enumerate(train_data):
        data_time_interval = time.time() - time_stamp
        time_stamp = time.time()
        with torch.no_grad():
            data["character_images"] = torch.cat(
                [data["character_images"], *prev_frame_rgb], dim=1)
            data["character_masks"] = torch.cat(
                [data["character_masks"], *prev_frame_a], dim=1)
            data = humanflowmodel.data_norm_image(data)
            pred = humanflowmodel.model_step(data, training=False)
            # remember to call humanflowmodel.reset_charactersheet() if you change character .

        train_time_interval = time.time() - time_stamp
        time_stamp = time.time()
        if i % 5 == 0 and args.local_rank == 0:
            print("[infer batch: %4d/%4d] time:%2f+%2f" % (
                i, train_num,
                data_time_interval, train_time_interval
            ))
        with torch.no_grad():

            if args.test_output_video:
                pred_img = pred["shader"]["y_weighted_warp_decoded_rgba"]
                save_output(
                    str(int(data["imidx"].cpu().item())), pred_img, args.test_output_dir, crop=data["pose_crop"])

            if args.test_output_udp:
                pred_img = pred["shader"]["x_target_sudp_a"]
                save_output(
                    "udp_"+str(int(data["imidx"].cpu().item())), pred_img, args.test_output_dir)


def build_args():
    parser = argparse.ArgumentParser()
    # distributed learning settings
    parser.add_argument("--world_size", type=int, default=1,
                        help='world size')
    parser.add_argument("--local_rank", type=int, default=0,
                        help='local_rank, DON\'T change it')

    # model settings
    parser.add_argument('--dataloader_imgsize', type=int, default=256,
                        help='Input image size of the model')
    parser.add_argument('--batch_size', type=int, default=4,
                        help='minibatch size')
    parser.add_argument('--model_name', default='model_result',
                        help='Name of the experiment')
    parser.add_argument('--dataloaders', type=int, default=2,
                        help='Num of dataloaders')
    parser.add_argument('--mode', default="test", choices=['train', 'test'],
                        help='Training mode or Testing mode')

    # i/o settings
    parser.add_argument('--test_input_person_images',
                        type=str, default="./character_sheet/",
                        help='Directory to input character sheets')
    parser.add_argument('--test_input_poses_images', type=str,
                        default="./test_data/",
                        help='Directory to input UDP sequences or pose images')
    parser.add_argument('--test_checkpoint_dir', type=str,
                        default='./weights/',
                        help='Directory to model weights')
    parser.add_argument('--test_output_dir', type=str,
                        default="./results/",
                        help='Directory to output images')

    # output content settings
    parser.add_argument('--test_output_video', type=strtobool, default=True,
                        help='Whether to output the final result of CoNR, \
                            images will be output to test_output_dir while True.')
    parser.add_argument('--test_output_udp', type=strtobool, default=False,
                        help='Whether to output UDP generated from UDP detector, \
                            this is meaningful ONLY when test_input_poses_images \
                            is not UDP sequences but pose images. Meanwhile, \
                            test_pose_use_parser_udp need to be True')

    # UDP detector settings
    parser.add_argument('--test_pose_use_parser_udp',
                        type=strtobool, default=False,
                        help='Whether to use UDP detector to generate UDP from pngs, \
                            pose input MUST be pose images instead of UDP sequences \
                            while True')

    args = parser.parse_args()

    args.distributed = (args.world_size > 1)
    if args.local_rank == 0:
        print("batch_size:", args.batch_size, flush=True)
    if args.distributed:
        if args.local_rank == 0:
            print("world_size: ", args.world_size)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://", world_size=args.world_size)
        torch.cuda.set_device(args.local_rank)
        torch.backends.cudnn.benchmark = True
    else:
        args.local_rank = 0

    return args


if __name__ == "__main__":
    args = build_args()
    test()
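The test-mode entry point above is driven entirely by the flags defined in build_args(). As a hedged example, a single-process run with the parser's own defaults spelled out might be launched as below; the flag names come from the parser above, while the subprocess wrapper is only for illustration (any shell runner would do).

# Hedged invocation example: run inference with the defaults made explicit.
import subprocess

subprocess.run([
    "python", "train.py",
    "--mode", "test",
    "--dataloader_imgsize", "256",
    "--test_input_person_images", "./character_sheet/",
    "--test_input_poses_images", "./test_data/",
    "--test_checkpoint_dir", "./weights/",
    "--test_output_dir", "./results/",
], check=True)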