Spaces:

omeregev
/

click2mask

Sleeping

App Files Files Community

click2mask / ldm /image_reconstruction.py

omeregev

Initial commit

6df18f5 10 days ago

raw

history blame contribute delete

12.8 kB

	import argparse
	import copy
	import os
	from pathlib import Path

	from PIL import Image
	import lpips
	from tqdm import tqdm
	import matplotlib.pyplot as plt
	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from einops import rearrange
	from general_utils.seamless_cloning import poisson_seamless_clone
	from omegaconf import OmegaConf
	from torch import optim
	from torch.utils.data.dataloader import DataLoader
	from torch.utils.data.dataset import Dataset
	from torchvision.transforms import Compose, Resize, ToTensor
	from torchvision.utils import make_grid

	from ldm.image_editor import load_model_from_config, read_image, read_mask
	from ldm.models.diffusion.ddpm import LatentDiffusion


	class ImagesDataset(Dataset):
	    """Dataset serving every entry of one directory as an image tensor.

	    Entries are ordered by file name. Each item is opened as RGB, passed
	    through ``transform`` and then rescaled with ``x * 2 - 1``.
	    """

	    def __init__(self, source_path, transform, indices=None):
	        """Collect and order the file names to serve.

	        Args:
	            source_path: Directory whose entries are the images.
	            transform: Callable applied to each RGB ``PIL.Image``; its result
	                must support ``* 2.0 - 1.0``.
	            indices: Optional positions into the name-sorted listing to keep,
	                in the given order. ``None`` keeps every entry.

	        Raises:
	            IndexError: If an entry of ``indices`` is out of range.
	        """
	        self.source_path = Path(source_path)
	        names = sorted(os.listdir(self.source_path))
	        if indices is not None:
	            names = [names[i] for i in indices]
	        self.img_names = names
	        self.transform = transform

	    def __len__(self):
	        """Return the number of images that will be served."""
	        return len(self.img_names)

	    def __getitem__(self, idx):
	        """Load image ``idx`` and return ``transform(image) * 2 - 1``.

	        An out-of-range ``idx`` raises ``IndexError`` from the name lookup,
	        which is what ends a plain ``for sample in dataset`` loop.
	        """
	        image_path = self.source_path / self.img_names[idx]
	        # ``convert`` returns a fully loaded, independent copy, so the file
	        # handle can be closed immediately instead of waiting for GC.
	        with Image.open(image_path) as image_file:
	            image = image_file.convert("RGB")

	        return self.transform(image) * 2.0 - 1.0


	class ImageReconstruction:
	    """Command-line driven background reconstruction for edited images.

	    Constructing an instance runs the whole pipeline: parse the CLI options,
	    load the model, read the source image and mask, reconstruct every image
	    found under ``<images_path>/images`` with the selected method and write
	    the results (individual files plus one overview grid) under
	    ``images_path``.

	    Throughout the class a mask value of 1 selects the foreground (taken from
	    the edited sample) and 0 selects the background (taken from the source
	    image), as can be seen in ``loss`` and in the "pixel" reconstruction.
	    """

	    def __init__(
	        self,
	        verbose: bool = False,
	    ):
	        """Run the reconstruction pipeline end to end.

	        Args:
	            verbose: When true, the optimization loop prints the loss on every
	                step and plots the intermediate result every 25 steps.
	        """
	        self.opt = self.get_arguments()
	        config = OmegaConf.load("configs/latent-diffusion/txt2img-1p4B-eval.yaml")
	        self.device = (
	            torch.device(f"cuda:{self.opt.gpu_id}")
	            if torch.cuda.is_available()
	            else torch.device("cpu")
	        )
	        self.model = load_model_from_config(
	            config=config, ckpt="models/ldm/text2img-large/model.ckpt", device=self.device
	        )
	        self.model = self.model.to(self.device)

	        # Sizes are passed as (width, height); the mask is additionally read
	        # at 1/8 of the image resolution.
	        img_size = (self.opt.W, self.opt.H)
	        mask_size = (self.opt.W // 8, self.opt.H // 8)
	        self.init_image = read_image(
	            img_path=self.opt.init_image, device=self.device, dest_size=img_size
	        )
	        self.mask, self.org_mask = read_mask(
	            mask_path=self.opt.mask, device=self.device, dest_size=mask_size, img_size=img_size
	        )
	        if self.opt.invert_mask:
	            self.mask = 1 - self.mask
	            self.org_mask = 1 - self.org_mask

	        self.verbose = verbose
	        # NOTE: an LPIPS model used to be created here but was left disabled;
	        # ``loss`` only uses MSE terms.

	        samples_dataset = ImagesDataset(
	            source_path=os.path.join(self.opt.images_path, "images"),
	            transform=ToTensor(),
	            indices=self.opt.selected_indices,
	        )

	        reconstructed_samples = self._reconstruct_background(samples_dataset)
	        self._save_visualization(reconstructed_samples)

	    def get_arguments(self, args=None):
	        """Parse the command-line options.

	        Args:
	            args: Optional list of argument strings. ``None`` (the default)
	                lets ``argparse`` read ``sys.argv[1:]``.

	        Returns:
	            The populated ``argparse.Namespace``.
	        """
	        parser = argparse.ArgumentParser()

	        parser.add_argument("--init_image", type=str, default="", help="a source image to edit")
	        parser.add_argument("--mask", type=str, default="", help="a mask to edit the image")
	        parser.add_argument(
	            "--invert_mask",
	            help="Indicator enabling inverting the input mask",
	            action="store_true",
	            dest="invert_mask",
	        )
	        parser.add_argument(
	            "--images_path",
	            type=str,
	            default="outputs/edit_results/samples/",
	            help="The path for the images to reconstruct",
	        )
	        parser.add_argument(
	            "--H",
	            type=int,
	            default=256,
	            help="image height, in pixel space",
	        )
	        parser.add_argument(
	            "--W",
	            type=int,
	            default=256,
	            help="image width, in pixel space",
	        )
	        parser.add_argument(
	            "--batch_size",
	            type=int,
	            default=16,
	            # The help text used to be a copy of the --optimization_steps one.
	            help="The batch size",
	        )
	        parser.add_argument(
	            "--optimization_steps",
	            type=int,
	            default=75,
	            help="The number of optimization steps in case of optimization",
	        )
	        parser.add_argument(
	            "--reconstruction_type",
	            type=str,
	            help="The background reconstruction type",
	            default="optimization",
	            choices=["optimization", "pixel", "poisson"],
	        )
	        parser.add_argument(
	            "--optimization_mode",
	            type=str,
	            help="The optimization mode in case of optimization reconstruction type",
	            default="weights",
	            choices=["weights", "latents"],
	        )
	        parser.add_argument(
	            "--selected_indices",
	            type=int,
	            nargs="+",
	            default=None,
	            help="The indices to reconstruct, if not given - will reconstruct all the images",
	        )

	        # Misc
	        parser.add_argument(
	            "--gpu_id",
	            type=int,
	            default=0,
	            help="The GPU specific id",
	        )

	        return parser.parse_args(args)

	    def _reconstruct_background(self, samples):
	        """Reconstruct the background of every sample.

	        The method is chosen by ``self.opt.reconstruction_type``:

	        * ``"pixel"``: paste the sample over the source image using the
	          full-resolution mask.
	        * ``"poisson"``: blend the sample into the source image with
	          ``poisson_seamless_clone``.
	        * ``"optimization"``: run ``reconstruct_image_by_optimization``.

	        Args:
	            samples: Iterable of image tensors in ``[-1, 1]``, one image each
	                (channels first, no batch axis).

	        Returns:
	            All reconstructed images concatenated along the first axis.

	        Raises:
	            ValueError: If the reconstruction type is not one of the above.
	            RuntimeError: If ``samples`` yields no image.
	        """
	        reconstructed_samples = []
	        reconstruction_type = self.opt.reconstruction_type

	        if reconstruction_type == "pixel":
	            foreground_mask = self.org_mask[0]
	            for sample in samples:
	                composed = sample.to(self.device) * foreground_mask + self.init_image * (
	                    1 - foreground_mask
	                )
	                # Map from [-1, 1] to [0, 1].
	                composed = torch.clamp((composed + 1.0) / 2.0, min=0.0, max=1.0)
	                reconstructed_samples.append(composed)
	        elif reconstruction_type == "poisson":
	            mask_numpy = self.org_mask.squeeze().cpu().numpy()
	            init_image_numpy = rearrange(
	                ((self.init_image + 1) / 2).squeeze().cpu().numpy(), "c h w -> h w c"
	            )

	            for sample in samples:
	                sample = torch.clamp((sample + 1.0) / 2.0, min=0.0, max=1.0)
	                curr_sample = rearrange(sample.cpu().numpy(), "c h w -> h w c")
	                cloned_sample = poisson_seamless_clone(
	                    source_image=curr_sample,
	                    destination_image=init_image_numpy,
	                    mask=mask_numpy,
	                )
	                # Back to a channels-first tensor with a leading batch axis.
	                cloned_sample = torch.from_numpy(
	                    cloned_sample[np.newaxis, ...].transpose(0, 3, 1, 2)
	                ).to(self.device)
	                reconstructed_samples.append(cloned_sample)
	        elif reconstruction_type == "optimization":
	            for sample in samples:
	                optimized_sample = self.reconstruct_image_by_optimization(
	                    fg_image=sample.to(self.device).unsqueeze(0),
	                    bg_image=self.init_image,
	                    mask=self.org_mask,
	                )
	                optimized_sample = torch.clamp(optimized_sample, min=0.0, max=1.0)
	                reconstructed_samples.append(optimized_sample)
	        else:
	            raise ValueError("Missing reconstruction type")

	        if not reconstructed_samples:
	            # ``torch.cat`` would fail on an empty list with a RuntimeError
	            # that does not hint at the cause; keep the type, fix the message.
	            raise RuntimeError("No images to reconstruct: the samples collection is empty")

	        return torch.cat(reconstructed_samples)

	    def loss(
	        self,
	        fg_image: torch.Tensor,
	        bg_image: torch.Tensor,
	        curr_latent: torch.Tensor,
	        mask: torch.Tensor,
	        preservation_ratio: float = 100,
	    ):
	        """Return the masked reconstruction loss of the decoded latent.

	        The decoded latent is compared (MSE) with ``fg_image`` where ``mask``
	        is 1 and with ``bg_image`` where it is 0; only the background term is
	        multiplied by ``preservation_ratio``.

	        Args:
	            fg_image: Image supplying the masked (foreground) region.
	            bg_image: Image supplying the unmasked (background) region.
	            curr_latent: Latent passed to ``self.model.decode_first_stage``.
	            mask: Foreground mask, broadcastable against the images.
	            preservation_ratio: Weight of the background term.

	        Returns:
	            The scalar loss tensor.
	        """
	        curr_reconstruction = self.model.decode_first_stage(curr_latent)
	        background = 1 - mask

	        foreground_loss = F.mse_loss(fg_image * mask, curr_reconstruction * mask)
	        background_loss = F.mse_loss(bg_image * background, curr_reconstruction * background)
	        # NOTE: an LPIPS-based alternative was left disabled in the original.

	        return foreground_loss + background_loss * preservation_ratio

	    @torch.no_grad()
	    def get_curr_reconstruction(self, curr_latent):
	        """Decode ``curr_latent`` and map the result from [-1, 1] to [0, 1]."""
	        curr_reconstruction = self.model.decode_first_stage(curr_latent)
	        curr_reconstruction = torch.clamp((curr_reconstruction + 1.0) / 2.0, min=0.0, max=1.0)

	        return curr_reconstruction

	    @torch.no_grad()
	    def plot_reconstructed_image(self, curr_latent, fg_image, bg_image, mask):
	        """Show background, foreground, naive composite and current decoding.

	        The four images of the first batch element are displayed side by side
	        with matplotlib. Inputs are expected in [-1, 1] and are mapped to
	        [0, 1] for display.
	        """
	        curr_reconstruction = self.get_curr_reconstruction(curr_latent=curr_latent)
	        curr_reconstruction = curr_reconstruction[0].cpu().numpy().transpose(1, 2, 0)

	        fg_image = torch.clamp((fg_image + 1.0) / 2.0, min=0.0, max=1.0)
	        fg_image = fg_image[0].cpu().numpy().transpose(1, 2, 0)

	        bg_image = torch.clamp((bg_image + 1.0) / 2.0, min=0.0, max=1.0)
	        bg_image = bg_image[0].cpu().numpy().transpose(1, 2, 0)

	        mask = mask[0].detach().cpu().numpy().transpose(1, 2, 0)
	        composed = fg_image * mask + bg_image * (1 - mask)

	        plt.imshow(np.hstack([bg_image, fg_image, composed, curr_reconstruction]))
	        plt.axis("off")
	        plt.tight_layout()
	        plt.show()

	    def _save_visualization(self, samples, images_per_row: int = 6):
	        """Save every sample and an overview grid of inputs plus samples.

	        The grid is written to
	        ``<images_path>/reconstructed_<reconstruction_type>.png``. When a
	        source image is available its first row holds: the source image, its
	        encode/decode round trip, the full-resolution mask and the low
	        resolution mask resized to ``(H, W)``, padded with white images up to
	        ``images_per_row``.

	        Args:
	            samples: Batch of images in [0, 1], channels first.
	            images_per_row: Number of images per grid row.
	        """
	        self._save_images(samples)

	        # Add source image and mask to visualization
	        if self.init_image is not None:
	            blank_image = torch.ones_like(self.init_image)
	            # Work on locals: overwriting ``self.org_mask`` with the
	            # 3-channel copy would change its shape for any later use and
	            # break a second call of this method.
	            if self.mask is None:
	                org_mask_rgb = blank_image
	                resized_mask = blank_image
	            else:
	                org_mask_rgb = self.org_mask.repeat((1, 3, 1, 1))

	                resized_mask = F.interpolate(self.mask, size=(self.opt.H, self.opt.W))
	                resized_mask = resized_mask.repeat((1, 3, 1, 1))

	            # Display only: no gradients are needed for the round trip.
	            with torch.no_grad():
	                encoder_posterior = self.model.encode_first_stage(self.init_image)
	                encoder_result = self.model.get_first_stage_encoding(encoder_posterior)
	                reconstructed_image = self.model.decode_first_stage(encoder_result)
	            reconstructed_image = torch.clamp((reconstructed_image + 1.0) / 2.0, min=0.0, max=1.0)

	            inputs_row = [
	                (self.init_image + 1) / 2,
	                reconstructed_image,
	                org_mask_rgb,
	                resized_mask,
	            ]
	            # A non-positive count yields no padding, like the original range().
	            inputs_row = inputs_row + [blank_image] * (images_per_row - len(inputs_row))

	            samples = torch.cat([torch.cat(inputs_row), samples])

	        grid = make_grid(samples, nrow=images_per_row)
	        grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy()
	        Image.fromarray(grid.astype(np.uint8)).save(
	            os.path.join(self.opt.images_path, f"reconstructed_{self.opt.reconstruction_type}.png")
	        )

	    def _save_images(self, samples):
	        """Write each sample to ``<images_path>/reconstructed_<type>/NNNN.png``.

	        Values are scaled by 255 and truncated to ``uint8``; the directory is
	        created when missing.
	        """
	        samples_dir = os.path.join(
	            self.opt.images_path,
	            f"reconstructed_{self.opt.reconstruction_type}",
	        )
	        os.makedirs(samples_dir, exist_ok=True)
	        for i, sample in enumerate(samples):
	            pixels = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
	            Image.fromarray(pixels.astype(np.uint8)).save(os.path.join(samples_dir, f"{i:04}.png"))

	    def reconstruct_image_by_optimization(
	        self, fg_image: torch.Tensor, bg_image: torch.Tensor, mask: torch.Tensor
	    ):
	        """Optimize a decoding that matches both foreground and background.

	        Starting from the encoding of ``fg_image``, ``self.loss`` is minimized
	        with Adam for ``self.opt.optimization_steps`` steps. Depending on
	        ``self.opt.optimization_mode`` either the first-stage decoder weights
	        (``"weights"``, lr 1e-4) or the latent itself (any other value,
	        lr 0.1) is optimized.

	        In "weights" mode the decoder is deep-copied beforehand and the copy
	        is put back afterwards, also when the optimization raises, so the
	        per-image fine-tuning never leaks into the shared model.

	        Args:
	            fg_image: Batched image supplying the masked region.
	            bg_image: Batched image supplying the unmasked region.
	            mask: Foreground mask, broadcastable against the images.

	        Returns:
	            The final decoding, mapped to [0, 1].
	        """
	        # The initial latent is detached below, so no graph is needed here.
	        with torch.no_grad():
	            encoder_posterior = self.model.encode_first_stage(fg_image)
	            initial_latent = self.model.get_first_stage_encoding(encoder_posterior)

	        optimize_weights = self.opt.optimization_mode == "weights"
	        # Snapshot before ``requires_grad_`` so the copy keeps the original
	        # weights and their original ``requires_grad`` flags.
	        decoder_copy = (
	            copy.deepcopy(self.model.first_stage_model.decoder) if optimize_weights else None
	        )

	        try:
	            if optimize_weights:
	                curr_latent = initial_latent.clone().detach()
	                self.model.first_stage_model.decoder.requires_grad_(True)
	                optimizer = optim.Adam(
	                    self.model.first_stage_model.decoder.parameters(), lr=0.0001
	                )
	            else:
	                curr_latent = initial_latent.clone().detach().requires_grad_(True)
	                optimizer = optim.Adam([curr_latent], lr=0.1)

	            for i in tqdm(range(self.opt.optimization_steps), desc="Reconstruction optimization"):
	                if self.verbose and i % 25 == 0:
	                    self.plot_reconstructed_image(
	                        curr_latent=curr_latent,
	                        fg_image=fg_image,
	                        bg_image=bg_image,
	                        mask=mask,
	                    )
	                optimizer.zero_grad()

	                loss = self.loss(
	                    fg_image=fg_image, bg_image=bg_image, curr_latent=curr_latent, mask=mask
	                )

	                if self.verbose:
	                    print(f"Iteration {i}: Curr loss is {loss}")

	                loss.backward()
	                optimizer.step()

	            return self.get_curr_reconstruction(curr_latent=curr_latent)
	        finally:
	            if optimize_weights:
	                self.model.first_stage_model.decoder = decoder_copy