New logic
- app.py +72 -118
- model/__init__.py +2 -0
- model/convnext.py +55 -0
- model/edsr.py +122 -0
- model/hyper.py +41 -0
- model/init.py +24 -0
- model/rdn.py +72 -0
- model/swin_ir.py +532 -0
- model/tail.py +18 -0
- model/thera.py +175 -0
- requirements.txt +36 -5
- super_resolve.py +99 -0
- utils.py +36 -0
app.py
CHANGED
@@ -1,137 +1,91 @@
 import gradio as gr
 import torch
 import numpy as np
 from PIL import Image
-from
 from transformers import DPTFeatureExtractor, DPTForDepthEstimation
-from
-#
-#
-#
-)
-])
-    input_tensor = transform(image).unsqueeze(0).to(DEVICE)
-    with torch.no_grad():
-        output = model(input_tensor)
-    output_img = transforms.ToPILImage()(output.squeeze().cpu().clamp(-1, 1) * 0.5 + 0.5)
-    return output_img
-    inputs = feature_extractor(
     with torch.no_grad():
-        outputs =
-    prediction = torch.nn.functional.interpolate(
-        predicted_depth.unsqueeze(1),
-        size=image.size[::-1],
-        mode="bicubic",
-        align_corners=False,
-    )
-    return prediction.squeeze().cpu().numpy()
-
-
-def create_bas_relief(prompt, image, depth_map, pipe):
-    control_image = Image.fromarray((depth_map * 255).astype(np.uint8))
-
-    image = image.resize((1024, 1024))
-    control_image = control_image.resize((1024, 1024))
-
-    result = pipe(
-        prompt=prompt,
-        image=image,
-        control_image=control_image,
-        strength=0.8,
-        num_inference_steps=30
-    ).images[0]
-
-# --- Gradio interface ---
-
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox("
-
         with gr.Column():
-
-    depth_model = load_depth_model()
-    feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
-    basrelief_pipe = load_controlnet()
-
-    # 1. Super-resolution
-    upscaled = run_thera(image, thera_model)
-
-    # 2. Depth map
-    depth = create_depth_map(upscaled, depth_model, feature_extractor)
-    depth_normalized = (depth - depth.min()) / (depth.max() - depth.min())
-
-    # 3. Bas-relief
-    basrelief = create_bas_relief(prompt, upscaled, depth_normalized, basrelief_pipe)
-
-    return upscaled, depth_normalized, basrelief
-
-
-    submit_btn.click(
-        process,
-        inputs=[input_image, prompt],
-        outputs=[upscaled_output, depth_output, basrelief_output]
     )
 
 if __name__ == "__main__":
 import gradio as gr
 import torch
+import jax
 import numpy as np
 from PIL import Image
+from diffusers import StableDiffusionXLImg2ImgPipeline
 from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from super_resolve import process as thera_process  # assumes the Thera imports
+
+# Configuration
+DEVICE = "cpu"  # or "cuda" if available
+JAX_DEVICE = jax.devices("cpu")[0]  # use CPU for JAX
+
+# 1. Load the Thera models (EDSR/RDN)
+# (implement as in the original Thera code)
+model_edsr, params_edsr = None, None  # load via pickle/HF Hub
+
+# 2. Load SDXL Img2Img + LoRA
+print("Loading SDXL Img2Img with LoRA...")
+pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float32
+).to(DEVICE)
+pipe.load_lora_weights("KappaNeuro/bas-relief", weight_name="BAS-RELIEF.safetensors")
+
+# 3. Load the depth model
+print("Loading DPT...")
+feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
+depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(DEVICE)
+
+
+def enhance_depth_map(depth_arr):
+    depth_normalized = (depth_arr - depth_arr.min()) / (depth_arr.max() - depth_arr.min() + 1e-8)
+    return Image.fromarray((depth_normalized * 255).astype(np.uint8))
+
+
+def full_pipeline(image, prompt, scale_factor=2.0):
+    # 1. Super-resolution with Thera
+    source = np.array(image) / 255.0
+    target_shape = (int(image.height * scale_factor), int(image.width * scale_factor))
+    upscaled = thera_process(source, model_edsr, params_edsr, target_shape, do_ensemble=True)
+    upscaled_pil = Image.fromarray((upscaled * 255).astype(np.uint8))
+
+    # 2. Generate the bas-relief with SDXL Img2Img
+    full_prompt = f"BAS-RELIEF {prompt}, intricate carving, marble relief"
+    bas_relief = pipe(
+        prompt=full_prompt,
+        image=upscaled_pil,
+        strength=0.7,
+        num_inference_steps=25,
+        guidance_scale=7.5
+    ).images[0]
 
+    # 3. Compute the depth map
+    inputs = feature_extractor(bas_relief, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
+        outputs = depth_model(**inputs)
+        depth = outputs.predicted_depth
 
+    depth_map = torch.nn.functional.interpolate(
+        depth.unsqueeze(1),
+        size=bas_relief.size[::-1],
+        mode="bicubic"
+    ).squeeze().cpu().numpy()
 
+    return upscaled_pil, bas_relief, enhance_depth_map(depth_map)
 
 
+# Gradio interface
+with gr.Blocks(title="Super-Resolution + Bas-Relief") as app:
+    gr.Markdown("## 📈 Super-Resolution + 🗿 Bas-Relief + 🗺️ Depth Map")
 
     with gr.Row():
         with gr.Column():
+            img_input = gr.Image(type="pil", label="Input Image")
+            prompt = gr.Textbox("ancient sculpture, marble", label="Relief Description")
+            scale = gr.Slider(1.0, 4.0, value=2.0, label="Scale Factor")
+            btn = gr.Button("Process")
 
         with gr.Column():
+            img_upscaled = gr.Image(label="Super-Resolved Image")
+            img_basrelief = gr.Image(label="Sculptural Relief")
+            img_depth = gr.Image(label="Depth Map")
+
+    btn.click(
+        full_pipeline,
+        inputs=[img_input, prompt, scale],
+        outputs=[img_upscaled, img_basrelief, img_depth]
     )
 
 if __name__ == "__main__":
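Note: the commit leaves `model_edsr, params_edsr = None, None` as a placeholder. A minimal loader sketch in the spirit of super_resolve.main(), which reads a pickled checkpoint and rebuilds the model from its stored `backbone`/`size` strings; the repo id and filename below are assumptions for illustration, not part of this commit:

import pickle
from huggingface_hub import hf_hub_download
from model import build_thera

def load_thera_checkpoint(repo_id: str, filename: str):
    # download the pickled checkpoint (hypothetical repo/filename) and rebuild the model
    path = hf_hub_download(repo_id=repo_id, filename=filename)
    with open(path, 'rb') as fh:
        check = pickle.load(fh)
    # the checkpoint stores params plus the backbone/size strings needed by build_thera
    model = build_thera(3, check['backbone'], check['size'])
    return model, check['model']

# model_edsr, params_edsr = load_thera_checkpoint("<user>/<thera-repo>", "thera-edsr.pkl")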
model/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .hyper import Hypernetwork
from .thera import build_thera
model/convnext.py
ADDED
@@ -0,0 +1,55 @@
import flax.linen as nn
from jaxtyping import Array, ArrayLike


class ConvNeXtBlock(nn.Module):
    """ConvNext block. See Fig. 4 in "A ConvNet for the 2020s" by Liu et al.

    https://openaccess.thecvf.com/content/CVPR2022/papers/Liu_A_ConvNet_for_the_2020s_CVPR_2022_paper.pdf
    """
    n_dims: int = 64
    kernel_size: int = 3  # 7 in the paper's version
    group_features: bool = False

    def setup(self) -> None:
        self.residual = nn.Sequential([
            nn.Conv(self.n_dims, kernel_size=(self.kernel_size, self.kernel_size), use_bias=False,
                    feature_group_count=self.n_dims if self.group_features else 1),
            nn.LayerNorm(),
            nn.Conv(4 * self.n_dims, kernel_size=(1, 1)),
            nn.gelu,
            nn.Conv(self.n_dims, kernel_size=(1, 1)),
        ])

    def __call__(self, x: ArrayLike) -> Array:
        return x + self.residual(x)


class Projection(nn.Module):
    n_dims: int

    @nn.compact
    def __call__(self, x: ArrayLike) -> Array:
        x = nn.LayerNorm()(x)
        x = nn.Conv(self.n_dims, (1, 1))(x)
        return x


class ConvNeXt(nn.Module):
    block_defs: list[tuple]

    def setup(self) -> None:
        layers = []
        current_size = self.block_defs[0][0]
        for block_def in self.block_defs:
            if block_def[0] != current_size:
                layers.append(Projection(block_def[0]))
            layers.append(ConvNeXtBlock(*block_def))
            current_size = block_def[0]
        self.layers = layers

    def __call__(self, x: ArrayLike, _: bool) -> Array:
        for layer in self.layers:
            x = layer(x)
        return x
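A small shape sketch of how `block_defs` drives ConvNeXt: each tuple is unpacked as ConvNeXtBlock(n_dims, kernel_size, group_features), and a Projection is inserted whenever the width changes. Sizes and values here are illustrative only:

import jax
import jax.numpy as jnp
from model.convnext import ConvNeXt

blocks = [(64, 3, True)] * 2 + [(96, 3, True)] * 2   # a Projection(96) is inserted at the width change
net = ConvNeXt(blocks)
x = jnp.zeros((1, 48, 48, 64))                        # NHWC layout, as flax.linen.Conv expects
params = net.init(jax.random.PRNGKey(0), x, False)    # second argument is the unused training flag
y = net.apply(params, x, False)
print(y.shape)                                        # (1, 48, 48, 96)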
model/edsr.py
ADDED
@@ -0,0 +1,122 @@
# from https://github.com/isaaccorley/jax-enhance

from functools import partial
from typing import Any, Sequence, Callable

import jax.numpy as jnp
import flax.linen as nn
from flax.core.frozen_dict import freeze
import einops


class PixelShuffle(nn.Module):
    scale_factor: int

    def setup(self):
        self.layer = partial(
            einops.rearrange,
            pattern="b h w (c h2 w2) -> b (h h2) (w w2) c",
            h2=self.scale_factor,
            w2=self.scale_factor
        )

    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
        return self.layer(x)


class ResidualBlock(nn.Module):
    channels: int
    kernel_size: Sequence[int]
    res_scale: float
    activation: Callable
    dtype: Any = jnp.float32

    def setup(self):
        self.body = nn.Sequential([
            nn.Conv(features=self.channels, kernel_size=self.kernel_size, dtype=self.dtype),
            self.activation,
            nn.Conv(features=self.channels, kernel_size=self.kernel_size, dtype=self.dtype),
        ])

    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
        return x + self.body(x)


class UpsampleBlock(nn.Module):
    num_upsamples: int
    channels: int
    kernel_size: Sequence[int]
    dtype: Any = jnp.float32

    def setup(self):
        layers = []
        for _ in range(self.num_upsamples):
            layers.extend([
                nn.Conv(features=self.channels * 2 ** 2, kernel_size=self.kernel_size, dtype=self.dtype),
                PixelShuffle(scale_factor=2),
            ])
        self.layers = layers

    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
        for layer in self.layers:
            x = layer(x)
        return x


class EDSR(nn.Module):
    """Enhanced Deep Residual Networks for Single Image Super-Resolution https://arxiv.org/pdf/1707.02921v1.pdf"""
    scale_factor: int
    channels: int = 3
    num_blocks: int = 32
    num_feats: int = 256
    dtype: Any = jnp.float32

    def setup(self):
        # pre res blocks layer
        self.head = nn.Sequential([nn.Conv(features=self.num_feats, kernel_size=(3, 3), dtype=self.dtype)])

        # res blocks
        res_blocks = [
            ResidualBlock(channels=self.num_feats, kernel_size=(3, 3), res_scale=0.1, activation=nn.relu, dtype=self.dtype)
            for i in range(self.num_blocks)
        ]
        res_blocks.append(nn.Conv(features=self.num_feats, kernel_size=(3, 3), dtype=self.dtype))
        self.body = nn.Sequential(res_blocks)

    def __call__(self, x: jnp.ndarray, _=None) -> jnp.ndarray:
        x = self.head(x)
        x = x + self.body(x)
        return x


def convert_edsr_checkpoint(torch_dict, no_upsampling=True):
    def convert(in_dict):
        top_keys = set([k.split('.')[0] for k in in_dict.keys()])
        leaves = set([k for k in in_dict.keys() if '.' not in k])

        # convert leaves
        out_dict = {}
        for l in leaves:
            if l == 'weight':
                out_dict['kernel'] = jnp.asarray(in_dict[l]).transpose((2, 3, 1, 0))
            elif l == 'bias':
                out_dict[l] = jnp.asarray(in_dict[l])
            else:
                out_dict[l] = in_dict[l]

        for top_key in top_keys.difference(leaves):
            new_top_key = 'layers_' + top_key if top_key.isdigit() else top_key
            out_dict[new_top_key] = convert(
                {k[len(top_key) + 1:]: v for k, v in in_dict.items() if k.startswith(top_key)})
        return out_dict

    converted = convert(torch_dict)

    # remove unwanted keys
    if no_upsampling:
        del converted['tail']

    for k in ('add_mean', 'sub_mean'):
        del converted[k]

    return freeze(converted)
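A hedged sketch of the intended use of convert_edsr_checkpoint: turn a PyTorch EDSR state dict into a Flax parameter tree for the no-upsampling backbone. The checkpoint filename is an assumption, and a raw state dict (as saved by EDSR-PyTorch) is assumed:

import torch
from model.edsr import convert_edsr_checkpoint

state = torch.load('edsr_baseline_x2.pt', map_location='cpu')   # hypothetical checkpoint path
state = {k: v.numpy() for k, v in state.items()}
# drops tail/add_mean/sub_mean and transposes conv kernels OIHW -> HWIO
flax_params = convert_edsr_checkpoint(state)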
model/hyper.py
ADDED
@@ -0,0 +1,41 @@
import math

import jax
import jax.numpy as jnp
import flax.linen as nn
from jaxtyping import Array, ArrayLike, PyTreeDef
import numpy as np

from utils import interpolate_grid


class Hypernetwork(nn.Module):
    encoder: nn.Module
    refine: nn.Module
    output_params_shape: list[tuple]  # e.g. [(16,), (32, 32), ...]
    tree_def: PyTreeDef  # used to reconstruct the parameter sets

    def setup(self):
        # one layer 1x1 conv to calculate field params, as in SIREN paper
        output_size = sum(math.prod(s) for s in self.output_params_shape)
        self.out_conv = nn.Conv(output_size, kernel_size=(1, 1), use_bias=True)

    def get_encoding(self, source: ArrayLike, training=False) -> Array:
        """Convenience method for whole-image evaluation"""
        return self.refine(self.encoder(source, training), training)

    def get_params_at_coords(self, encoding: ArrayLike, coords: ArrayLike) -> Array:
        encoding = interpolate_grid(coords, encoding)
        phi_params = self.out_conv(encoding)

        # reshape to output params shape
        phi_params = jnp.split(
            phi_params, np.cumsum([math.prod(s) for s in self.output_params_shape[:-1]]), axis=-1)
        phi_params = [jnp.reshape(p, p.shape[:-1] + s) for p, s in
                      zip(phi_params, self.output_params_shape)]

        return jax.tree_util.tree_unflatten(self.tree_def, phi_params)

    def __call__(self, source: ArrayLike, target_coords: ArrayLike, training=False) -> Array:
        encoding = self.get_encoding(source, training)
        return self.get_params_at_coords(encoding, target_coords)
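A minimal numeric sketch of the Hypernetwork reshape logic above: the 1x1 conv emits one flat vector per pixel, which is split and reshaped into the per-field parameter tensors. The shapes are illustrative:

import math
import numpy as np
import jax.numpy as jnp

output_params_shape = [(32,), (2, 32)]                           # e.g. a bias and a 2->32 matrix
output_size = sum(math.prod(s) for s in output_params_shape)     # 96 channels for out_conv
flat = jnp.zeros((1, 4, 4, output_size))                         # (B, H, W, 96), as after out_conv
parts = jnp.split(flat, np.cumsum([math.prod(s) for s in output_params_shape[:-1]]), axis=-1)
tensors = [p.reshape(p.shape[:-1] + s) for p, s in zip(parts, output_params_shape)]
print([t.shape for t in tensors])                                # [(1, 4, 4, 32), (1, 4, 4, 2, 32)]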
model/init.py
ADDED
@@ -0,0 +1,24 @@
from typing import Callable

import jax
import jax.numpy as jnp
from jaxtyping import Array


def uniform_between(a: float, b: float, dtype=jnp.float32) -> Callable:
    def init(key, shape, dtype=dtype) -> Array:
        return jax.random.uniform(key, shape, dtype=dtype, minval=a, maxval=b)
    return init


def linear_up(scale: float) -> Callable:
    def init(key, shape, dtype=jnp.float32) -> Array:
        assert shape[-2] == 2
        keys = jax.random.split(key, 2)
        norm = jnp.pi * scale * (
            jax.random.uniform(keys[0], shape=(1, shape[-1])) ** .5)
        theta = 2 * jnp.pi * jax.random.uniform(keys[1], shape=(1, shape[-1]))
        x = norm * jnp.cos(theta)
        y = norm * jnp.sin(theta)
        return jnp.concatenate([x, y], axis=-2).astype(dtype)
    return init
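A quick check of linear_up's geometry: it samples the 2 x n first-layer weight columns uniformly from a disk of radius pi*scale (the square root on the uniform sample makes the density uniform in area rather than radius). Values are illustrative:

import jax
import jax.numpy as jnp
from model.init import linear_up

w = linear_up(scale=1.0)(jax.random.PRNGKey(0), (2, 4096))
norms = jnp.linalg.norm(w, axis=0)
print(w.shape, float(norms.max()))   # (2, 4096), max norm close to pi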
model/rdn.py
ADDED
@@ -0,0 +1,72 @@
# Residual Dense Network for Image Super-Resolution
# https://arxiv.org/abs/1802.08797
# modified from: https://github.com/thstkdgus35/EDSR-PyTorch

import jax.numpy as jnp
import flax.linen as nn


class RDB_Conv(nn.Module):
    growRate: int
    kSize: int = 3

    @nn.compact
    def __call__(self, x):
        out = nn.Sequential([
            nn.Conv(self.growRate, (self.kSize, self.kSize), padding=(self.kSize-1)//2),
            nn.activation.relu
        ])(x)
        return jnp.concatenate((x, out), -1)


class RDB(nn.Module):
    growRate0: int
    growRate: int
    nConvLayers: int

    @nn.compact
    def __call__(self, x):
        res = x

        for c in range(self.nConvLayers):
            x = RDB_Conv(self.growRate)(x)

        x = nn.Conv(self.growRate0, (1, 1))(x)

        return x + res


class RDN(nn.Module):
    G0: int = 64
    RDNkSize: int = 3
    RDNconfig: str = 'B'
    scale: int = 2
    n_colors: int = 3

    @nn.compact
    def __call__(self, x, _=None):
        D, C, G = {
            'A': (20, 6, 32),
            'B': (16, 8, 64),
        }[self.RDNconfig]

        # Shallow feature extraction
        f_1 = nn.Conv(self.G0, (self.RDNkSize, self.RDNkSize))(x)
        x = nn.Conv(self.G0, (self.RDNkSize, self.RDNkSize))(f_1)

        # Residual dense blocks and dense feature fusion
        RDBs_out = []
        for i in range(D):
            x = RDB(self.G0, G, C)(x)
            RDBs_out.append(x)

        x = jnp.concatenate(RDBs_out, -1)

        # Global Feature Fusion
        x = nn.Sequential([
            nn.Conv(self.G0, (1, 1)),
            nn.Conv(self.G0, (self.RDNkSize, self.RDNkSize))
        ])(x)

        x = x + f_1
        return x
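A shape sanity sketch for the RDN backbone: config 'B' stacks 16 RDBs of 8 conv layers with growth rate 64; the output keeps the spatial size and has G0=64 channels, since the upsampling tail is omitted here. Input size is illustrative:

import jax
import jax.numpy as jnp
from model.rdn import RDN

rdn = RDN()
x = jnp.zeros((1, 24, 24, 3))
params = rdn.init(jax.random.PRNGKey(0), x)
print(rdn.apply(params, x).shape)   # (1, 24, 24, 64)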
model/swin_ir.py
ADDED
@@ -0,0 +1,532 @@
import math
from typing import Callable, Optional, Iterable

import numpy as np
import jax
import jax.numpy as jnp
import flax.linen as nn
from jaxtyping import Array


def trunc_normal(mean=0., std=1., a=-2., b=2., dtype=jnp.float32) -> Callable:
    """Truncated normal initialization function"""

    def init(key, shape, dtype=dtype) -> Array:
        # https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/weight_init.py
        def norm_cdf(x):
            # Computes standard normal cumulative distribution function
            return (1. + math.erf(x / math.sqrt(2.))) / 2.

        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)
        out = jax.random.uniform(key, shape, dtype=dtype, minval=2 * l - 1, maxval=2 * u - 1)
        out = jax.scipy.special.erfinv(out) * std * math.sqrt(2.) + mean
        return jnp.clip(out, a, b)

    return init


def Dense(features, use_bias=True, kernel_init=trunc_normal(std=.02), bias_init=nn.initializers.zeros):
    return nn.Dense(features, use_bias=use_bias, kernel_init=kernel_init, bias_init=bias_init)


def LayerNorm():
    """torch LayerNorm uses a larger epsilon by default"""
    return nn.LayerNorm(epsilon=1e-05)


class Mlp(nn.Module):

    in_features: int
    hidden_features: int = None
    out_features: int = None
    act_layer: Callable = nn.gelu
    drop: float = 0.0

    @nn.compact
    def __call__(self, x, training: bool):
        x = nn.Dense(self.hidden_features or self.in_features)(x)
        x = self.act_layer(x)
        x = nn.Dropout(self.drop, deterministic=not training)(x)
        x = nn.Dense(self.out_features or self.in_features)(x)
        x = nn.Dropout(self.drop, deterministic=not training)(x)
        return x


def window_partition(x, window_size: int):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.reshape((B, H // window_size, window_size, W // window_size, window_size, C))
    windows = x.transpose((0, 1, 3, 2, 4, 5)).reshape((-1, window_size, window_size, C))
    return windows


def window_reverse(windows, window_size: int, H: int, W: int):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.reshape((B, H // window_size, W // window_size, window_size, window_size, -1))
    x = x.transpose((0, 1, 3, 2, 4, 5)).reshape((B, H, W, -1))
    return x


class DropPath(nn.Module):
    """
    Implementation referred from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
    """

    dropout_prob: float = 0.1
    deterministic: Optional[bool] = None

    @nn.compact
    def __call__(self, input, training):
        if not training:
            return input
        keep_prob = 1 - self.dropout_prob
        shape = (input.shape[0],) + (1,) * (input.ndim - 1)
        rng = self.make_rng("dropout")
        random_tensor = keep_prob + jax.random.uniform(rng, shape)
        random_tensor = jnp.floor(random_tensor)
        return jnp.divide(input, keep_prob) * random_tensor


class WindowAttention(nn.Module):
    dim: int
    window_size: Iterable[int]
    num_heads: int
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    att_drop: float = 0.0
    proj_drop: float = 0.0

    def make_rel_pos_index(self):
        h_indices = np.arange(0, self.window_size[0])
        w_indices = np.arange(0, self.window_size[1])
        indices = np.stack(np.meshgrid(w_indices, h_indices, indexing="ij"))
        flatten_indices = np.reshape(indices, (2, -1))
        relative_indices = flatten_indices[:, :, None] - flatten_indices[:, None, :]
        relative_indices = np.transpose(relative_indices, (1, 2, 0))
        relative_indices[:, :, 0] += self.window_size[0] - 1
        relative_indices[:, :, 1] += self.window_size[1] - 1
        relative_indices[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_pos_index = np.sum(relative_indices, -1)
        return relative_pos_index

    @nn.compact
    def __call__(self, inputs, mask, training):
        rpbt = self.param(
            "relative_position_bias_table",
            trunc_normal(std=.02),
            (
                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1),
                self.num_heads,
            ),
        )

        # relative_pos_index = self.variable(
        #     "variables", "relative_position_index", self.get_rel_pos_index
        # )

        batch, n, channels = inputs.shape
        qkv = nn.Dense(self.dim * 3, use_bias=self.qkv_bias, name="qkv")(inputs)
        qkv = qkv.reshape(batch, n, 3, self.num_heads, channels // self.num_heads)
        qkv = jnp.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        scale = self.qk_scale or (self.dim // self.num_heads) ** -0.5
        q = q * scale
        att = q @ jnp.swapaxes(k, -2, -1)

        rel_pos_bias = jnp.reshape(
            rpbt[np.reshape(self.make_rel_pos_index(), (-1))],
            (
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1],
                -1,
            ),
        )
        rel_pos_bias = jnp.transpose(rel_pos_bias, (2, 0, 1))
        att += jnp.expand_dims(rel_pos_bias, 0)

        if mask is not None:
            att = jnp.reshape(
                att, (batch // mask.shape[0], mask.shape[0], self.num_heads, n, n)
            )
            att = att + jnp.expand_dims(jnp.expand_dims(mask, 1), 0)
            att = jnp.reshape(att, (-1, self.num_heads, n, n))
            att = jax.nn.softmax(att)
        else:
            att = jax.nn.softmax(att)

        att = nn.Dropout(self.att_drop)(att, deterministic=not training)

        x = jnp.reshape(jnp.swapaxes(att @ v, 1, 2), (batch, n, channels))
        x = nn.Dense(self.dim, name="proj")(x)
        x = nn.Dropout(self.proj_drop)(x, deterministic=not training)
        return x


class SwinTransformerBlock(nn.Module):

    dim: int
    input_resolution: tuple[int]
    num_heads: int
    window_size: int = 7
    shift_size: int = 0
    mlp_ratio: float = 4.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop: float = 0.
    attn_drop: float = 0.
    drop_path: float = 0.
    act_layer: Callable = nn.activation.gelu
    norm_layer: Callable = LayerNorm

    @staticmethod
    def make_att_mask(shift_size, window_size, height, width):
        if shift_size > 0:
            mask = jnp.zeros([1, height, width, 1])
            h_slices = (
                slice(0, -window_size),
                slice(-window_size, -shift_size),
                slice(-shift_size, None),
            )
            w_slices = (
                slice(0, -window_size),
                slice(-window_size, -shift_size),
                slice(-shift_size, None),
            )

            count = 0
            for h in h_slices:
                for w in w_slices:
                    mask = mask.at[:, h, w, :].set(count)
                    count += 1

            mask_windows = window_partition(mask, window_size)
            mask_windows = jnp.reshape(mask_windows, (-1, window_size * window_size))
            att_mask = jnp.expand_dims(mask_windows, 1) - jnp.expand_dims(mask_windows, 2)
            att_mask = jnp.where(att_mask != 0.0, float(-100.0), att_mask)
            att_mask = jnp.where(att_mask == 0.0, float(0.0), att_mask)
        else:
            att_mask = None

        return att_mask

    @nn.compact
    def __call__(self, x, x_size, training):
        H, W = x_size
        B, L, C = x.shape

        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"

        shortcut = x
        x = self.norm_layer()(x)
        x = x.reshape((B, H, W, C))

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = jnp.roll(x, (-self.shift_size, -self.shift_size), axis=(1, 2))
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.reshape((-1, self.window_size * self.window_size, C))  # nW*B, window_size*window_size, C

        # attn_mask = self.variable(
        #     "variables",
        #     "attn_mask",
        #     self.get_att_mask,
        #     self.shift_size,
        #     self.window_size,
        #     self.input_resolution[0],
        #     self.input_resolution[1]
        # )

        attn_mask = self.make_att_mask(self.shift_size, self.window_size, *self.input_resolution)

        attn = WindowAttention(self.dim, (self.window_size, self.window_size), self.num_heads,
                               self.qkv_bias, self.qk_scale, self.attn_drop, self.drop)
        if self.input_resolution == x_size:
            attn_windows = attn(x_windows, attn_mask, training)  # nW*B, window_size*window_size, C
        else:
            # test time
            assert not training
            test_mask = self.make_att_mask(self.shift_size, self.window_size, *x_size)
            attn_windows = attn(x_windows, test_mask, training=False)

        # merge windows
        attn_windows = attn_windows.reshape((-1, self.window_size, self.window_size, C))
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = jnp.roll(shifted_x, (self.shift_size, self.shift_size), axis=(1, 2))
        else:
            x = shifted_x

        x = x.reshape((B, H * W, C))

        # FFN
        x = shortcut + DropPath(self.drop_path)(x, training)

        norm = self.norm_layer()(x)
        mlp = Mlp(in_features=self.dim, hidden_features=int(self.dim * self.mlp_ratio),
                  act_layer=self.act_layer, drop=self.drop)(norm, training)
        x = x + DropPath(self.drop_path)(mlp, training)

        return x


class PatchMerging(nn.Module):
    inp_res: Iterable[int]
    dim: int
    norm_layer: Callable = LayerNorm

    @nn.compact
    def __call__(self, inputs):
        batch, n, channels = inputs.shape
        height, width = self.inp_res[0], self.inp_res[1]
        x = jnp.reshape(inputs, (batch, height, width, channels))

        x0 = x[:, 0::2, 0::2, :]
        x1 = x[:, 1::2, 0::2, :]
        x2 = x[:, 0::2, 1::2, :]
        x3 = x[:, 1::2, 1::2, :]

        x = jnp.concatenate([x0, x1, x2, x3], axis=-1)
        x = jnp.reshape(x, (batch, -1, 4 * channels))
        x = self.norm_layer()(x)
        x = nn.Dense(2 * self.dim, use_bias=False)(x)
        return x


class BasicLayer(nn.Module):

    dim: int
    input_resolution: int
    depth: int
    num_heads: int
    window_size: int
    mlp_ratio: float = 4.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop: float = 0.
    attn_drop: float = 0.
    drop_path: float = 0.
    norm_layer: Callable = LayerNorm
    downsample: Optional[Callable] = None

    @nn.compact
    def __call__(self, x, x_size, training):
        for i in range(self.depth):
            x = SwinTransformerBlock(
                self.dim,
                self.input_resolution,
                self.num_heads,
                self.window_size,
                0 if (i % 2 == 0) else self.window_size // 2,
                self.mlp_ratio,
                self.qkv_bias,
                self.qk_scale,
                self.drop,
                self.attn_drop,
                self.drop_path[i] if isinstance(self.drop_path, (list, tuple)) else self.drop_path,
                norm_layer=self.norm_layer
            )(x, x_size, training)

        if self.downsample is not None:
            x = self.downsample(self.input_resolution, dim=self.dim, norm_layer=self.norm_layer)(x)

        return x


class RSTB(nn.Module):

    dim: int
    input_resolution: int
    depth: int
    num_heads: int
    window_size: int
    mlp_ratio: float = 4.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop: float = 0.
    attn_drop: float = 0.
    drop_path: float = 0.
    norm_layer: Callable = LayerNorm
    downsample: Optional[Callable] = None
    img_size: int = 224
    patch_size: int = 4
    resi_connection: str = '1conv'

    @nn.compact
    def __call__(self, x, x_size, training):
        res = x
        x = BasicLayer(dim=self.dim,
                       input_resolution=self.input_resolution,
                       depth=self.depth,
                       num_heads=self.num_heads,
                       window_size=self.window_size,
                       mlp_ratio=self.mlp_ratio,
                       qkv_bias=self.qkv_bias, qk_scale=self.qk_scale,
                       drop=self.drop, attn_drop=self.attn_drop,
                       drop_path=self.drop_path,
                       norm_layer=self.norm_layer,
                       downsample=self.downsample)(x, x_size, training)

        x = PatchUnEmbed(embed_dim=self.dim)(x, x_size)

        # resi_connection == '1conv':
        x = nn.Conv(self.dim, (3, 3))(x)

        x = PatchEmbed()(x)

        return x + res


class PatchEmbed(nn.Module):
    norm_layer: Optional[Callable] = None

    @nn.compact
    def __call__(self, x):
        x = x.reshape((x.shape[0], -1, x.shape[-1]))  # B Ph Pw C -> B Ph*Pw C
        if self.norm_layer is not None:
            x = self.norm_layer()(x)
        return x


class PatchUnEmbed(nn.Module):
    embed_dim: int = 96

    @nn.compact
    def __call__(self, x, x_size):
        B, HW, C = x.shape
        x = x.reshape((B, x_size[0], x_size[1], self.embed_dim))
        return x


class SwinIR(nn.Module):
    r""" SwinIR JAX implementation
    Args:
        img_size (int | tuple(int)): Input image size. Default 64
        patch_size (int | tuple(int)): Patch size. Default: 1
        in_chans (int): Number of input image channels. Default: 3
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
        img_range: Image range. 1. or 255.
    """

    img_size: int = 48
    patch_size: int = 1
    in_chans: int = 3
    embed_dim: int = 180
    depths: tuple = (6, 6, 6, 6, 6, 6)
    num_heads: tuple = (6, 6, 6, 6, 6, 6)
    window_size: int = 8
    mlp_ratio: float = 2.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop_rate: float = 0.
    attn_drop_rate: float = 0.
    drop_path_rate: float = 0.1
    norm_layer: Callable = LayerNorm
    ape: bool = False
    patch_norm: bool = True
    upscale: int = 2
    img_range: float = 1.
    num_feat: int = 64

    def pad(self, x):
        _, h, w, _ = x.shape
        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
        x = jnp.pad(x, ((0, 0), (0, mod_pad_h), (0, mod_pad_w), (0, 0)), 'reflect')
        return x

    @nn.compact
    def __call__(self, x, training):
        _, h_before, w_before, _ = x.shape
        x = self.pad(x)
        _, h, w, _ = x.shape
        patches_resolution = [self.img_size // self.patch_size] * 2
        num_patches = patches_resolution[0] * patches_resolution[1]

        # conv_first
        x = nn.Conv(self.embed_dim, (3, 3))(x)
        res = x

        # feature extraction
        x_size = (h, w)
        x = PatchEmbed(self.norm_layer if self.patch_norm else None)(x)

        if self.ape:
            absolute_pos_embed = \
                self.param('ape', trunc_normal(std=.02), (1, num_patches, self.embed_dim))
            x = x + absolute_pos_embed

        x = nn.Dropout(self.drop_rate, deterministic=not training)(x)

        dpr = [x.item() for x in np.linspace(0, self.drop_path_rate, sum(self.depths))]
        for i_layer in range(len(self.depths)):
            x = RSTB(
                dim=self.embed_dim,
                input_resolution=(patches_resolution[0], patches_resolution[1]),
                depth=self.depths[i_layer],
                num_heads=self.num_heads[i_layer],
                window_size=self.window_size,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=self.qkv_bias, qk_scale=self.qk_scale,
                drop=self.drop_rate, attn_drop=self.attn_drop_rate,
                drop_path=dpr[sum(self.depths[:i_layer]):sum(self.depths[:i_layer + 1])],
                norm_layer=self.norm_layer,
                downsample=None,
                img_size=self.img_size,
                patch_size=self.patch_size)(x, x_size, training)

        x = self.norm_layer()(x)  # B L C
        x = PatchUnEmbed(self.embed_dim)(x, x_size)

        # conv_after_body
        x = nn.Conv(self.embed_dim, (3, 3))(x)
        x = x + res

        # conv_before_upsample
        x = nn.activation.leaky_relu(nn.Conv(self.num_feat, (3, 3))(x))

        # revert padding
        x = x[:, :-(h - h_before) or None, :-(w - w_before) or None]
        return x
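A short sketch of the padding arithmetic used in SwinIR.pad and the crop at the end of __call__: H and W are reflect-padded up to multiples of window_size, and `:-(h - h_before) or None` handles the zero-padding case, since slicing with None keeps everything:

import jax.numpy as jnp

window_size = 8
h = 45
mod_pad_h = (window_size - h % window_size) % window_size   # 3
x = jnp.zeros((1, h, 45, 3))
x = jnp.pad(x, ((0, 0), (0, mod_pad_h), (0, 3), (0, 0)), 'reflect')
print(x.shape)                     # (1, 48, 48, 3)
print(x[:, :-3 or None].shape)     # crop back: (1, 45, 48, 3)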
model/tail.py
ADDED
@@ -0,0 +1,18 @@
import flax.linen as nn

from .convnext import ConvNeXt
from .swin_ir import SwinIR


def build_tail(size: str):
    """ Convenience function to build the three tails described in the paper. """
    if size == 'air':
        return lambda x, _: x
    elif size == 'plus':
        blocks = [(64, 3, True)] * 6 + [(96, 3, True)] * 7 + [(128, 3, True)] * 3
        return ConvNeXt(blocks)
    elif size == 'pro':
        return SwinIR(depths=[7, 6], num_heads=[6, 6])
    else:
        raise NotImplementedError('size: ' + size)
model/thera.py
ADDED
@@ -0,0 +1,175 @@
import math

import jax
from flax.core import unfreeze, freeze
import jax.numpy as jnp
import flax.linen as nn
from jaxtyping import Array, ArrayLike, PyTree

from .edsr import EDSR
from .rdn import RDN
from .hyper import Hypernetwork
from .tail import build_tail
from .init import uniform_between, linear_up
from utils import make_grid, interpolate_grid, repeat_vmap


class Thermal(nn.Module):
    w0_scale: float = 1.

    @nn.compact
    def __call__(self, x: ArrayLike, t, norm, k) -> Array:
        phase = self.param('phase', nn.initializers.uniform(.5), x.shape[-1:])
        return jnp.sin(self.w0_scale * x + phase) * jnp.exp(-(self.w0_scale * norm)**2 * k * t)


class TheraField(nn.Module):
    dim_hidden: int
    dim_out: int
    w0: float = 1.
    c: float = 6.

    @nn.compact
    def __call__(self, x: ArrayLike, t: ArrayLike, k: ArrayLike, components: ArrayLike) -> Array:
        # coordinate projection according to shared components ("first layer")
        x = x @ components

        # thermal activations
        norm = jnp.linalg.norm(components, axis=-2)
        x = Thermal(self.w0)(x, t, norm, k)

        # linear projection from hidden to output space ("second layer")
        w_std = math.sqrt(self.c / self.dim_hidden) / self.w0
        dense_init_fn = uniform_between(-w_std, w_std)
        x = nn.Dense(self.dim_out, kernel_init=dense_init_fn, use_bias=False)(x)

        return x


class Thera:

    def __init__(
        self,
        hidden_dim: int,
        out_dim: int,
        backbone: nn.Module,
        tail: nn.Module,
        k_init: float = None,
        components_init_scale: float = None
    ):
        self.hidden_dim = hidden_dim
        self.k_init = k_init
        self.components_init_scale = components_init_scale

        # single TheraField object whose `apply` method is used for all grid cells
        self.field = TheraField(hidden_dim, out_dim)

        # infer output size of the hypernetwork from a sample pass through the field;
        # the key doesn't matter as field params are only used for size inference
        sample_params = self.field.init(jax.random.PRNGKey(0),
                                        jnp.zeros((2,)), 0., 0., jnp.zeros((2, hidden_dim)))
        sample_params_flat, tree_def = jax.tree_util.tree_flatten(sample_params)
        param_shapes = [p.shape for p in sample_params_flat]

        self.hypernet = Hypernetwork(backbone, tail, param_shapes, tree_def)

    def init(self, key, sample_source) -> PyTree:
        keys = jax.random.split(key, 2)
        sample_coords = jnp.zeros(sample_source.shape[:-1] + (2,))
        params = unfreeze(self.hypernet.init(keys[0], sample_source, sample_coords))

        params['params']['k'] = jnp.array(self.k_init)
        params['params']['components'] = \
            linear_up(self.components_init_scale)(keys[1], (2, self.hidden_dim))

        return freeze(params)

    def apply_encoder(self, params: PyTree, source: ArrayLike, **kwargs) -> Array:
        """
        Performs a forward pass through the hypernetwork to obtain an encoding.
        """
        return self.hypernet.apply(
            params, source, method=self.hypernet.get_encoding, **kwargs)

    def apply_decoder(
        self,
        params: PyTree,
        encoding: ArrayLike,
        coords: ArrayLike,
        t: ArrayLike,
        return_jac: bool = False
    ) -> Array | tuple[Array, Array]:
        """
        Performs a forward prediction through a grid of HxW Thera fields,
        informed by `encoding`, at spatial and temporal coordinates
        `coords` and `t`, respectively.
        args:
            params: Field parameters, shape (B, H, W, N)
            encoding: Encoding tensor, shape (B, H, W, C)
            coords: Spatial coordinates in [-0.5, 0.5], shape (B, H, W, 2)
            t: Temporal coordinates, shape (B, 1)
        """
        phi_params: PyTree = self.hypernet.apply(
            params, encoding, coords, method=self.hypernet.get_params_at_coords)

        # create local coordinate systems
        source_grid = jnp.asarray(make_grid(encoding.shape[-3:-1]))
        source_coords = jnp.tile(source_grid, (encoding.shape[0], 1, 1, 1))
        interp_coords = interpolate_grid(coords, source_coords)
        rel_coords = (coords - interp_coords)
        rel_coords = rel_coords.at[..., 0].set(rel_coords[..., 0] * encoding.shape[-3])
        rel_coords = rel_coords.at[..., 1].set(rel_coords[..., 1] * encoding.shape[-2])

        # three maps over params, coords; one over t; don't map k and components
        in_axes = [(0, 0, None, None, None), (0, 0, None, None, None), (0, 0, 0, None, None)]
        apply_field = repeat_vmap(self.field.apply, in_axes)
        out = apply_field(phi_params, rel_coords, t, params['params']['k'],
                          params['params']['components'])

        if return_jac:
            apply_jac = repeat_vmap(jax.jacrev(self.field.apply, argnums=1), in_axes)
            jac = apply_jac(phi_params, rel_coords, jnp.zeros_like(t), params['params']['k'],
                            params['params']['components'])
            return out, jac

        return out

    def apply(
        self,
        params: ArrayLike,
        source: ArrayLike,
        coords: ArrayLike,
        t: ArrayLike,
        return_jac: bool = False,
        **kwargs
    ) -> Array:
        """
        Performs a forward pass through the Thera model.
        """
        encoding = self.apply_encoder(params, source, **kwargs)
        out = self.apply_decoder(params, encoding, coords, t, return_jac=return_jac)
        return out


def build_thera(
    out_dim: int,
    backbone: str,
    size: str,
    k_init: float = None,
    components_init_scale: float = None
):
    """
    Convenience function for building the three Thera sizes described in the paper.
    """
    hidden_dim = 32 if size == 'air' else 512

    if backbone == 'edsr-baseline':
        backbone_module = EDSR(None, num_blocks=16, num_feats=64)
    elif backbone == 'rdn':
        backbone_module = RDN()
    else:
        raise NotImplementedError(backbone)

    tail_module = build_tail(size)

    return Thera(hidden_dim, out_dim, backbone_module, tail_module, k_init, components_init_scale)
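A worked example of the time variable that Thermal consumes: super_resolve.py sets t = (target_h / source_h)**-2, so a 4x upscaling samples the field at t = 1/16, and each frequency component is damped by exp(-(w0*|nu|)^2 * k * t). Magnitudes below are illustrative:

import jax.numpy as jnp

source_h, target_h = 48, 192
t = jnp.float32((target_h / source_h) ** -2)     # 0.0625
w0_norm, k = 2.0, jnp.float32(0.1)               # illustrative |nu| scale and diffusivity
damping = jnp.exp(-(w0_norm ** 2) * k * t)
print(float(t), float(damping))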
requirements.txt
CHANGED
@@ -1,6 +1,37 @@
+-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+ConfigArgParse==1.7
+Pillow==10.0.0
+chex==0.1.7
 diffusers
+einops==0.6.1
+flax==0.6.10
+flaxmodels==0.1.3
+jax==0.4.11
+jaxlib==0.4.11+cuda11.cudnn86
+jaxtyping==0.2.20
+ml-dtypes==0.1.0
+numpy==1.24.1
+nvidia-cublas-cu11==11.11.3.6
+nvidia-cuda-cupti-cu11==11.8.87
+nvidia-cuda-nvcc-cu11==11.8.89
+nvidia-cuda-runtime-cu11==11.8.89
+nvidia-cudnn-cu11==8.9.2.26
+nvidia-cufft-cu11==10.9.0.58
+nvidia-cusolver-cu11==11.4.1.48
+nvidia-cusparse-cu11==11.7.5.86
+opt-einsum==3.3.0
+optax==0.2.0
+orbax-checkpoint==0.2.4
+peft
+scipy==1.10.1
+timm==0.9.6
+torch
+torchvision
+tqdm==4.65.0
+transformers==4.46.3
+wandb
+
+gradio==4.44.1
+gradio_imageslider==0.0.20
+spaces
super_resolve.py
ADDED
@@ -0,0 +1,99 @@
#!/usr/bin/env python

from argparse import ArgumentParser, Namespace
import pickle

import jax
from jax import jit
import jax.numpy as jnp
import numpy as np
from PIL import Image

from model import build_thera
from utils import make_grid, interpolate_grid

MEAN = jnp.array([.4488, .4371, .4040])
VAR = jnp.array([.25, .25, .25])
PATCH_SIZE = 256


def process_single(source, apply_encoder, apply_decoder, params, target_shape):
    t = jnp.float32((target_shape[0] / source.shape[1])**-2)[None]
    coords_nearest = jnp.asarray(make_grid(target_shape)[None])
    source_up = interpolate_grid(coords_nearest, source[None])
    source = jax.nn.standardize(source, mean=MEAN, variance=VAR)[None]

    encoding = apply_encoder(params, source)
    coords = jnp.asarray(make_grid(source_up.shape[1:3])[None])  # global sampling coords
    out = jnp.full_like(source_up, jnp.nan, dtype=jnp.float32)

    for h_min in range(0, coords.shape[1], PATCH_SIZE):
        h_max = min(h_min + PATCH_SIZE, coords.shape[1])
        for w_min in range(0, coords.shape[2], PATCH_SIZE):
            # apply decoder with one patch of coordinates
            w_max = min(w_min + PATCH_SIZE, coords.shape[2])
            coords_patch = coords[:, h_min:h_max, w_min:w_max]
            out_patch = apply_decoder(params, encoding, coords_patch, t)
            out = out.at[:, h_min:h_max, w_min:w_max].set(out_patch)

    out = out * jnp.sqrt(VAR)[None, None, None] + MEAN[None, None, None]
    out += source_up
    return out


def process(source, model, params, target_shape, do_ensemble=True):
    apply_encoder = jit(model.apply_encoder)
    apply_decoder = jit(model.apply_decoder)

    outs = []
    for i_rot in range(4 if do_ensemble else 1):
        source_ = jnp.rot90(source, k=i_rot, axes=(-3, -2))
        target_shape_ = tuple(reversed(target_shape)) if i_rot % 2 else target_shape
        out = process_single(source_, apply_encoder, apply_decoder, params, target_shape_)
        outs.append(jnp.rot90(out, k=i_rot, axes=(-2, -3)))

    out = jnp.stack(outs).mean(0).clip(0., 1.)
    return jnp.rint(out[0] * 255).astype(jnp.uint8)


def main(args: Namespace):
    source = np.asarray(Image.open(args.in_file)) / 255.

    if args.scale is not None:
        if args.size is not None:
            raise ValueError('Cannot specify both size and scale')
        target_shape = (
            round(source.shape[0] * args.scale),
            round(source.shape[1] * args.scale),
        )
    elif args.size is not None:
        target_shape = args.size
    else:
        raise ValueError('Must specify either size or scale')

    with open(args.checkpoint, 'rb') as fh:
        check = pickle.load(fh)
        params, backbone, size = check['model'], check['backbone'], check['size']

    model = build_thera(3, backbone, size)

    out = process(source, model, params, target_shape, not args.no_ensemble)

    Image.fromarray(np.asarray(out)).save(args.out_file)


def parse_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument('in_file')
    parser.add_argument('out_file')
    parser.add_argument('--scale', type=float, help='Scale factor for super-resolution')
    parser.add_argument('--size', type=int, nargs=2,
                        help='Target size (h, w), mutually exclusive with --scale')
    parser.add_argument('--checkpoint', help='Path to checkpoint file')
    parser.add_argument('--no-ensemble', action='store_true', help='Disable geo-ensemble')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    main(args)
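Example invocations of the script, matching the argparse definition above (file names are placeholders):

python super_resolve.py in.png out.png --scale 3.14 --checkpoint thera-edsr.pkl
python super_resolve.py in.png out.png --size 1080 1920 --checkpoint thera-edsr.pkl --no-ensemble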
utils.py
ADDED
@@ -0,0 +1,36 @@
from functools import partial

import jax
import numpy as np


def repeat_vmap(fun, in_axes=[0]):
    for axes in in_axes:
        fun = jax.vmap(fun, in_axes=axes)
    return fun


def make_grid(patch_size: int | tuple[int, int]):
    if isinstance(patch_size, int):
        patch_size = (patch_size, patch_size)
    offset_h, offset_w = 1 / (2 * np.array(patch_size))
    space_h = np.linspace(-0.5 + offset_h, 0.5 - offset_h, patch_size[0])
    space_w = np.linspace(-0.5 + offset_w, 0.5 - offset_w, patch_size[1])
    return np.stack(np.meshgrid(space_h, space_w, indexing='ij'), axis=-1)  # [h, w]


def interpolate_grid(coords, grid, order=0):
    """
    args:
        coords: Tensor of shape (B, H, W, 2) with coordinates in [-0.5, 0.5]
        grid: Tensor of shape (B, H', W', C)
    returns:
        Tensor of shape (B, H, W, C) with interpolated values
    """
    # convert [-0.5, 0.5] -> [0, size], where pixel centers are expected at
    # [-0.5 + 1 / (2*size), ..., 0.5 - 1 / (2*size)]
    coords = coords.transpose((0, 3, 1, 2))
    coords = coords.at[:, 0].set(coords[:, 0] * grid.shape[-3] + (grid.shape[-3] - 1) / 2)
    coords = coords.at[:, 1].set(coords[:, 1] * grid.shape[-2] + (grid.shape[-2] - 1) / 2)
    map_coordinates = partial(jax.scipy.ndimage.map_coordinates, order=order, mode='nearest')
    return jax.vmap(jax.vmap(map_coordinates, in_axes=(2, None), out_axes=2))(grid, coords)