johannesschmude committed on
Commit b73936d · 1 Parent(s): 3808ef8

Initial commit

app.py ADDED
@@ -0,0 +1,325 @@
1
+ import socket
2
+ import yaml
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.utils.data import DataLoader
12
+
13
+ import matplotlib.pyplot as plt
14
+ import sunpy.visualization.colormaps as sunpy_cm
15
+
16
+ import gradio as gr
17
+ from huggingface_hub import snapshot_download
18
+
19
+ from surya.datasets.helio import HelioNetCDFDataset, inverse_transform_single_channel
20
+ from surya.models.helio_spectformer import HelioSpectFormer
21
+ from surya.utils.data import build_scalers, custom_collate_fn
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ SDO_CHANNELS = [
26
+ "aia94",
27
+ "aia131",
28
+ "aia171",
29
+ "aia193",
30
+ "aia211",
31
+ "aia304",
32
+ "aia335",
33
+ "aia1600",
34
+ "hmi_m",
35
+ "hmi_bx",
36
+ "hmi_by",
37
+ "hmi_bz",
38
+ "hmi_v",
39
+ ]
40
+
41
+ @dataclass
42
+ class SDOImage:
43
+ channel: str
44
+ data: np.ndarray
45
+ timestamp: str
46
+ type: str
47
+
48
+ def download_data():
49
+ snapshot_download(
50
+ repo_id="nasa-ibm-ai4science/Surya-1.0",
51
+ local_dir="data/Surya-1.0",
52
+ allow_patterns=["config.yaml", "scalers.yaml", "surya.366m.v1.pt"],
53
+ token=None,
54
+ )
55
+ snapshot_download(
56
+ repo_id="nasa-ibm-ai4science/Surya-1.0_validation_data",
57
+ repo_type="dataset",
58
+ local_dir="data/Surya-1.0_validation_data",
59
+ allow_patterns="20140107_1[5-9]??.nc",
60
+ token=None,
61
+ )
62
+
63
+ def get_dataset(config, scalers) -> HelioNetCDFDataset:
64
+ dataset = HelioNetCDFDataset(
65
+ index_path="tests/test_surya_index.csv",
66
+ time_delta_input_minutes=config["data"]["time_delta_input_minutes"],
67
+ time_delta_target_minutes=config["data"]["time_delta_target_minutes"],
68
+ n_input_timestamps=len(config["data"]["time_delta_input_minutes"]),
69
+ rollout_steps=0,
70
+ channels=config["data"]["sdo_channels"],
71
+ drop_hmi_probability=config["data"]["drop_hmi_probability"],
72
+ num_mask_aia_channels=config["data"]["num_mask_aia_channels"],
73
+ use_latitude_in_learned_flow=config["data"]["use_latitude_in_learned_flow"],
74
+ scalers=scalers,
75
+ phase="valid",
76
+ pooling=config["data"]["pooling"],
77
+ random_vert_flip=config["data"]["random_vert_flip"],
78
+ )
79
+ logger.info(f"Initialized the dataset. {len(dataset)} samples.")
80
+
81
+ return dataset
82
+
83
+ def get_scalers() -> dict:
84
+ scalers_info = yaml.safe_load(open("data/Surya-1.0/scalers.yaml", "r"))
85
+ scalers = build_scalers(info=scalers_info)
86
+ logger.info("Built the scalers.")
87
+ return scalers
88
+
89
+ def get_model_from_config(config) -> HelioSpectFormer:
90
+ model = HelioSpectFormer(
91
+ img_size=config["model"]["img_size"],
92
+ patch_size=config["model"]["patch_size"],
93
+ in_chans=len(config["data"]["sdo_channels"]),
94
+ embed_dim=config["model"]["embed_dim"],
95
+ time_embedding={
96
+ "type": "linear",
97
+ "time_dim": len(config["data"]["time_delta_input_minutes"]),
98
+ },
99
+ depth=config["model"]["depth"],
100
+ n_spectral_blocks=config["model"]["n_spectral_blocks"],
101
+ num_heads=config["model"]["num_heads"],
102
+ mlp_ratio=config["model"]["mlp_ratio"],
103
+ drop_rate=config["model"]["drop_rate"],
104
+ dtype=torch.bfloat16,
105
+ window_size=config["model"]["window_size"],
106
+ dp_rank=config["model"]["dp_rank"],
107
+ learned_flow=config["model"]["learned_flow"],
108
+ use_latitude_in_learned_flow=config["model"]["learned_flow"],
109
+ init_weights=False,
110
+ checkpoint_layers=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
111
+ rpe=config["model"]["rpe"],
112
+ ensemble=config["model"]["ensemble"],
113
+ finetune=config["model"]["finetune"],
114
+ )
115
+ logger.info("Initialized the model.")
116
+
117
+ return model
118
+
119
+ def get_config() -> dict:
120
+ with open("data/Surya-1.0/config.yaml") as fp:
121
+ config = yaml.safe_load(fp)
122
+
123
+ return config
124
+
125
+ def setup():
126
+ logger.info("Loading data ...")
127
+ download_data()
128
+ config = get_config()
129
+ scalers = get_scalers()
130
+
131
+ logger.info("Initializing dataset ...")
132
+ dataset = get_dataset(config, scalers)
133
+
134
+ logger.info("Initializing model ...")
135
+ model = get_model_from_config(config)
136
+ if torch.cuda.is_available():
137
+ device = torch.cuda.current_device()
138
+ logger.info(f"GPU detected. Running the test on device {device}.")
139
+ else:
140
+ device = "cpu"
141
+ logger.warning(f"No GPU detected. Running the test on CPU.")
142
+ model.to(device)
143
+ n_parameters = sum(p.numel() for p in model.parameters()) / 1e6
144
+ logger.info(f"Surya FM: {n_parameters:.2f} M total parameters.")
145
+ path_weights = "data/Surya-1.0/surya.366m.v1.pt"
146
+ weights = torch.load(
147
+ path_weights, map_location=torch.device(device), weights_only=True
148
+ )
149
+ model.load_state_dict(weights, strict=True)
150
+ logger.info("Loaded weights.")
151
+
152
+ return dataset, model, device
153
+
154
+ def batch_step(
155
+ model: HelioSpectFormer,
156
+ sample_data: dict,
157
+ sample_metadata: dict,
158
+ device: int | str,
159
+ hours_ahead: int = 1,
160
+ ) -> np.ndarray:
161
+ """
162
+ Perform a single batch step for the given model, batch data, metadata, and device.
163
+
164
+ Args:
165
+ model: The PyTorch model to use for prediction.
166
+ sample_data: A dictionary containing input and target data for the batch.
167
+ sample_metadata: A dictionary containing metadata for the batch, including timestamps.
168
+ device: The device to use for computation ('cpu', 'cuda' or device number).
169
+ hours_ahead: The number of steps to forecast ahead. Defaults to 1.
170
+
171
+ Returns:
172
+ np.ndarray: Output data.
173
+ """
174
+
175
+ data_returned = []
176
+ forecast_hat = None # Initialize forecast_hat
177
+
178
+ for step in range(1, hours_ahead + 1):
179
+ if step == 1:
180
+ curr_batch = {
181
+ key: torch.from_numpy(sample_data[key]).unsqueeze(0).to(device)
182
+ for key in ["ts", "time_delta_input"]
183
+ }
184
+ else:
185
+ # Use the previous forecast_hat from the previous iteration
186
+ if forecast_hat is not None:
187
+ curr_batch["ts"] = torch.cat(
188
+ (curr_batch["ts"][:, :, 1:, ...], forecast_hat[:, :, None, ...]),
189
+ dim=2,
190
+ )
191
+
192
+ forecast_hat = model(curr_batch)
193
+
194
+ data_returned = forecast_hat.to(dtype=torch.float32).cpu().squeeze(0).numpy()
195
+
196
+ return data_returned
197
+
198
+
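+ # Illustrative sketch (dummy shapes, not the Surya configuration) of the rollout
+ # bookkeeping in `batch_step` above: after the first step, the oldest input frame
+ # is dropped and the latest forecast is appended, so the time dimension stays fixed.
+ def _demo_rollout_window() -> None:
+     ts = torch.zeros(1, 13, 2, 8, 8)           # (B, C, T, H, W)
+     forecast_hat = torch.ones(1, 13, 8, 8)     # (B, C, H, W) as returned by the model
+     ts = torch.cat((ts[:, :, 1:, ...], forecast_hat[:, :, None, ...]), dim=2)
+     assert ts.shape == (1, 13, 2, 8, 8)        # window length T is unchanged
+
+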
199
+ def run_inference(init_time_idx, plt_channel_idx, hours_ahead):
200
+ plt_channel_str = SDO_CHANNELS[plt_channel_idx]
201
+
202
+ input_timestamp_1 = dataset.valid_indices[init_time_idx]
203
+ input_timestamp_0 = input_timestamp_1 - pd.Timedelta(1, "h")
204
+ output_timestamp = input_timestamp_1 + pd.Timedelta(int(hours_ahead), "h")
205
+
206
+ input_timestamp_0 = input_timestamp_0.strftime("%Y-%m-%d %H:%M")
207
+ input_timestamp_1 = input_timestamp_1.strftime("%Y-%m-%d %H:%M")
208
+ output_timestamp = output_timestamp.strftime("%Y-%m-%d %H:%M")
209
+
210
+ sample_data, sample_metadata = dataset[init_time_idx]
211
+ with torch.no_grad():
212
+ model_output = batch_step(
213
+ model,
214
+ sample_data,
215
+ sample_metadata,
216
+ device,
217
+ hours_ahead
218
+ )
219
+
220
+ means, stds, epsilons, sl_scale_factors = dataset.transformation_inputs()
221
+
222
+ vmin = float("-inf")
223
+ vmax = float("inf")
224
+ input_image = []
225
+ for i in range(2):
226
+ input_image.append(
227
+ inverse_transform_single_channel(
228
+ sample_data["ts"][plt_channel_idx, i],
229
+ mean=means[plt_channel_idx],
230
+ std=stds[plt_channel_idx],
231
+ epsilon=epsilons[plt_channel_idx],
232
+ sl_scale_factor=sl_scale_factors[plt_channel_idx],
233
+ )
234
+ )
235
+ vmin = max(vmin, sample_data["ts"][plt_channel_idx, i].min())
236
+ #vmax = min(vmax, np.quantile(sample_data["ts"][plt_channel_idx, i], 0.99))
237
+ vmax = min(vmax, sample_data["ts"][plt_channel_idx, i].max())
238
+
239
+ if plt_channel_str.startswith("aia"):
240
+ cm_name = "sdo" + plt_channel_str
241
+ else:
242
+ cm_name = "hmimag"
243
+
244
+ input_image = [
245
+ sunpy_cm.cmlist[cm_name](
246
+ (img[::-1]-vmin) / (vmax-vmin), bytes=True
247
+ )
248
+ for img in input_image
249
+ ]
250
+
251
+ output_image = inverse_transform_single_channel(
252
+ model_output[plt_channel_idx],
253
+ mean=means[plt_channel_idx],
254
+ std=stds[plt_channel_idx],
255
+ epsilon=epsilons[plt_channel_idx],
256
+ sl_scale_factor=sl_scale_factors[plt_channel_idx],
257
+ )
258
+ output_image = sunpy_cm.cmlist[cm_name](
259
+ (output_image[::-1]-vmin) / (vmax-vmin), bytes=True
260
+ )
261
+
262
+ return input_timestamp_0, input_image[0], input_timestamp_1, input_image[1], output_timestamp, output_image
263
+
264
+ logging.basicConfig(level=logging.INFO)
265
+ dataset, model, device = setup()
266
+ hostname = socket.getfqdn()
267
+ logging.info(f"Launching app on {hostname}")
268
+
269
+ with gr.Blocks() as demo:
270
+ gr.Markdown(value="# Surya 1.0 - Visual forecasting demo")
271
+ #with gr.Row():
272
+ #with gr.Column():
273
+ with gr.Row():
274
+ with gr.Column():
275
+ init_time = gr.Dropdown(
276
+ [v.strftime("%Y-%m-%d %H:%M") for v in dataset.valid_indices],
277
+ label="Initialization time",
278
+ multiselect=False,
279
+ type="index",
280
+ )
281
+ with gr.Column():
282
+ plt_channel = gr.Dropdown(
283
+ [c.upper() for c in SDO_CHANNELS],
284
+ label="SDO Band",
285
+ value="AIA94",
286
+ multiselect=False,
287
+ type="index"
288
+ )
289
+ with gr.Row():
290
+ hours_ahead = gr.Slider(minimum=1.0, maximum=6.0, step=1.0, label="Forecast step [hours ahead]")
291
+ with gr.Row():
292
+ btn = gr.Button("Run")
293
+
294
+ with gr.Row():
295
+ with gr.Column():
296
+ input_timestamp_0 = gr.Textbox(label="Input 0")
297
+ input_image_0 = gr.Image()
298
+ with gr.Column():
299
+ input_timestamp_1 = gr.Textbox(label="Input 1")
300
+ input_image_1 = gr.Image()
301
+ with gr.Column():
302
+ output_timestamp = gr.Textbox(label="Prediction")
303
+ output_image = gr.Image()
304
+
305
+ btn.click(
306
+ fn=run_inference,
307
+ inputs=[init_time, plt_channel, hours_ahead],
308
+ outputs=[input_timestamp_0, input_image_0, input_timestamp_1, input_image_1, output_timestamp, output_image]
309
+ )
310
+
311
+ with gr.Row():
312
+ gr.Examples(
313
+ examples=[
314
+ ["2014-01-07 17:24", "AIA94", 2],
315
+ ["2014-01-07 16:12", "AIA94", 6],
316
+ ["2014-01-07 16:00", "AIA131", 1],
317
+ ["2014-01-07 16:00", "HMI_M", 2],
318
+ ],
319
+ fn=run_inference,
320
+ inputs=[init_time, plt_channel, hours_ahead],
321
+ outputs=[input_timestamp_0, input_image_0, input_timestamp_1, input_image_1, output_timestamp, output_image],
322
+ cache_examples=False,
323
+ )
324
+
325
+ demo.launch(server_name=hostname, server_port=None)
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ einops==0.8.1
+ gradio==5.43.1
+ hdf5plugin==5.1.0
+ huggingface_hub==0.34.4
+ matplotlib==3.10.5
+ numba==0.61.2
+ numpy==2.3.2
+ packaging==25.0
+ pandas==2.3.1
+ PyYAML==6.0.2
+ scikit-image
+ sunpy==6.1.1
+ timm==1.0.19
+ torch==2.6.0
+ wandb==0.21.1
+ xarray==2025.3.1
surya/__init__.py ADDED
File without changes
surya/datasets/__init__.py ADDED
File without changes
surya/datasets/helio.py ADDED
@@ -0,0 +1,524 @@
1
+ import os
2
+ import sys
3
+ import random
4
+ from datetime import datetime
5
+ import torch
6
+ import numpy as np
7
+ import skimage.measure
8
+ import xarray as xr
9
+ import pandas as pd
10
+ from logging import Logger
11
+ from torch.utils.data import Dataset
12
+ from surya.utils.distributed import get_rank
13
+ from surya.utils.log import create_logger
14
+ from functools import cache
15
+
16
+ from numba import njit, prange
17
+
18
+ import hdf5plugin
19
+
20
+
21
+ @njit(parallel=True)
22
+ def fast_transform(data, means, stds, sl_scale_factors, epsilons):
23
+ """
24
+ Implements signum log transform using numba for speed
25
+ Notes:
26
+ - This must reside outside the class definition from which it is called.
27
+ - We used this function during pretraining for faster data loading. On select
28
+ GPU clusters, however, it can hang the system when data loading happens
29
+ outside the GPU thread. See below for a non-numba-enhanced version.
30
+
31
+ Args:
32
+ data: Numpy array of shape C, H, W
33
+ means: Numpy array of shape C. Mean per channel.
34
+ stds: Numpy array of shape C. Standard deviation per channel.
35
+ sl_scale_factors: Numpy array of shape C. Signum-log scale factors.
36
+ epsilons: Numpy array of shape C. Constant to avoid zero division.
37
+
38
+ Returns:
39
+ Numpy array of shape C, H, W.
40
+ """
41
+ C, H, W = data.shape
42
+ out = np.empty((C, H, W), dtype=np.float32)
43
+ for c in prange(C):
44
+ mean = means[c]
45
+ std = stds[c]
46
+ eps = epsilons[c]
47
+ sl_scale_factor = sl_scale_factors[c]
48
+ for i in range(H):
49
+ for j in range(W):
50
+ val = data[c, i, j]
51
+ val = val * sl_scale_factor
52
+ if val >= 0:
53
+ val = np.log1p(val)
54
+ else:
55
+ val = -np.log1p(-val)
56
+ out[c, i, j] = (val - mean) / (std + eps)
57
+ return out
58
+
59
+ def transform(
60
+ data: np.ndarray,
61
+ means: np.ndarray,
62
+ stds: np.ndarray,
63
+ sl_scale_factors: np.ndarray,
64
+ epsilons: np.ndarray
65
+ ) -> np.ndarray:
66
+ """
67
+ Implements signum log transform. Drop-in replacement for
68
+ `fast_transform` method above.
69
+
70
+ Args:
71
+ data: Numpy array of shape C, H, W
72
+ means: Numpy array of shape C. Mean per channel.
73
+ stds: Numpy array of shape C. Standard deviation per channel.
74
+ sl_scale_factors: Numpy array of shape C. Signum-log scale factors.
75
+ epsilons: Numpy array of shape C. Constant to avoid zero division.
76
+
77
+ Returns:
78
+ Numpy array of shape C, H, W.
79
+ """
80
+ means = means.reshape(*means.shape, 1, 1)
81
+ stds = stds.reshape(*stds.shape, 1, 1)
82
+ sl_scale_factors = sl_scale_factors.reshape(*sl_scale_factors.shape, 1, 1)
83
+ epsilons = epsilons.reshape(*epsilons.shape, 1, 1)
84
+
85
+ data = data * sl_scale_factors
86
+ data = np.sign(data) * np.log1p(np.abs(data))
87
+ data = (data - means) / (stds + epsilons)
88
+
89
+ return data
90
+
91
+ @njit(parallel=True)
92
+ def inverse_fast_transform(data, means, stds, sl_scale_factors, epsilons):
93
+ """
94
+ Implements inverse signum log transform using numba for speed
95
+
96
+ Args:
97
+ data: Numpy array of shape C, H, W
98
+ means: Numpy array of shape C. Mean per channel.
99
+ stds: Numpy array of shape C. Standard deviation per channel.
100
+ sl_scale_factors: Numpy array of shape C. Signum-log scale factors.
101
+ epsilons: Numpy array of shape C. Constant to avoid zero division.
102
+
103
+ Returns:
104
+ Numpy array of shape C, H, W.
105
+ """
106
+ C, H, W = data.shape
107
+ out = np.empty((C, H, W), dtype=np.float32)
108
+
109
+ for c in prange(C):
110
+ mean = means[c]
111
+ std = stds[c]
112
+ eps = epsilons[c]
113
+ sl_scale_factor = sl_scale_factors[c]
114
+
115
+ for i in range(H):
116
+ for j in range(W):
117
+ val = data[c, i, j]
118
+ val = val * (std + eps) + mean
119
+
120
+ if val >= 0:
121
+ val = np.expm1(val)
122
+ else:
123
+ val = -np.expm1(-val)
124
+
125
+ val = val / sl_scale_factor
126
+
127
+ out[c, i, j] = val
128
+
129
+ return out
130
+
131
+
132
+ def inverse_transform_single_channel(data, mean, std, sl_scale_factor, epsilon):
133
+ """
134
+ Implements inverse signum log transform.
135
+
136
+ Args:
137
+ data: Numpy array of shape H, W.
138
+ mean: Float. Mean of the channel.
139
+ std: Float. Standard deviation of the channel.
140
+ sl_scale_factor: Float. Signum-log scale factor of the channel.
141
+ epsilon: Float. Constant to avoid zero division.
142
+
143
+ Returns:
144
+ Numpy array of shape H, W.
145
+ """
146
+ data = data * (std + epsilon) + mean
147
+
148
+ data = np.sign(data) * np.expm1(np.abs(data))
149
+
150
+ data = data / sl_scale_factor
151
+
152
+ return data
153
+
154
+
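+ def _demo_signum_log_roundtrip() -> None:
+     """
+     Minimal sketch with made-up values (not taken from scalers.yaml): `transform`
+     applies sign(x) * log1p(|x * scale|) followed by standardisation, and
+     `inverse_transform_single_channel` undoes both steps for one channel.
+     """
+     data = np.array([[[-10.0, 0.0, 250.0]]])                  # (C=1, H=1, W=3)
+     mean, std = np.zeros(1), np.ones(1)
+     eps, scale = np.full(1, 1e-8), np.ones(1)
+     forward = transform(data, mean, std, scale, eps)
+     back = inverse_transform_single_channel(
+         forward[0], mean=mean[0], std=std[0],
+         sl_scale_factor=scale[0], epsilon=eps[0],
+     )
+     assert np.allclose(back, data[0])
+
+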
155
+ class RandomChannelMaskerTransform:
156
+ def __init__(
157
+ self, num_channels, num_mask_aia_channels, phase, drop_hmi_probability
158
+ ):
159
+ """
160
+ Initialize the RandomChannelMaskerTransform class as a transform.
161
+
162
+ Args:
163
+ - num_channels: Total number of channels in the input (3rd dimension of
164
+ the tensor).
165
+ - num_mask_aia_channels: Number of channels to randomly mask.
166
+ """
167
+ self.num_channels = num_channels
168
+ self.num_mask_aia_channels = num_mask_aia_channels
169
+ self.drop_hmi_probability = drop_hmi_probability
170
+
171
+ def __call__(self, input_tensor):
172
+ C, T, H, W = input_tensor.shape # Unpack channels, time, height, width
173
+
174
+ # Randomly select channels to mask
175
+ channels_to_mask = random.sample(range(C), self.num_mask_aia_channels)
176
+
177
+ # Create a broadcastable mask of shape [num_channels, 1, 1, 1]
179
+ mask = torch.ones((C, 1, 1, 1))
180
+ mask[channels_to_mask, ...] = 0 # Set selected channels to zero
181
+
182
+ # Apply the mask; broadcasting zeroes the selected channels over time and space
183
+ masked_tensor = input_tensor * mask
183
+
184
+ if self.drop_hmi_probability > random.random():
185
+ masked_tensor[-1, ...] = 0
186
+
187
+ return masked_tensor
188
+
189
+
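+ def _demo_channel_masker() -> None:
+     """
+     Minimal sketch with illustrative sizes (13 channels, 2 timestamps, 4x4 pixels):
+     the masker zeroes `num_mask_aia_channels` randomly chosen channels and, with
+     probability `drop_hmi_probability`, the last channel as well.
+     """
+     masker = RandomChannelMaskerTransform(
+         num_channels=13, num_mask_aia_channels=2, phase="train", drop_hmi_probability=0.0
+     )
+     masked = masker(torch.ones(13, 2, 4, 4))                  # (C, T, H, W)
+     assert int((masked.sum(dim=(1, 2, 3)) == 0).sum()) == 2   # exactly two channels zeroed
+
+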
190
+ class HelioNetCDFDataset(Dataset):
191
+ """
192
+ PyTorch dataset to load a curated dataset from the NASA Solar Dynamics
193
+ Observatory (SDO) mission stored as NetCDF files, with handling for variable timesteps.
194
+
195
+ Internally maintains two databases. The first is `self.index`. This takes the
196
+ form
197
+ path present
198
+ timestep
199
+ 2011-01-01 00:00:00 /lustre/fs0/scratch/shared/data/2011/01/Arka_2... 1
200
+ 2011-01-01 00:12:00 /lustre/fs0/scratch/shared/data/2011/01/Arka_2... 1
201
+ ... ... ...
202
+ 2012-11-30 23:48:00 /lustre/fs0/scratch/shared/data/2012/11/Arka_2... 1
203
+
204
+ The second is `self.valid_indices`. This is simply a list of timesteps -- entries
205
+ in the index of `self.index` -- which define valid samples. A sample is valid
206
+ when all timestamps that can be reached by entris in
207
+ time_delta_input_minutes and time_delta_target_minutes can be reached from it
208
+ are present.
209
+ """
210
+
211
+ def __init__(
212
+ self,
213
+ index_path: str,
214
+ time_delta_input_minutes: list[int],
215
+ time_delta_target_minutes: int,
216
+ n_input_timestamps: int,
217
+ rollout_steps: int,
218
+ scalers=None,
219
+ num_mask_aia_channels: int = 0,
220
+ drop_hmi_probability: float = 0.0,
221
+ use_latitude_in_learned_flow=False,
222
+ channels: list[str] | None = None,
223
+ phase="train",
224
+ pooling: int | None = None,
225
+ random_vert_flip: bool = False,
226
+ ):
227
+ self.scalers = scalers
228
+ self.phase = phase
229
+ self.channels = channels
230
+ self.num_mask_aia_channels = num_mask_aia_channels
231
+ self.drop_hmi_probability = drop_hmi_probability
232
+ self.n_input_timestamps = n_input_timestamps
233
+ self.rollout_steps = rollout_steps
234
+ self.use_latitude_in_learned_flow = use_latitude_in_learned_flow
235
+ self.pooling = pooling if pooling is not None else 1
236
+ self.random_vert_flip = random_vert_flip
237
+
238
+ if self.channels is None:
239
+ # AIA + HMI channels
240
+ self.channels = [
241
+ "0094",
242
+ "0131",
243
+ "0171",
244
+ "0193",
245
+ "0211",
246
+ "0304",
247
+ "0335",
248
+ "hmi",
249
+ ]
250
+ self.in_channels = len(self.channels)
251
+
252
+ self.masker = RandomChannelMaskerTransform(
253
+ num_channels=self.in_channels,
254
+ num_mask_aia_channels=self.num_mask_aia_channels,
255
+ phase=self.phase,
256
+ drop_hmi_probability=self.drop_hmi_probability,
257
+ )
258
+
259
+ # Convert time delta to numpy timedelta64
260
+ self.time_delta_input_minutes = sorted(
261
+ np.timedelta64(t, "m") for t in time_delta_input_minutes
262
+ )
263
+ self.time_delta_target_minutes = [
264
+ np.timedelta64(iroll * time_delta_target_minutes, "m")
265
+ for iroll in range(1, rollout_steps + 2)
266
+ ]
267
+
268
+ # Create the index
269
+ self.index = pd.read_csv(index_path)
270
+ self.index = self.index[self.index["present"] == 1]
271
+ self.index["timestep"] = pd.to_datetime(self.index["timestep"]).values.astype(
272
+ "datetime64[ns]"
273
+ )
274
+ self.index.set_index("timestep", inplace=True)
275
+ self.index.sort_index(inplace=True)
276
+
277
+ # Filter out rows where the sequence is not fully present
278
+ self.valid_indices = self.filter_valid_indices()
279
+ self.adjusted_length = len(self.valid_indices)
280
+
281
+ self.rank = get_rank()
282
+ self.logger: Logger | None = None
283
+
284
+ def create_logger(self):
285
+ """
286
+ Creates a logger attached to self.logger.
287
+ The logger is identified by SLURM job ID
288
+ as well as the data processes rank and process ID.
289
+ """
290
+ os.makedirs("logs/data", exist_ok=True)
291
+ timestamp = datetime.now().strftime("%Y%m%dT%H%M%SZ")
292
+ pid = os.getpid()
293
+ self.logger = create_logger(
294
+ output_dir="logs/data",
295
+ dist_rank=self.rank,
296
+ name=f"{timestamp}_{self.rank:>03}_data_{self.phase}_{pid}",
297
+ )
298
+
299
+ def filter_valid_indices(self):
300
+ """
301
+ Extracts timestamps from the index of self.index that define valid
302
+ samples.
303
+
304
+ Args:
305
+ Returns:
306
+ List of timestamps.
307
+ """
308
+
309
+ valid_indices = []
310
+ time_deltas = np.unique(
311
+ self.time_delta_input_minutes + self.time_delta_target_minutes
312
+ )
313
+
314
+ for reference_timestep in self.index.index:
315
+ required_timesteps = reference_timestep + time_deltas
316
+
317
+ if all(t in self.index.index for t in required_timesteps):
318
+ valid_indices.append(reference_timestep)
319
+
320
+ return valid_indices
321
+
322
+ def __len__(self):
323
+ return self.adjusted_length
324
+
325
+ def __getitem__(self, idx: int) -> dict:
326
+ """
327
+ Args:
328
+ idx: Index of sample to load. (Pytorch standard.)
329
+ Returns:
330
+ Dictionary with following keys. The values are tensors with shape as follows:
331
+ ts (torch.Tensor): C, T, H, W
332
+ time_delta_input (torch.Tensor): T
333
+ input_latitude (torch.Tensor): T
334
+ forecast (torch.Tensor): C, L, H, W
335
+ lead_time_delta (torch.Tensor): L
336
+ forecast_latitude (torch.Tensor): L
337
+ C - Channels, T - Input times, H - Image height, W - Image width, L - Lead time.
338
+ """
339
+ if self.logger is None:
340
+ self.create_logger()
341
+ self.logger.info(f"HelioNetCDFDataset of length {self.__len__()}.")
342
+
343
+ exception_counter = 0
344
+ max_exception = 100
345
+
346
+ self.logger.info(f"Starting to retrieve index {idx}.")
347
+
348
+ while True:
349
+ try:
350
+ sample = self._get_index_data(idx)
351
+ except Exception as e:
352
+ exception_counter += 1
353
+ if exception_counter >= max_exception:
354
+ raise e
355
+
356
+ reference_timestep = self.valid_indices[idx]
357
+ self.logger.warning(
358
+ f"Failed retrieving index {idx}. Timestamp {reference_timestep}. Attempt {exception_counter}."
359
+ )
360
+
361
+ idx = (idx + 1) % self.__len__()
362
+ else:
363
+ self.logger.info(f"Returning index {idx}.")
364
+ return sample
365
+
366
+ def _get_index_data(self, idx: int) -> dict:
367
+ """
368
+ Args:
369
+ idx: Index of sample to load. (Pytorch standard.)
370
+ Returns:
371
+ Dictionary with following keys. The values are tensors with shape as follows:
372
+ ts (torch.Tensor): C, T, H, W
373
+ time_delta_input (torch.Tensor): T
374
+ input_latitude (torch.Tensor): T
375
+ forecast (torch.Tensor): C, L, H, W
376
+ lead_time_delta (torch.Tensor): L
377
+ forecast_latitude (torch.Tensor): L
378
+ C - Channels, T - Input times, H - Image height, W - Image width, L - Lead time.
379
+ """
380
+ # start_time = time.time()
381
+
382
+ time_deltas = np.array(
383
+ sorted(
384
+ random.sample(
385
+ self.time_delta_input_minutes[:-1], self.n_input_timestamps - 1
386
+ )
387
+ )
388
+ + [self.time_delta_input_minutes[-1]]
389
+ + self.time_delta_target_minutes
390
+ )
391
+ reference_timestep = self.valid_indices[idx]
392
+ required_timesteps = reference_timestep + time_deltas
393
+
394
+ sequence_data = [
395
+ self.transform_data(
396
+ self.load_nc_data(
397
+ self.index.loc[timestep, "path"], timestep, self.channels
398
+ )
399
+ )
400
+ for timestep in required_timesteps
401
+ ]
402
+
403
+ # Split sequence_data into inputs and target
404
+ inputs = sequence_data[: -self.rollout_steps - 1]
405
+ targets = sequence_data[-self.rollout_steps - 1 :]
406
+
407
+ stacked_inputs = np.stack(inputs, axis=1)
408
+ stacked_targets = np.stack(targets, axis=1)
409
+
410
+ timestamps_input = required_timesteps[: -self.rollout_steps - 1]
411
+ timestamps_targets = required_timesteps[-self.rollout_steps - 1 :]
412
+
413
+ if self.num_mask_aia_channels > 0 or self.drop_hmi_probability:
414
+ # assert 0 < self.num_mask_aia_channels < self.in_channels, \
415
+ # f'num_mask_aia_channels = {self.num_mask_aia_channels} should lie between 0 and {self.in_channels}'
416
+
417
+ stacked_inputs = self.masker(stacked_inputs)
418
+
419
+ time_delta_input_float = (
420
+ time_deltas[-self.rollout_steps - 2]
421
+ - time_deltas[: -self.rollout_steps - 1]
422
+ ) / np.timedelta64(1, "h")
423
+ time_delta_input_float = time_delta_input_float.astype(np.float32)
424
+
425
+ lead_time_delta_float = (
426
+ time_deltas[-self.rollout_steps - 2]
427
+ - time_deltas[-self.rollout_steps - 1 :]
428
+ ) / np.timedelta64(1, "h")
429
+ lead_time_delta_float = lead_time_delta_float.astype(np.float32)
430
+
431
+ # print('LocalRank', int(os.environ["LOCAL_RANK"]),
432
+ # 'GlobalRank', int(os.environ["RANK"]),
433
+ # 'worker', torch.utils.data.get_worker_info().id,
434
+ # f': Processed Input: {idx} ',time.time()- start_time)
435
+
436
+ metadata = {
437
+ "timestamps_input": timestamps_input,
438
+ "timestamps_targets": timestamps_targets,
439
+ }
440
+
441
+ if self.random_vert_flip:
442
+ if torch.bernoulli(torch.ones(()) / 2) == 1:
443
+ stacked_inputs = torch.flip(stacked_inputs, dims=(-2,))
444
+ stacked_targets = torch.flip(stacked_targets, dims=(-2,))
445
+
446
+ if self.use_latitude_in_learned_flow:
447
+ from sunpy.coordinates.ephemeris import get_earth
448
+
449
+ sequence_latitude = [
450
+ get_earth(timestep).lat.value for timestep in required_timesteps
451
+ ]
452
+ input_latitudes = sequence_latitude[: -self.rollout_steps - 1]
453
+ target_latitude = sequence_latitude[-self.rollout_steps - 1 :]
454
+
455
+ return {
456
+ "ts": stacked_inputs,
457
+ "time_delta_input": time_delta_input_float,
458
+ "input_latitudes": input_latitudes,
459
+ "forecast": stacked_targets,
460
+ "lead_time_delta": lead_time_delta_float,
461
+ "forecast_latitude": target_latitude,
462
+ }, metadata
463
+
464
+ return {
465
+ "ts": stacked_inputs,
466
+ "time_delta_input": time_delta_input_float,
467
+ "forecast": stacked_targets,
468
+ "lead_time_delta": lead_time_delta_float,
469
+ }, metadata
470
+
471
+ def load_nc_data(
472
+ self, filepath: str, timestep: pd.Timestamp, channels: list[str]
473
+ ) -> np.ndarray:
474
+ """
475
+ Args:
476
+ filepath: String or Pathlike. Points to NetCDF file to open.
477
+ timestep: Identifies timestamp to retrieve.
478
+ Returns:
479
+ Numpy array of shape (C, H, W).
480
+ """
481
+ self.logger.info(f"Reading file {filepath}.")
482
+
483
+ with xr.open_dataset(
484
+ filepath, engine="h5netcdf", chunks=None, cache=False,
485
+ ) as ds:
486
+ data = ds[channels].to_array().load().to_numpy()
487
+
488
+ return data
489
+
490
+ @cache
491
+ def transformation_inputs(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
492
+ means = np.array([self.scalers[ch].mean for ch in self.channels])
493
+ stds = np.array([self.scalers[ch].std for ch in self.channels])
494
+ epsilons = np.array([self.scalers[ch].epsilon for ch in self.channels])
495
+ sl_scale_factors = np.array(
496
+ [self.scalers[ch].sl_scale_factor for ch in self.channels]
497
+ )
498
+
499
+ return means, stds, epsilons, sl_scale_factors
500
+
501
+ def transform_data(self, data: np.ndarray) -> np.ndarray:
502
+ """
503
+ Applies scalers.
504
+
505
+ Args:
506
+ data: Numpy array of shape (C, H, W)
507
+ Returns:
508
+ Tensor of shape (C, H, W). Data type float32.
509
+ Uses:
510
+ numba to speed up transform
511
+ tvk-srm-heliofm environment cloned from srm-heliofm with numba added
512
+ tvk_dgx_slurm.sh shell script modified to use new environment and new jobname
513
+ train_spectformer_dgx.yaml new jobname
514
+ """
515
+ assert data.ndim == 3
516
+
517
+ if self.pooling > 1:
518
+ data = skimage.measure.block_reduce(
519
+ data, block_size=(1, self.pooling, self.pooling), func=np.mean
520
+ )
521
+
522
+ means, stds, epsilons, sl_scale_factors = self.transformation_inputs()
523
+ result_np = transform(data, means, stds, sl_scale_factors, epsilons)
524
+ return result_np
surya/datasets/transformations.py ADDED
@@ -0,0 +1,456 @@
1
+ import abc
2
+ from logging import info
3
+ from typing import Tuple
4
+
5
+ import numpy as np
6
+ import torch
7
+ import xarray as xr
8
+
9
+
10
+ class Transformation(object):
11
+ @abc.abstractmethod
12
+ def fit(self, data: xr.DataArray):
13
+ raise NotImplementedError()
14
+
15
+ @abc.abstractmethod
16
+ def transform(self, data: xr.DataArray):
17
+ raise NotImplementedError()
18
+
19
+ @abc.abstractmethod
20
+ def inverse_transform(self, data: xr.DataArray):
21
+ raise NotImplementedError()
22
+
23
+ @abc.abstractmethod
24
+ def fit_transform(self, data: xr.DataArray):
25
+ return self.fit(data).transform(data)
26
+
27
+ @abc.abstractmethod
28
+ def to_dict(self) -> dict:
29
+ raise NotImplementedError()
30
+
31
+ @staticmethod
32
+ @abc.abstractmethod
33
+ def from_dict(info: dict):
34
+ raise NotImplementedError()
35
+
36
+ @abc.abstractmethod
37
+ def reset(self):
38
+ raise NotImplementedError()
39
+
40
+
41
+ class MinMaxScaler(Transformation):
42
+ """_summary_
43
+ Minmax scaling on the entire data
44
+ """
45
+
46
+ def __init__(self, new_min=1, new_max=2):
47
+ self._is_fitted = False
48
+ self.new_min = new_min
49
+ self.new_max = new_max
50
+ self._min = None
51
+ self._max = None
52
+
53
+ @property
54
+ def min(self) -> float:
55
+ return self._min
56
+
57
+ @property
58
+ def max(self) -> float:
59
+ return self._max
60
+
61
+ @property
62
+ def is_fitted(self) -> bool:
63
+ return self._is_fitted
64
+
65
+ def fit(self, data: xr.DataArray):
66
+ if not self.is_fitted:
67
+ self._max = data.max().values
68
+ self._min = data.min().values
69
+ self._is_fitted = True
70
+ else:
71
+ info("Already fitted, skipping function.")
72
+ return self
73
+
74
+ def _transform(self, data: xr.DataArray):
75
+ return (
76
+ ((data - self.min) / (self.max - self.min)) * (self.new_max - self.new_min)
77
+ ) + self.new_min
78
+
79
+ def transform(self, data: xr.DataArray) -> xr.DataArray:
80
+ assert self.min is not None and self.max is not None, "You must run fit first."
81
+
82
+ data = xr.apply_ufunc(self._transform, data, dask="forbidden")
83
+
84
+ return data
85
+
86
+ def fit_transform(self, data):
87
+ self.fit(data)
88
+ return self.transform(data)
89
+
90
+ def inverse_transform(self, data):
91
+ return data * (self.max - self.min) + self.min
92
+
93
+ def to_dict(self) -> dict:
94
+ out_dict = {
95
+ "base": self.__module__,
96
+ "class": self.__class__.__name__,
97
+ "new_min": str(self.new_min),
98
+ "new_max": str(self.new_max),
99
+ "min": str(self.min),
100
+ "max": str(self.max),
101
+ "is_fitted": self.is_fitted,
102
+ }
103
+ return out_dict
104
+
105
+ @staticmethod
106
+ def from_dict(info: dict):
107
+ # with open(yaml_path, 'r') as file:
108
+ # data = yaml.load(file, Loader=yaml.SafeLoader)
109
+ out = MinMaxScaler(
110
+ new_min=np.float32(info["new_min"]), new_max=np.float32(info["new_max"])
111
+ )
112
+ out._min = np.float32(info["min"])
113
+ out._max = np.float32(info["max"])
114
+ out._is_fitted = info["is_fitted"]
115
+ return out
116
+
117
+ def reset(self):
118
+ self.__init__(self.new_min, self.new_max)
119
+
120
+ def __str__(self):
121
+ return (
122
+ f"min: {self.min}, "
123
+ f"max: {self.max}, "
124
+ f"new_max: {self.new_max}, "
125
+ f"new_min: {self.new_min}"
126
+ )
127
+
128
+
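+ def _demo_minmax_scaler() -> None:
+     """
+     Minimal sketch with made-up values: fitting on a small DataArray maps its range
+     onto [new_min, new_max], here the default [1, 2].
+     """
+     arr = xr.DataArray(np.array([0.0, 5.0, 10.0]))
+     scaler = MinMaxScaler(new_min=1, new_max=2)
+     scaled = scaler.fit_transform(arr)
+     assert float(scaled.min()) == 1.0 and float(scaled.max()) == 2.0
+
+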
129
+ class StandardScaler(Transformation):
130
+ """_summary_
131
+ Standard scaling on the entire data
132
+ """
133
+
134
+ def __init__(self, epsilon=1e-8):
135
+ self.epsilon = epsilon
136
+ self._is_fitted = False
137
+ self._mean = None
138
+ self._std = None
139
+ self._min = None
140
+ self._max = None
141
+ self._sl_scale_factor = None
142
+
143
+ @property
144
+ def mean(self) -> float:
145
+ return self._mean
146
+
147
+ @property
148
+ def std(self) -> float:
149
+ return self._std
150
+
151
+ @property
152
+ def min(self) -> float:
153
+ return self._min
154
+
155
+ @property
156
+ def max(self) -> float:
157
+ return self._max
158
+
159
+ @property
160
+ def sl_scale_factor(self) -> float:
161
+ return self._sl_scale_factor
162
+
163
+ @property
164
+ def is_fitted(self) -> bool:
165
+ return self._is_fitted
166
+
167
+ def fit(self, data):
168
+ if not self.is_fitted:
169
+ self._mean = data.mean().values
170
+ self._std = data.std().values
171
+ self._min = data.min().values
172
+ self._max = data.max().values
173
+ self._is_fitted = True
174
+ else:
175
+ info("Already fitted, skipping function.")
176
+
177
+ return self
178
+
179
+ def _transform(self, data: xr.DataArray):
180
+ return (data - self.mean) / (self.std + self.epsilon)
181
+
182
+ def _signum_log_transform(self, data: xr.DataArray):
183
+ data = data * self.sl_scale_factor
184
+ return np.sign(data) * np.log1p(np.abs(data))
185
+
186
+ def signum_log_transform(self, data: xr.DataArray):
187
+ assert self.mean is not None and self.std is not None, "You must run fit first."
188
+
189
+ data = xr.apply_ufunc(self._signum_log_transform, data, dask="forbidden")
190
+ data = xr.apply_ufunc(self._transform, data, dask="forbidden")
191
+ return data
192
+
193
+ def transform(self, data: xr.DataArray):
194
+ assert self.mean is not None and self.std is not None, "You must run fit first."
195
+
196
+ data = xr.apply_ufunc(self._transform, data, dask="forbidden")
197
+ return data
198
+
199
+ def fit_transform(self, data: xr.DataArray):
200
+ self.fit(data)
201
+ return self.transform(data)
202
+
203
+ def inverse_transform(self, data):
204
+ if isinstance(data, torch.Tensor):
205
+ return data * (
206
+ torch.Tensor([self.std]).to(data.device)
207
+ + torch.Tensor([self.epsilon]).to(data.device)
208
+ ) + torch.Tensor([self.mean]).to(data.device)
209
+ else:
210
+ return data * (self.std + self.epsilon) + self.mean
211
+
212
+ def inverse_signum_log_transform(self, data):
213
+ if isinstance(data, torch.Tensor):
214
+ return (
215
+ torch.sign(data)
216
+ * torch.expm1(torch.abs(data))
217
+ / torch.Tensor([self.sl_scale_factor]).to(data.device)
218
+ )
219
+ else:
220
+ return np.sign(data) * np.expm1(np.abs(data)) / self.sl_scale_factor
221
+
222
+ def to_dict(self) -> dict:
223
+ return {
224
+ "base": self.__module__,
225
+ "class": self.__class__.__name__,
226
+ "epsilon": str(self.epsilon),
227
+ "mean": str(self.mean),
228
+ "std": str(self.std),
229
+ "is_fitted": self.is_fitted,
230
+ "min": str(self.min),
231
+ "max": str(self.max),
232
+ "sl_scale_factor": str(self.sl_scale_factor),
233
+ }
234
+
235
+ @staticmethod
236
+ def from_dict(info: dict):
237
+ out = StandardScaler(epsilon=np.float32(info["epsilon"]))
238
+ out._mean = np.float32(info["mean"])
239
+ out._std = np.float32(info["std"])
240
+ out._is_fitted = info["is_fitted"]
241
+ out._min = np.float32(info["min"])
242
+ out._max = np.float32(info["max"])
243
+ out._sl_scale_factor = np.float32(info["sl_scale_factor"])
244
+ return out
245
+
246
+ def reset(self):
247
+ self.__init__(self.epsilon)
248
+
249
+ def __str__(self):
250
+ return f"mean: {self.mean}, " f"std: {self.std}, " f"epsilon: {self.epsilon}"
251
+
252
+
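+ def _demo_standard_scaler_roundtrip() -> None:
+     """
+     Minimal sketch with made-up statistics (not values from scalers.yaml): a fitted
+     StandardScaler serialises to a plain dict via `to_dict` and is rebuilt with
+     `from_dict`. The private attributes are set directly here purely for illustration.
+     """
+     scaler = StandardScaler(epsilon=1e-8)
+     scaler._mean, scaler._std = np.float32(1.5), np.float32(0.5)
+     scaler._min, scaler._max = np.float32(0.0), np.float32(10.0)
+     scaler._sl_scale_factor = np.float32(1.0)
+     scaler._is_fitted = True
+     restored = StandardScaler.from_dict(scaler.to_dict())
+     assert restored.mean == scaler.mean and restored.std == scaler.std
+
+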
253
+ class MaskUnits2D:
254
+ """
255
+ Transformation that takes a tuple of numpy tensors and returns a sequence of mask units. These are generally in the form `channel, dim_0, dim_1, dim_2, ...`. The returned data is largely of shape `mask unit sequence, channel, lat, lon`. Masked patches are not returned.
256
+ The return values contain sets of indices. The indices indicate which mask units were dropped (masked) or not. The 1D indexing here simply relies on flattening the 2D space of mask units. The class methods `reconstruct` and `reconstruct_batch` show how to re-assemble the entire sequence.
257
+ """
258
+
259
+ def __init__(
260
+ self,
261
+ n_lat_mu: int,
262
+ n_lon_mu: int,
263
+ padding,
264
+ seed=None,
265
+ mask_ratio_vals: float = 0.5,
266
+ mask_ratio_tars: float = 0.0,
267
+ n_lats: int = 361,
268
+ n_lons: int = 576,
269
+ ):
270
+ self.n_lat_mu = n_lat_mu
271
+ self.n_lon_mu = n_lon_mu
272
+ self.mask_ratio_vals = mask_ratio_vals
273
+ self.mask_ratio_tars = mask_ratio_tars
274
+ self.padding = padding
275
+ self.n_lats = n_lats + padding[0][0] + padding[0][1]
276
+ self.n_lons = n_lons + padding[1][0] + padding[1][1]
277
+
278
+ if self.n_lats % n_lat_mu != 0:
279
+ raise ValueError(
280
+ f"Padded latitudes {self.n_lats} are not an integer multiple of the mask unit size {n_lat_mu}."
281
+ )
282
+ if self.n_lons % n_lon_mu != 0:
283
+ raise ValueError(
284
+ f"Padded longitudes {self.n_lons} are not an integer multiple of the mask unit size {n_lon_mu}."
285
+ )
286
+
287
+ self.mask_shape = (self.n_lats // self.n_lat_mu, self.n_lons // self.n_lon_mu)
288
+
289
+ self.rng = np.random.default_rng(seed=seed)
290
+
291
+ def n_units_masked(self, mask_type="vals"):
292
+ if mask_type == "vals":
293
+ return int(self.mask_ratio_vals * np.prod(self.mask_shape))
294
+ elif mask_type == "tars":
295
+ return int(self.mask_ratio_tars * np.prod(self.mask_shape))
296
+ else:
297
+ raise ValueError(
298
+ f"`{mask_type}` not an allowed value for `mask_type`. Use `vals` or `tars`."
299
+ )
300
+
301
+ @staticmethod
302
+ def reconstruct(
303
+ idx_masked: torch.Tensor,
304
+ idx_unmasked: torch.Tensor,
305
+ data_masked: torch.Tensor,
306
+ data_unmasked: torch.Tensor,
307
+ ) -> torch.Tensor:
308
+ """
309
+ Reconstructs a tensor along the mask unit dimension. Non-batched version.
310
+
311
+ Args:
312
+ idx_masked: Tensor of shape `mask unit sequence`.
313
+ idx_unmasked: Tensor of shape `mask unit sequence`.
314
+ data_masked: Tensor of shape `mask unit sequence, ...`. Should have same size along mask unit sequence dimension as idx_masked. Dimensions beyond the first two, marked here as ... will typically be `local_sequence, channel` or `channel, lat, lon`. These dimensions should agree with data_unmasked.
315
+ data_unmasked: Tensor of shape `mask unit sequence, ...`. Should have same size along mask unit sequence dimension as idx_unmasked. Dimensions beyond the first two, marked here as ... will typically be `local_sequence, channel` or `channel, lat, lon`. These dimensions should agree with data_masked.
316
+ Returns:
317
+ Tensor of same shape as inputs data_masked and data_unmasked. I.e. `mask unit sequence, ...`.
318
+ """
319
+ idx_total = torch.argsort(torch.cat([idx_masked, idx_unmasked], dim=0), dim=0)
320
+ idx_total = idx_total.reshape(
321
+ *idx_total.shape,
322
+ *[1 for _ in range(len(idx_total.shape), len(data_unmasked.shape))],
323
+ )
324
+ idx_total = idx_total.expand(*idx_total.shape[:1], *data_unmasked.shape[1:])
325
+ data = torch.cat([data_masked, data_unmasked], dim=0)
326
+ data = torch.gather(data, dim=0, index=idx_total)
327
+ return data
328
+
329
+ @staticmethod
330
+ def reconstruct_batch(
331
+ idx_masked: torch.Tensor,
332
+ idx_unmasked: torch.Tensor,
333
+ data_masked: torch.Tensor,
334
+ data_unmasked: torch.Tensor,
335
+ ) -> torch.Tensor:
336
+ """
337
+ Reconstructs a tensor along the mask unit dimension. Batched version.
338
+
339
+ Args:
340
+ idx_masked: Tensor of shape `batch, mask unit sequence`.
341
+ idx_unmasked: Tensor of shape `batch, mask unit sequence`.
342
+ data_masked: Tensor of shape `batch, mask unit sequence, ...`. Should have same size along mask unit sequence dimension as idx_masked. Dimensions beyond the first two, marked here as ... will typically be `local_sequence, channel` or `channel, lat, lon`. These dimensions should agree with data_unmasked.
343
+ data_unmasked: Tensor of shape `batch, mask unit sequence, ...`. Should have same size along mask unit sequence dimension as idx_unmasked. Dimensions beyond the first two, marked here as ... will typically be `local_sequence, channel` or `channel, lat, lon`. These dimensions should agree with data_masked.
344
+ Returns:
345
+ Tensor of same shape as inputs data_masked and data_unmasked. I.e. `batch, mask unit sequence, ...`.
346
+ """
347
+ idx_total = torch.argsort(torch.cat([idx_masked, idx_unmasked], dim=1), dim=1)
348
+ idx_total = idx_total.reshape(
349
+ *idx_total.shape,
350
+ *[1 for _ in range(len(idx_total.shape), len(data_unmasked.shape))],
351
+ )
352
+ idx_total = idx_total.expand(*idx_total.shape[:2], *data_unmasked.shape[2:])
353
+ data = torch.cat([data_masked, data_unmasked], dim=1)
354
+ data = torch.gather(data, dim=1, index=idx_total)
355
+ return data
356
+
357
+ def __call__(self, data: Tuple[np.ndarray, ...]) -> Tuple[torch.Tensor, ...]:
358
+ """
359
+ Args:
360
+ data: Tuple of numpy tensors. These are interpreted as `(sur_static, ulv_static, sur_vals, ulv_vals, sur_tars, ulv_tars)`.
361
+ Returns:
362
+ Tuple of torch tensors. If the target is unmasked (`mask_ratio_tars` is zero), the tuple contains
363
+ `(static, indices_masked_vals, indices_unmasked_vals, vals, tars)`. When targets are masked as well, we are dealing with
364
+ `(static, indices_masked_vals, indices_unmasked_vals, vals, indices_masked_tars, indices_unmasked_tars, tars)`.
365
+ Their shapes are as follows:
366
+ static: mask unit sequence, channel, lat, lon
367
+ indices_masked_vals: mask unit sequence
368
+ indices_unmasked_vals: mask unit sequence
369
+ vals: mask unit sequence, channel, lat, lon
370
+ tars: mask unit sequence, channel, lat, lon
371
+ """
372
+ sur_static, ulv_static, sur_vals, ulv_vals, sur_tars, ulv_tars = data
373
+
374
+ sur_vals, ulv_vals = np.squeeze(sur_vals, axis=1), np.squeeze(ulv_vals, axis=1)
375
+ sur_tars, ulv_tars = np.squeeze(sur_tars, axis=1), np.squeeze(ulv_tars, axis=1)
376
+
377
+ vals = np.concatenate(
378
+ [
379
+ sur_vals,
380
+ ulv_vals.reshape(
381
+ ulv_vals.shape[0] * ulv_vals.shape[1], *ulv_vals.shape[-2:]
382
+ ),
383
+ ],
384
+ axis=0,
385
+ )
386
+ tars = np.concatenate(
387
+ [
388
+ sur_tars,
389
+ ulv_tars.reshape(
390
+ ulv_tars.shape[0] * ulv_tars.shape[1], *ulv_tars.shape[-2:]
391
+ ),
392
+ ],
393
+ axis=0,
394
+ )
395
+
396
+ padding = ((0, 0), *self.padding)
397
+ static = np.pad(sur_static, padding)
398
+ vals = np.pad(vals, padding)
399
+ tars = np.pad(tars, padding)
400
+
401
+ static = static.reshape(
402
+ static.shape[0],
403
+ static.shape[-2] // self.n_lat_mu,
404
+ self.n_lat_mu,
405
+ static.shape[-1] // self.n_lon_mu,
406
+ self.n_lon_mu,
407
+ ).transpose(1, 3, 0, 2, 4)
408
+ vals = vals.reshape(
409
+ vals.shape[0],
410
+ vals.shape[-2] // self.n_lat_mu,
411
+ self.n_lat_mu,
412
+ vals.shape[-1] // self.n_lon_mu,
413
+ self.n_lon_mu,
414
+ ).transpose(1, 3, 0, 2, 4)
415
+ tars = tars.reshape(
416
+ tars.shape[0],
417
+ tars.shape[-2] // self.n_lat_mu,
418
+ self.n_lat_mu,
419
+ tars.shape[-1] // self.n_lon_mu,
420
+ self.n_lon_mu,
421
+ ).transpose(1, 3, 0, 2, 4)
422
+
423
+ maskable_indices = np.arange(np.prod(self.mask_shape))
424
+ maskable_indices = self.rng.permutation(maskable_indices)
425
+ indices_masked_vals = maskable_indices[: self.n_units_masked()]
426
+ indices_unmasked_vals = maskable_indices[self.n_units_masked() :]
427
+
428
+ vals = vals.reshape(-1, *vals.shape[2:])[indices_unmasked_vals, :, :, :]
429
+
430
+ if self.mask_ratio_tars > 0.0:
431
+ maskable_indices = np.arange(np.prod(self.mask_shape))
432
+ maskable_indices = self.rng.permutation(maskable_indices)
433
+ indices_masked_tars = maskable_indices[: self.n_units_masked("tars")]
434
+ indices_unmasked_tars = maskable_indices[self.n_units_masked("tars") :]
435
+
436
+ tars = tars.reshape(-1, *tars.shape[2:])[indices_unmasked_tars, :, :, :]
437
+
438
+ return_value = (
439
+ torch.from_numpy(static).flatten(0, 1),
440
+ torch.from_numpy(indices_masked_vals),
441
+ torch.from_numpy(indices_unmasked_vals),
442
+ torch.from_numpy(vals),
443
+ torch.from_numpy(indices_masked_tars),
444
+ torch.from_numpy(indices_unmasked_tars),
445
+ torch.from_numpy(tars),
446
+ )
447
+ return return_value
448
+ else:
449
+ return_value = (
450
+ torch.from_numpy(static).flatten(0, 1),
451
+ torch.from_numpy(indices_masked_vals),
452
+ torch.from_numpy(indices_unmasked_vals),
453
+ torch.from_numpy(vals),
454
+ torch.from_numpy(tars).flatten(0, 1),
455
+ )
456
+ return return_value
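+
+
+ def _demo_reconstruct() -> None:
+     """
+     Minimal sketch with hypothetical indices and payloads: `MaskUnits2D.reconstruct`
+     merges masked and unmasked mask units back into their original order. Here units
+     2 and 0 were masked, units 1 and 3 were kept, and the output restores order 0..3.
+     """
+     idx_masked = torch.tensor([2, 0])
+     idx_unmasked = torch.tensor([1, 3])
+     data_masked = torch.tensor([[20.0], [0.0]])     # payloads for units 2 and 0
+     data_unmasked = torch.tensor([[10.0], [30.0]])  # payloads for units 1 and 3
+     merged = MaskUnits2D.reconstruct(idx_masked, idx_unmasked, data_masked, data_unmasked)
+     assert torch.equal(merged, torch.tensor([[0.0], [10.0], [20.0], [30.0]]))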
surya/models/__init__.py ADDED
File without changes
surya/models/embedding.py ADDED
@@ -0,0 +1,483 @@
1
+ """
2
+ Perceiver code is based on Aurora: https://github.com/microsoft/aurora/blob/main/aurora/model/perceiver.py
3
+
4
+ Some conventions for notation:
5
+ B - Batch
6
+ T - Time
7
+ H - Height (pixel space)
8
+ W - Width (pixel space)
9
+ HT - Height (token space)
10
+ WT - Width (token space)
11
+ ST - Sequence (token space)
12
+ C - Input channels
13
+ D - Model (embedding) dimension
14
+ """
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+ from einops import rearrange
20
+ from timm.models.layers import trunc_normal_
21
+
22
+
23
+ class PatchEmbed3D(nn.Module):
24
+ """Timeseries Image to Patch Embedding"""
25
+
26
+ def __init__(
27
+ self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, time_dim=2
28
+ ):
29
+ super().__init__()
30
+ self.img_size = img_size
31
+ self.patch_size = patch_size
32
+ self.embed_dim = embed_dim
33
+ self.time_dim = time_dim
34
+
35
+ self.proj = nn.Conv2d(
36
+ in_chans * time_dim,
37
+ embed_dim,
38
+ kernel_size=(patch_size, patch_size),
39
+ stride=(patch_size, patch_size),
40
+ )
41
+
42
+ def forward(self, x):
43
+ """
44
+ Args:
45
+ x: Tensor of shape (B, C, T, H, W)
46
+ Returns:
47
+ Tensor of shape (B, ST, D)
48
+ """
49
+ B, C, T, H, W = x.shape
50
+ x = self.proj(x.flatten(1, 2)) # (B, C, T, H, W) -> (B, D, HT, WT)
51
+ x = rearrange(x, "B D HT WT -> B (HT WT) D") # (B, N, D)
52
+ return x
53
+
54
+
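+ # Minimal sketch of the patch-embedding shape contract with illustrative sizes
+ # (not the Surya configuration): a (B, C, T, H, W) input is folded to C*T channels,
+ # patchified by a strided convolution, and flattened into a token sequence.
+ def _demo_patch_embed_shapes() -> None:
+     embed = PatchEmbed3D(img_size=16, patch_size=4, in_chans=13, embed_dim=32, time_dim=2)
+     x = torch.zeros(1, 13, 2, 16, 16)               # (B, C, T, H, W)
+     assert embed(x).shape == (1, 16, 32)            # (B, (H/4)*(W/4), D)
+
+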
55
+ class LinearEmbedding(nn.Module):
56
+ def __init__(
57
+ self,
58
+ img_size=224,
59
+ patch_size=16,
60
+ in_chans=3,
61
+ time_dim=2,
62
+ embed_dim=768,
63
+ drop_rate=0.0,
64
+ ):
65
+ super().__init__()
66
+
67
+ self.num_patches = (img_size // patch_size) ** 2
68
+
69
+ self.patch_embed = PatchEmbed3D(
70
+ img_size=img_size,
71
+ patch_size=patch_size,
72
+ in_chans=in_chans,
73
+ embed_dim=embed_dim,
74
+ time_dim=time_dim,
75
+ )
76
+
77
+ self._generate_position_encoding(img_size, patch_size, embed_dim)
78
+
79
+ self.pos_drop = nn.Dropout(p=drop_rate)
80
+
81
+ def _generate_position_encoding(self, img_size, patch_size, embed_dim):
82
+ """
83
+ Generates a positional encoding signal for the model. The generated
84
+ positional encoding signal is stored as a buffer (`self.fourier_signal`).
85
+
86
+ Args:
87
+ img_size (int): The size of the input image.
88
+ patch_size (int): The size of each patch in the image.
89
+ embed_dim (int): The embedding dimension of the model.
90
+
91
+ Returns:
92
+ None.
93
+ """
94
+ # Generate signal of shape (C, H, W)
95
+ x = torch.linspace(0.0, 1.0, img_size // patch_size)
96
+ y = torch.linspace(0.0, 1.0, img_size // patch_size)
97
+ x, y = torch.meshgrid(x, y, indexing="xy")
98
+ fourier_signal = []
99
+
100
+ frequencies = torch.linspace(1, (img_size // patch_size) / 2.0, embed_dim // 4)
101
+
102
+ for f in frequencies:
103
+ fourier_signal.extend(
104
+ [
105
+ torch.cos(2.0 * torch.pi * f * x),
106
+ torch.sin(2.0 * torch.pi * f * x),
107
+ torch.cos(2.0 * torch.pi * f * y),
108
+ torch.sin(2.0 * torch.pi * f * y),
109
+ ]
110
+ )
111
+ fourier_signal = torch.stack(fourier_signal, dim=2)
112
+ fourier_signal = rearrange(fourier_signal, "h w c -> 1 (h w) c")
113
+ self.register_buffer("pos_embed", fourier_signal)
114
+
115
+ def forward(self, x, dt):
116
+ """
117
+ Args:
118
+ x: Tensor of shape (B, C, T, H, W).
119
+ dt: Tensor of shape (B, T). However it is not used.
120
+ Returns:
121
+ Tensor of shape (B, ST, D)
122
+ """
123
+ x = self.patch_embed(x)
124
+ x = x + self.pos_embed
125
+ x = self.pos_drop(x)
126
+
127
+ return x
128
+
129
+
130
+ class LinearDecoder(nn.Module):
131
+ def __init__(
132
+ self,
133
+ patch_size: int,
134
+ out_chans: int,
135
+ embed_dim: int,
136
+ ):
137
+ """
138
+ Args:
139
+ patch_size: patch size
140
+ out_chans: number of output channels
141
+ embed_dim: embedding dimension
142
+ """
143
+ super().__init__()
144
+
145
+ self.unembed = nn.Sequential(
146
+ nn.Conv2d(
147
+ in_channels=embed_dim,
148
+ out_channels=(patch_size**2) * out_chans,
149
+ kernel_size=1,
150
+ ),
151
+ nn.PixelShuffle(patch_size),
152
+ )
153
+
154
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
155
+ """
156
+ Args:
157
+ x: Tensor of shape (B, L, D). For ensembles, we have implicitly B = (B E).
158
+ Returns:
159
+ Tensor of shape (B C H W).
160
+ Here
161
+ - C equals num_queries
162
+ - H == W == sqrt(L) x patch_size
163
+ """
164
+ # Reshape the tokens to 2d token space: (B, C, H_token, W_token)
165
+ _, L, _ = x.shape
166
+ H_token = W_token = int(L**0.5)
167
+ x = rearrange(x, "B (H W) D -> B D H W", H=H_token, W=W_token)
168
+
169
+ # Unembed the tokens. Convolution + pixel shuffle.
170
+ x = self.unembed(x)
171
+
172
+ return x
173
+
174
+
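+ # Minimal sketch of the decoder's shape contract with illustrative sizes: L = 16
+ # tokens form a 4x4 token grid, and the 1x1 convolution plus PixelShuffle upsample
+ # each token by `patch_size`, giving (B, out_chans, 4 * patch_size, 4 * patch_size).
+ def _demo_linear_decoder_shapes() -> None:
+     decoder = LinearDecoder(patch_size=2, out_chans=13, embed_dim=32)
+     tokens = torch.zeros(1, 16, 32)                 # (B, L, D)
+     assert decoder(tokens).shape == (1, 13, 8, 8)   # (B, C, H, W)
+
+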
175
+ class MLP(nn.Module):
176
+ """A simple one-hidden-layer MLP."""
177
+
178
+ def __init__(self, dim: int, hidden_features: int, dropout: float = 0.0) -> None:
179
+ """Initialise.
180
+
181
+ Args:
182
+ dim (int): Input dimensionality.
183
+ hidden_features (int): Width of the hidden layer.
184
+ dropout (float, optional): Drop-out rate. Defaults to no drop-out.
185
+ """
186
+ super().__init__()
187
+ self.net = nn.Sequential(
188
+ nn.Linear(dim, hidden_features),
189
+ nn.GELU(),
190
+ nn.Linear(hidden_features, dim),
191
+ nn.Dropout(dropout),
192
+ )
193
+
194
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
195
+ """Run the MLP."""
196
+ return self.net(x)
197
+
198
+
199
+ class PerceiverAttention(nn.Module):
200
+ """Cross attention module from the Perceiver architecture."""
201
+
202
+ def __init__(
203
+ self,
204
+ latent_dim: int,
205
+ context_dim: int,
206
+ head_dim: int = 64,
207
+ num_heads: int = 8,
208
+ ) -> None:
209
+ """Initialise.
210
+
211
+ Args:
212
+ latent_dim (int): Dimensionality of the latent features given as input.
213
+ context_dim (int): Dimensionality of the context features also given as input.
214
+ head_dim (int): Attention head dimensionality.
215
+ num_heads (int): Number of heads.
216
+ """
217
+ super().__init__()
218
+ self.num_heads = num_heads
219
+ self.head_dim = head_dim
220
+ self.inner_dim = head_dim * num_heads
221
+
222
+ self.to_q = nn.Linear(latent_dim, self.inner_dim, bias=False)
223
+ self.to_kv = nn.Linear(context_dim, self.inner_dim * 2, bias=False)
224
+ self.to_out = nn.Linear(self.inner_dim, latent_dim, bias=False)
225
+
226
+ def forward(self, latents: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
227
+ """Run the cross-attention module.
228
+
229
+ Args:
230
+ latents (:class:`torch.Tensor`): Latent features of shape `(B, L1, Latent_D)`
231
+ where typically `L1 < L2` and `Latent_D <= Context_D`. `Latent_D` is equal to
232
+ `self.latent_dim`.
233
+ x (:class:`torch.Tensor`): Context features of shape `(B, L2, Context_D)`.
234
+
235
+ Returns:
236
+ :class:`torch.Tensor`: Latent values of shape `(B, L1, Latent_D)`.
237
+ """
238
+ h = self.num_heads
239
+
240
+ q = self.to_q(latents) # (B, L1, D2) to (B, L1, D)
241
+ k, v = self.to_kv(x).chunk(2, dim=-1) # (B, L2, D1) to twice (B, L2, D)
242
+ q, k, v = map(lambda t: rearrange(t, "b l (h d) -> b h l d", h=h), (q, k, v))
243
+
244
+ out = F.scaled_dot_product_attention(q, k, v)
245
+ out = rearrange(out, "B H L1 D -> B L1 (H D)") # (B, L1, D)
246
+ return self.to_out(out) # (B, L1, Latent_D)
247
+
248
+
249
+ class PerceiverResampler(nn.Module):
250
+ """Perceiver Resampler module from the Flamingo paper."""
251
+
252
+ def __init__(
253
+ self,
254
+ latent_dim: int,
255
+ context_dim: int,
256
+ depth: int = 1,
257
+ head_dim: int = 64,
258
+ num_heads: int = 16,
259
+ mlp_ratio: float = 4.0,
260
+ drop: float = 0.0,
261
+ residual_latent: bool = True,
262
+ ln_eps: float = 1e-5,
263
+ ) -> None:
264
+ """Initialise.
265
+
266
+ Args:
267
+ latent_dim (int): Dimensionality of the latent features given as input.
268
+ context_dim (int): Dimensionality of the context features also given as input.
269
+ depth (int, optional): Number of attention layers.
270
+ head_dim (int, optional): Attention head dimensionality. Defaults to `64`.
271
+ num_heads (int, optional): Number of heads. Defaults to `16`
272
+ mlp_ratio (float, optional): Dimensionality of the hidden layer divided by that of the
273
+ input for all MLPs. Defaults to `4.0`.
274
+ drop (float, optional): Drop-out rate. Defaults to no drop-out.
275
+ residual_latent (bool, optional): Use residual attention w.r.t. the latent features.
276
+ Defaults to `True`.
277
+ ln_eps (float, optional): Epsilon in the layer normalisation layers. Defaults to
278
+ `1e-5`.
279
+ """
280
+ super().__init__()
281
+
282
+ self.residual_latent = residual_latent
283
+ self.layers = nn.ModuleList([])
284
+ mlp_hidden_dim = int(latent_dim * mlp_ratio)
285
+ for _ in range(depth):
286
+ self.layers.append(
287
+ nn.ModuleList(
288
+ [
289
+ PerceiverAttention(
290
+ latent_dim=latent_dim,
291
+ context_dim=context_dim,
292
+ head_dim=head_dim,
293
+ num_heads=num_heads,
294
+ ),
295
+ MLP(
296
+ dim=latent_dim, hidden_features=mlp_hidden_dim, dropout=drop
297
+ ),
298
+ nn.LayerNorm(latent_dim, eps=ln_eps),
299
+ nn.LayerNorm(latent_dim, eps=ln_eps),
300
+ ]
301
+ )
302
+ )
303
+
304
+ def forward(self, latents: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
305
+ """Run the module.
306
+
307
+ Args:
308
+ latents (:class:`torch.Tensor`): Latent features of shape `(B, L1, D1)`.
309
+ x (:class:`torch.Tensor`): Context features of shape `(B, L2, D1)`.
310
+
311
+ Returns:
312
+ torch.Tensor: Latent features of shape `(B, L1, D1)`.
313
+ """
314
+ for attn, ff, ln1, ln2 in self.layers:
315
+ # We use post-res-norm like in Swin v2 and most Transformer architectures these days.
316
+ # This empirically works better than the pre-norm used in the original Perceiver.
317
+ attn_out = ln1(attn(latents, x))
318
+ # HuggingFace suggests using non-residual attention in Perceiver might work better when
319
+ # the semantics of the query and the output are different:
320
+ #
321
+ # https://github.com/huggingface/transformers/blob/v4.35.2/src/transformers/models/perceiver/modeling_perceiver.py#L398
322
+ #
323
+ latents = attn_out + latents if self.residual_latent else attn_out
324
+ latents = ln2(ff(latents)) + latents
325
+ return latents
326
+
327
+
328
+ class PerceiverChannelEmbedding(nn.Module):
329
+ def __init__(
330
+ self,
331
+ in_chans: int,
332
+ img_size: int,
333
+ patch_size: int,
334
+ time_dim: int,
335
+ num_queries: int,
336
+ embed_dim: int,
337
+ drop_rate: float,
338
+ ):
339
+ super().__init__()
340
+
341
+ if embed_dim % 2 != 0:
342
+ raise ValueError(
343
+ f"Temporal embeddings require `embed_dim` to be even. Currently we have {embed_dim}."
344
+ )
345
+
346
+ self.num_patches = (img_size // patch_size) ** 2
347
+ self.num_queries = num_queries
348
+ self.embed_dim = embed_dim
349
+
350
+ self.proj = nn.Conv2d(
351
+ in_channels=in_chans * time_dim,
352
+ out_channels=in_chans * embed_dim,
353
+ kernel_size=patch_size,
354
+ stride=patch_size,
355
+ groups=in_chans,
356
+ )
357
+
358
+ self.pos_embed = nn.Parameter(torch.zeros(1, embed_dim, self.num_patches))
359
+ trunc_normal_(self.pos_embed, std=0.02)
360
+
361
+ self.latent_queries = nn.Parameter(torch.zeros(1, num_queries, embed_dim))
362
+ trunc_normal_(self.latent_queries, std=0.02)
363
+
364
+ self.perceiver = PerceiverResampler(
365
+ latent_dim=embed_dim,
366
+ context_dim=embed_dim,
367
+ depth=1,
368
+ head_dim=embed_dim // 16,
369
+ num_heads=16,
370
+ mlp_ratio=4.0,
371
+ drop=0.0,
372
+ residual_latent=False,
373
+ ln_eps=1e-5,
374
+ )
375
+
376
+ self.latent_aggregation = nn.Linear(num_queries * embed_dim, embed_dim)
377
+
378
+ self.pos_drop = nn.Dropout(p=drop_rate)
379
+
380
+ def forward(self, x, dt):
381
+ """
382
+ Args:
383
+ x: Tensor of shape (B, C, T, H, W)
384
+ dt: Tensor of shape (B, T) identifying time deltas.
385
+ Returns:
386
+ Tensor of shape (B, ST, D)
387
+ """
388
+ B, C, T, H, W = x.shape
389
+ x = rearrange(x, "B C T H W -> B (C T) H W")
390
+ x = self.proj(x) # B (C T) H W -> B (C D) HT WT
391
+ x = x.flatten(2, 3) # B (C D) ST
392
+ ST = x.shape[2]
393
+ assert ST == self.num_patches
394
+ x = rearrange(x, "B (C D) ST -> (B C) D ST", B=B, ST=ST, C=C, D=self.embed_dim)
395
+ x = x + self.pos_embed
396
+ x = rearrange(x, "(B C) D ST -> (B ST) C D", B=B, ST=ST, C=C, D=self.embed_dim)
397
+
398
+ # ((B ST) NQ D), ((B ST) C D) -> ((B ST) NQ D)
399
+ x = self.perceiver(self.latent_queries.expand(B * ST, -1, -1), x)
400
+ x = rearrange(
401
+ x,
402
+ "(B ST) NQ D -> B ST (NQ D)",
403
+ B=B,
404
+ ST=self.num_patches,
405
+ NQ=self.num_queries,
406
+ D=self.embed_dim,
407
+ )
408
+ x = self.latent_aggregation(x) # B ST (NQ D) -> B ST D'
409
+
410
+ assert x.shape[1] == self.num_patches
411
+ assert x.shape[2] == self.embed_dim
412
+
413
+ x = self.pos_drop(x)
414
+
415
+ return x
416
+
417
+
418
+ class PerceiverDecoder(nn.Module):
419
+ def __init__(
420
+ self,
421
+ embed_dim: int,
422
+ patch_size: int,
423
+ out_chans: int,
424
+ ):
425
+ """
426
+ Args:
427
+ embed_dim: embedding dimension
428
+ patch_size: patch size
429
+ out_chans: number of output channels. This determines the number of latent queries.
431
+ """
432
+ super().__init__()
433
+
434
+ self.embed_dim = embed_dim
435
+ self.patch_size = patch_size
436
+ self.out_chans = out_chans
437
+
438
+ self.latent_queries = nn.Parameter(torch.zeros(1, out_chans, embed_dim))
439
+ trunc_normal_(self.latent_queries, std=0.02)
440
+
441
+ self.perceiver = PerceiverResampler(
442
+ latent_dim=embed_dim,
443
+ context_dim=embed_dim,
444
+ depth=1,
445
+ head_dim=embed_dim // 16,
446
+ num_heads=16,
447
+ mlp_ratio=4.0,
448
+ drop=0.0,
449
+ residual_latent=False,
450
+ ln_eps=1e-5,
451
+ )
452
+ self.proj = nn.Conv2d(
453
+ in_channels=out_chans * embed_dim,
454
+ out_channels=out_chans * patch_size**2,
455
+ kernel_size=1,
456
+ padding=0,
457
+ groups=out_chans,
458
+ )
459
+ self.pixel_shuffle = nn.PixelShuffle(patch_size)
460
+
461
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
462
+ """
463
+ Args:
464
+ x: Tensor of shape (B, L, D) For ensembles, we have implicitly B = (B E).
465
+ Returns:
466
+ Tensor of shape (B C H W).
467
+ Here
468
+ - C equals out_chans
469
+ - H == W == sqrt(L) x patch_size
470
+ """
471
+ B, L, D = x.shape
472
+ H_token = W_token = int(L**0.5)
473
+
474
+ x = rearrange(x, "B L D -> (B L) 1 D")
475
+ # (B L) 1 D -> (B L) C D
476
+ x = self.perceiver(self.latent_queries.expand(B * L, -1, -1), x)
477
+ x = rearrange(x, "(B H W) C D -> B (C D) H W", H=H_token, W=W_token)
478
+ # B (C D) H_token W_token -> B (C patch_size patch_size) H_token W_token
479
+ x = self.proj(x)
480
+ # B (C patch_size patch_size) H_token W_token -> B C H W
481
+ x = self.pixel_shuffle(x)
482
+
483
+ return x
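
Taken together, PerceiverChannelEmbedding compresses each channel's patch tokens into a small set of latent queries, and PerceiverDecoder reverses the process per token before a grouped convolution and pixel shuffle restore the image. The following is a minimal shape-check sketch, assuming the classes above are importable from surya.models.embedding (as they are imported elsewhere in this commit); the sizes are deliberately small and are not the released model's configuration.

import torch
from surya.models.embedding import PerceiverChannelEmbedding, PerceiverDecoder

# Small, illustrative sizes; the real model uses much larger values.
B, C, T, H, W = 2, 13, 2, 64, 64
patch, dim = 16, 128

embed = PerceiverChannelEmbedding(
    in_chans=C, img_size=H, patch_size=patch, time_dim=T,
    num_queries=4, embed_dim=dim, drop_rate=0.0,
)
decode = PerceiverDecoder(embed_dim=dim, patch_size=patch, out_chans=C)

x = torch.randn(B, C, T, H, W)
dt = torch.zeros(B, T)   # time deltas; not used by this embedding's forward pass
tokens = embed(x, dt)    # (B, (H // patch) * (W // patch), dim)
y = decode(tokens)       # (B, C, H, W)
print(tokens.shape, y.shape)
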
surya/models/flow.py ADDED
@@ -0,0 +1,81 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ class HelioFlowModel(nn.Module):
7
+ def __init__(self, img_size=(4096, 4096), use_latitude_in_learned_flow=False):
8
+ super().__init__()
9
+
10
+ self.use_latitude_in_learned_flow = use_latitude_in_learned_flow
11
+
12
+ u = torch.linspace(-1, 1, img_size[0])
13
+ v = torch.linspace(-1, 1, img_size[1])
14
+ u, v = torch.meshgrid(u, v, indexing="xy")
15
+ self.register_buffer(
16
+ "grid", torch.stack((u, v), dim=2).view(1, *img_size, 2)
17
+ ) # B, H, W, 2
18
+
19
+ # Higher modes can be used for explicit feature engineering for flow features.
20
+ if self.use_latitude_in_learned_flow:
21
+ higher_modes = [u, v, torch.ones_like(u)]
22
+ else:
23
+ higher_modes = [
24
+ u,
25
+ v,
26
+ ]
27
+ self.register_buffer(
28
+ "higher_modes", torch.stack(higher_modes, dim=2).view(1, *img_size, -1)
29
+ )
30
+
31
+ self.flow_generator = nn.Sequential(
32
+ nn.Linear(self.higher_modes.shape[3], 128),
33
+ nn.GELU(),
34
+ nn.Linear(128, 2),
35
+ )
36
+
37
+ def forward(self, batch):
38
+ """
39
+ Args:
40
+ batch: Dictionary containing keys `ts` and
41
+ `forecast_latitude` (optionally).
42
+ ts (torch.Tensor): B, C, T, H, W
43
+ forecast_latitude (torch.Tensor): B, L
44
+ B - Batch size, C - Channels, T - Input times, H - Image height,
45
+ W - Image width, L - Lead time.
46
+ """
47
+
48
+ x = batch["ts"]
49
+ B, C, T, H, W = x.shape
50
+ if T == 1:
51
+ x = x[:, :, -1, :, :]
52
+ else:
53
+ # Taking the average of the last two time stamps
54
+ x = (x[:, :, -1, :, :] + x[:, :, -2, :, :]) / 2
55
+
56
+ # Flow fields have the shape B, H_out, W_out, 2
57
+ if self.use_latitude_in_learned_flow:
58
+ broadcast_lat = batch["forecast_latitude"] / 7
59
+ broadcast_lat = torch.concatenate(
60
+ [
61
+ torch.ones_like(broadcast_lat),
62
+ torch.ones_like(broadcast_lat),
63
+ broadcast_lat,
64
+ ],
65
+ 1,
66
+ )[:, None, None, :]
67
+ higher_modes = self.higher_modes * broadcast_lat
68
+ flow_field = self.grid + self.flow_generator(higher_modes)
69
+ else:
70
+ flow_field = self.grid + self.flow_generator(self.higher_modes)
71
+ flow_field = flow_field.expand(B, H, W, 2)
72
+
73
+ y_hat = F.grid_sample(
74
+ x,
75
+ flow_field,
76
+ mode="bilinear",
77
+ padding_mode="border", # Possible values: zeros, border, or reflection.
78
+ align_corners=False,
79
+ )
80
+
81
+ return y_hat
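
HelioFlowModel advects the most recent input frame with a learned flow field through F.grid_sample. Below is a standalone sketch of just that warping step with an identity grid, using the same normalized-coordinate and padding conventions as the model; sizes are illustrative only.

import torch
import torch.nn.functional as F

B, C, H, W = 1, 3, 32, 32
x = torch.randn(B, C, H, W)

# Identity sampling grid in normalized [-1, 1] coordinates, as in HelioFlowModel.
u = torch.linspace(-1, 1, H)
v = torch.linspace(-1, 1, W)
u, v = torch.meshgrid(u, v, indexing="xy")
grid = torch.stack((u, v), dim=2).unsqueeze(0)  # (1, H, W, 2)

y = F.grid_sample(
    x, grid.expand(B, H, W, 2),
    mode="bilinear", padding_mode="border", align_corners=False,
)
# With align_corners=False the grid points do not land exactly on pixel centres,
# so this "identity" warp reproduces the input only approximately.
print((y - x).abs().max())
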
surya/models/helio_spectformer.py ADDED
@@ -0,0 +1,318 @@
1
+ import torch
2
+ from einops import rearrange
3
+ from torch import nn
4
+
5
+ import numpy as np
6
+
7
+ from .spectformer import SpectFormer, BlockSpectralGating, BlockAttention
8
+ from .embedding import (
9
+ LinearEmbedding,
10
+ PatchEmbed3D,
11
+ PerceiverChannelEmbedding,
12
+ LinearDecoder,
13
+ PerceiverDecoder,
14
+ )
15
+ from .flow import HelioFlowModel
16
+
17
+
18
+ class HelioSpectFormer(nn.Module):
19
+ """
20
+ A note on the ensemble capability:
21
+ Ensembles of size E are generated by setting `ensemble=E`. In this case, the forward
22
+ pass generates ensemble members after tokenization by increasing the batch dimension
23
+ B to B x E. Noise is injected in the `self.backbone` SpectFormer blocks. After the
24
+ backbone, ensemble members ride along implicitly in the batch dimension. (This is
25
+ mainly through the `self.unembed` pass.) An explicit ensemble dimension is only
26
+ generated at the end.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ img_size: int,
32
+ patch_size: int,
33
+ in_chans: int,
34
+ embed_dim: int,
35
+ time_embedding: dict,
36
+ depth: int,
37
+ n_spectral_blocks: int,
38
+ num_heads: int,
39
+ mlp_ratio: float,
40
+ drop_rate: float,
41
+ window_size: int,
42
+ dp_rank: int,
43
+ learned_flow: bool = False,
44
+ use_latitude_in_learned_flow: bool = False,
45
+ init_weights: bool = False,
46
+ checkpoint_layers: list[int] | None = None,
47
+ rpe: bool = False,
48
+ ensemble: int | None = None,
49
+ finetune: bool = True,
50
+ nglo: int = 0,
51
+ dtype: torch.dtype | None = None,
52
+ ) -> None:
53
+ """
54
+ Args:
55
+ img_size: input image size
56
+ patch_size: patch size
57
+ in_chans: number of input channels
58
+ embed_dim: embedding dimension
59
+ time_embedding: dictionary to configure temporal embedding:
60
+ `type` (str, required): indicates embedding type. `linear`, `perceiver`.
61
+ `time_dim` (int): indicates length of time dimension. required for linear embedding.
62
+ `n_queries` (int): indicates number of perceiver queries. required for perceiver.
63
+ depth: number of transformer blocks
64
+ n_spectral_blocks: number of spectral gating blocks
65
+ num_heads: Number of transformer heads
66
+ mlp_ratio: MLP ratio for transformer blocks
67
+ drop_rate: dropout rate
68
+ window_size: window size for long/short attention
69
+ dp_rank: dp rank for long/short attention
70
+ learned_flow: if true, combine learned flow model with spectformer
71
+ use_latitude_in_learned_flow: use latitudes in learned flow
72
+ init_weights: use optimized weight initialization
73
+ checkpoint_layers: indicate which layers to use for checkpointing
74
+ rpe: Use relative position encoding in Long-Short attention blocks.
75
+ ensemble: Integer indicating ensemble size or None for deterministic model.
76
+ finetune: Indicates whether to train from scratch or fine-tune the model. If set to `True`, the final output layers are removed.
77
+ nglo: Number of (additional) global tokens.
78
+ dtype: A torch data type. Not used and added only for compatibility with the remainder of the codebase.
79
+ """
80
+ super().__init__()
81
+
82
+ self.learned_flow = learned_flow
83
+ self.patch_size = patch_size
84
+ self.embed_dim = embed_dim
85
+ self.in_chans = in_chans
86
+ self.time_embedding = time_embedding
87
+ self.ensemble = ensemble
88
+ self.finetune = finetune
89
+ self.nglo = nglo
90
+
91
+ if learned_flow:
92
+ self.learned_flow_model = HelioFlowModel(
93
+ img_size=(img_size, img_size),
94
+ use_latitude_in_learned_flow=use_latitude_in_learned_flow,
95
+ )
96
+
97
+ match time_embedding["type"]:
98
+ case "linear":
99
+ self.time_dim = time_embedding["time_dim"]
100
+ if learned_flow:
101
+ self.time_dim += 1
102
+ self.embedding = LinearEmbedding(
103
+ img_size, patch_size, in_chans, self.time_dim, embed_dim, drop_rate
104
+ )
105
+
106
+ if not self.finetune:
107
+ self.unembed = LinearDecoder(
108
+ patch_size=patch_size, out_chans=in_chans, embed_dim=embed_dim
109
+ )
110
+ case "perceiver":
111
+ self.embedding = PerceiverChannelEmbedding(
112
+ in_chans=in_chans,
113
+ img_size=img_size,
114
+ patch_size=patch_size,
115
+ time_dim=time_embedding["time_dim"],
116
+ num_queries=time_embedding["n_queries"],
117
+ embed_dim=embed_dim,
118
+ drop_rate=drop_rate,
119
+ )
120
+ if not self.finetune:
121
+ self.unembed = PerceiverDecoder(
122
+ embed_dim=embed_dim,
123
+ patch_size=patch_size,
124
+ out_chans=in_chans,
125
+ )
126
+ case _:
127
+ raise NotImplementedError(
128
+ f'Embedding {time_embedding["type"]} has not been implemented.'
129
+ )
130
+
131
+ if isinstance(depth, list):
132
+ raise NotImplementedError(
133
+ "Multi scale models are no longer supported. Depth should be a single integer."
134
+ )
135
+ self.backbone = SpectFormer(
136
+ grid_size=img_size // patch_size,
137
+ embed_dim=embed_dim,
138
+ depth=depth,
139
+ n_spectral_blocks=n_spectral_blocks,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ drop_rate=drop_rate,
143
+ window_size=window_size,
144
+ dp_rank=dp_rank,
145
+ checkpoint_layers=checkpoint_layers,
146
+ rpe=rpe,
147
+ ensemble=ensemble,
148
+ nglo=nglo,
149
+ )
150
+
151
+ if init_weights:
152
+ self.apply(self._init_weights)
153
+
154
+ # @staticmethod
155
+ # def _checkpoint_wrapper(
156
+ # model: nn.Module, data: tuple[Tensor, Tensor | None]
157
+ # ) -> Tensor:
158
+ # return checkpoint(model, data, use_reentrant=False)
159
+
160
+ def _init_weights(self, module):
161
+
162
+ if self.time_embedding["type"] == "linear":
163
+ # sampling_step * embed_dim = patch_size**2 * in_chans * time_dim
164
+ sampling_step = int(
165
+ np.sqrt(
166
+ (self.patch_size**2 * self.in_chans * self.time_dim)
167
+ / self.embed_dim
168
+ )
169
+ )
170
+ else:
171
+ sampling_step = int(
172
+ np.sqrt((self.patch_size**2 * self.in_chans) / self.embed_dim)
173
+ )
174
+ if isinstance(module, PatchEmbed3D):
175
+ torch.nn.init.zeros_(module.proj.weight)
176
+ c_out = 0
177
+ w_pool = 1.0 / sampling_step
178
+ for k in range(self.in_chans * self.time_dim):
179
+ for i in range(0, self.patch_size, sampling_step):
180
+ for j in range(0, self.patch_size, sampling_step):
181
+ module.proj.weight.data[
182
+ c_out, k, i : i + sampling_step, j : j + sampling_step
183
+ ] = w_pool
184
+ c_out += 1
185
+ if module.proj.bias is not None:
186
+ module.proj.bias.data.zero_()
187
+ if isinstance(module, BlockSpectralGating):
188
+ for m in [
189
+ module.mlp.fc1,
190
+ module.mlp.fc2,
191
+ ]:
192
+ # m.weight.data.normal_(mean=0.0, std=0.01)
193
+ # torch.nn.init.eye_(m.weight)
194
+ torch.nn.init.eye_(m.weight)
195
+ if m.bias is not None:
196
+ m.bias.data.zero_()
197
+ if isinstance(module, BlockAttention):
198
+ for m in [
199
+ module.mlp.fc1,
200
+ module.mlp.fc2,
201
+ ]:
202
+ # torch.nn.init.eye_(m.weight)
203
+ torch.nn.init.zeros_(m.weight)
204
+ if m.bias is not None:
205
+ m.bias.data.zero_()
206
+ for m in [
207
+ module.attn.qkv,
208
+ module.attn.proj,
209
+ module.attn.to_dynamic_projection,
210
+ ]:
211
+ # m.weight.data.normal_(mean=0.0, std=0.01)
212
+ # torch.nn.init.eye_(m.weight)
213
+ torch.nn.init.zeros_(m.weight)
214
+ if m.bias is not None:
215
+ m.bias.data.zero_()
216
+ if isinstance(module, torch.nn.Sequential):
217
+ if isinstance(module[1], torch.nn.PixelShuffle):
218
+ # torch.nn.init.eye_(module[0].weight.data[:,:,0,0])
219
+ torch.nn.init.zeros_(module[0].weight)
220
+ if self.time_embedding["type"] == "linear":
221
+ c_out = 0
222
+ for k in range(1, self.in_chans + 1):
223
+ for i in range(
224
+ self.patch_size**2 // (self.patch_size * sampling_step)
225
+ ):
226
+ for j in range(self.patch_size):
227
+ module[0].weight.data[
228
+ c_out : c_out + sampling_step,
229
+ j + (k * self.time_dim - 1) * self.patch_size,
230
+ ] = 1.0
231
+ c_out += sampling_step
232
+ else:
233
+ c_out = 0
234
+ for k in range(2 * self.in_chans):
235
+ # l = 0
236
+ for l_feat in range(self.backbone.embed_dim):
237
+ module[0].weight.data[c_out, l_feat] = 1.0
238
+ c_out += 1
239
+ if module[0].bias is not None:
240
+ module[0].bias.data.zero_()
241
+
242
+ def forward(self, batch):
243
+ """
244
+ Args:
245
+ batch: Dictionary containing keys `ts` and `time_delta_input`.
246
+ Their values are tensors with shapes as follows.
247
+ ts: B, C, T, H, W
248
+ time_delta_input: B, T
249
+ Returns:
250
+ Tensor of shape (B, C, H, W) for deterministic or (B, E, C, H, W) for ensemble forecasts.
251
+ """
252
+ x = batch["ts"]
253
+ dt = batch["time_delta_input"]
254
+ B, C, T, H, W = x.shape
255
+
256
+ if self.learned_flow:
257
+ y_hat_flow = self.learned_flow_model(batch) # B, C, H, W
258
+ if any(
259
+ [param.requires_grad for param in self.learned_flow_model.parameters()]
260
+ ):
261
+ return y_hat_flow
262
+ else:
263
+ x = torch.concat((x, y_hat_flow.unsqueeze(2)), dim=2) # B, C, T+1, H, W
264
+ if self.time_embedding["type"] == "perceiver":
265
+ dt = torch.cat((dt, batch["lead_time_delta"].reshape(-1, 1)), dim=1)
266
+
267
+ # embed the data
268
+ tokens = self.embedding(x, dt)
269
+
270
+ # copy tokens in case of ensemble forecast
271
+ if self.ensemble:
272
+ # B L D -> (B E) L D == BE L D
273
+ tokens = torch.repeat_interleave(tokens, repeats=self.ensemble, dim=0)
274
+
275
+ # pass the time series through the encoder
276
+ tokens = self.backbone(tokens)
277
+
278
+ if self.finetune:
279
+ return tokens
280
+
281
+ # Unembed the tokens
282
+ # BE L D -> BE C H W
283
+ forecast_hat = self.unembed(tokens)
284
+
285
+ assert forecast_hat.shape == (
286
+ B * self.ensemble if self.ensemble else B,
287
+ C,
288
+ H,
289
+ W,
290
+ ), f"forecast_hat has shape {forecast_hat.shape} yet expected {(B*self.ensemble if self.ensemble else B, C, H, W)}."
291
+
292
+ if self.learned_flow:
293
+ assert y_hat_flow.shape == (
294
+ B,
295
+ C,
296
+ H,
297
+ W,
298
+ ), f"y_hat_flow has shape {y_hat_flow.shape} yet expected {(B, C, H, W)}."
299
+ if self.ensemble:
300
+ y_hat_flow = torch.repeat_interleave(
301
+ y_hat_flow, repeats=self.ensemble, dim=0
302
+ )
303
+ assert y_hat_flow.shape == forecast_hat.shape
304
+ forecast_hat = forecast_hat + y_hat_flow
305
+
306
+ assert forecast_hat.shape == (
307
+ B * self.ensemble if self.ensemble else B,
308
+ C,
309
+ H,
310
+ W,
311
+ ), f"forecast_hat has shape {forecast_hat.shape} yet expected {(B*self.ensemble if self.ensemble else B, C, H, W)}."
312
+
313
+ if self.ensemble:
314
+ forecast_hat = rearrange(
315
+ forecast_hat, "(B E) C H W -> B E C H W", B=B, E=self.ensemble
316
+ )
317
+
318
+ return forecast_hat
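
A smoke-test sketch of building and calling the model follows. The hyperparameters are placeholders chosen only so that the shapes work out; the released checkpoint's configuration comes from data/Surya-1.0/config.yaml and differs from these values. The sketch also assumes the sibling embedding module provides LinearEmbedding and LinearDecoder as imported above.

import torch
from surya.models.helio_spectformer import HelioSpectFormer

# Toy configuration for a quick forward pass; not the published Surya-1.0 settings.
model = HelioSpectFormer(
    img_size=128, patch_size=16, in_chans=13, embed_dim=256,
    time_embedding={"type": "linear", "time_dim": 2},
    depth=4, n_spectral_blocks=2, num_heads=8, mlp_ratio=4.0,
    drop_rate=0.0, window_size=2, dp_rank=2, finetune=False,
)
batch = {
    "ts": torch.randn(1, 13, 2, 128, 128),
    "time_delta_input": torch.zeros(1, 2),
}
with torch.no_grad():
    y_hat = model(batch)   # (B, C, H, W) for the deterministic model
print(y_hat.shape)
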
surya/models/spectformer.py ADDED
@@ -0,0 +1,305 @@
1
+ import math
2
+ import logging
3
+ from itertools import chain
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.utils.checkpoint import checkpoint
8
+
9
+ from timm.models.layers import DropPath, trunc_normal_
10
+ import torch.fft
11
+
12
+ from .transformer_ls import AttentionLS
13
+
14
+ _logger = logging.getLogger(__name__)
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features,
21
+ hidden_features=None,
22
+ out_features=None,
23
+ act_layer=nn.GELU,
24
+ drop=0.0,
25
+ ):
26
+ super().__init__()
27
+ out_features = out_features or in_features
28
+ hidden_features = hidden_features or in_features
29
+ self.fc1 = nn.Linear(in_features, hidden_features)
30
+ self.act = act_layer()
31
+ self.fc2 = nn.Linear(hidden_features, out_features)
32
+ self.drop = nn.Dropout(drop)
33
+
34
+ def forward(self, x):
35
+ x = self.fc1(x)
36
+ x = self.act(x)
37
+ x = self.drop(x)
38
+ x = self.fc2(x)
39
+ x = self.drop(x)
40
+ return x
41
+
42
+
43
+ class SpectralGatingNetwork(nn.Module):
44
+ def __init__(self, dim, h=14, w=8):
45
+ super().__init__()
46
+ self.complex_weight = nn.Parameter(torch.randn(h, w, dim, 2) * 0.02)
47
+ self.w = w
48
+ self.h = h
49
+
50
+ def forward(self, x, spatial_size=None):
51
+ B, N, C = x.shape # torch.Size([1, 262144, 1024])
52
+ if spatial_size is None:
53
+ a = b = int(math.sqrt(N)) # a=b=512
54
+ else:
55
+ a, b = spatial_size
56
+
57
+ x = x.view(B, a, b, C) # torch.Size([1, 512, 512, 1024])
58
+
59
+ # FROM HERE USED TO BE AUTOCAST to float32
60
+ dtype = x.dtype
61
+ x = x.to(torch.float32)
62
+ x = torch.fft.rfft2(
63
+ x, dim=(1, 2), norm="ortho"
64
+ ) # torch.Size([1, 512, 257, 1024])
65
+ weight = torch.view_as_complex(
66
+ self.complex_weight.to(torch.float32)
67
+ ) # torch.Size([512, 257, 1024])
68
+ x = x * weight
69
+ x = torch.fft.irfft2(
70
+ x, s=(a, b), dim=(1, 2), norm="ortho"
71
+ ) # torch.Size([1, 512, 512, 1024])
72
+ x = x.to(dtype)
73
+
74
+ x = x.reshape(B, N, C) # torch.Size([1, 262144, 1024])
75
+ # UP TO HERE USED TO BE AUTOCAST to float32
76
+
77
+ return x
78
+
79
+
80
+ class BlockSpectralGating(nn.Module):
81
+ def __init__(
82
+ self,
83
+ dim,
84
+ mlp_ratio=4.0,
85
+ drop=0.0,
86
+ drop_path=0.0,
87
+ act_layer=nn.GELU,
88
+ norm_layer=nn.LayerNorm,
89
+ h=14,
90
+ w=8,
91
+ ):
92
+ super().__init__()
93
+ self.norm1 = norm_layer(dim)
94
+ self.filter = SpectralGatingNetwork(dim, h=h, w=w)
95
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
96
+ self.norm2 = norm_layer(dim)
97
+ mlp_hidden_dim = int(dim * mlp_ratio)
98
+ self.mlp = Mlp(
99
+ in_features=dim,
100
+ hidden_features=mlp_hidden_dim,
101
+ act_layer=act_layer,
102
+ drop=drop,
103
+ )
104
+
105
+ def forward(self, x, *args):
106
+ x = x + self.drop_path(self.mlp(self.norm2(self.filter(self.norm1(x)))))
107
+ return x
108
+
109
+
110
+ class BlockAttention(nn.Module):
111
+ def __init__(
112
+ self,
113
+ dim,
114
+ num_heads: int = 8,
115
+ mlp_ratio=4.0,
116
+ drop=0.0,
117
+ drop_path=0.0,
118
+ w=2,
119
+ dp_rank=2,
120
+ act_layer=nn.GELU,
121
+ norm_layer=nn.LayerNorm,
122
+ rpe=False,
123
+ adaLN=False,
124
+ nglo=0,
125
+ ):
126
+ """
127
+ num_heads: Attention heads. 4 for tiny, 8 for small and 12 for base
128
+ """
129
+ super().__init__()
130
+ self.norm1 = norm_layer(dim)
131
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
132
+ self.norm2 = norm_layer(dim)
133
+ mlp_hidden_dim = int(dim * mlp_ratio)
134
+ self.mlp = Mlp(
135
+ in_features=dim,
136
+ hidden_features=mlp_hidden_dim,
137
+ act_layer=act_layer,
138
+ drop=drop,
139
+ )
140
+ self.attn = AttentionLS(
141
+ dim=dim,
142
+ num_heads=num_heads,
143
+ w=w,
144
+ dp_rank=dp_rank,
145
+ nglo=nglo,
146
+ rpe=rpe,
147
+ )
148
+
149
+ if adaLN:
150
+ self.adaLN_modulation = nn.Sequential(
151
+ nn.Linear(dim, dim, bias=True),
152
+ act_layer(),
153
+ nn.Linear(dim, 6 * dim, bias=True),
154
+ )
155
+ else:
156
+ self.adaLN_modulation = None
157
+
158
+ def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
159
+ if self.adaLN_modulation is not None:
160
+ (
161
+ shift_mha,
162
+ scale_mha,
163
+ gate_mha,
164
+ shift_mlp,
165
+ scale_mlp,
166
+ gate_mlp,
167
+ ) = self.adaLN_modulation(c).chunk(6, dim=2)
168
+ else:
169
+ shift_mha, scale_mha, gate_mha, shift_mlp, scale_mlp, gate_mlp = 6 * (1.0,)
170
+
171
+ x = x + gate_mha * self.drop_path(
172
+ self.attn(
173
+ self.norm1(x) * scale_mha + shift_mha,
174
+ )
175
+ )
176
+ x = x + gate_mlp * self.drop_path(
177
+ self.mlp(self.norm2(x) * scale_mlp + shift_mlp)
178
+ )
179
+
180
+ return x
181
+
182
+
183
+ class SpectFormer(nn.Module):
184
+ def __init__(
185
+ self,
186
+ grid_size: int = 224 // 16,
187
+ embed_dim=768,
188
+ depth=12,
189
+ n_spectral_blocks=4,
190
+ num_heads: int = 8,
191
+ mlp_ratio=4.0,
192
+ uniform_drop=False,
193
+ drop_rate=0.0,
194
+ drop_path_rate=0.0,
195
+ window_size=2,
196
+ dp_rank=2,
197
+ norm_layer=nn.LayerNorm,
198
+ checkpoint_layers: list[int] | None = None,
199
+ rpe=False,
200
+ ensemble: int | None = None,
201
+ nglo: int = 0,
202
+ ):
203
+ """
204
+ Args:
205
+ grid_size (int): size of the token grid, i.e. input image size divided by patch size
206
+ num_heads (int): number of attention heads
207
+ embed_dim (int): embedding dimension
208
+ depth (int): depth of transformer
209
+ n_spectral_blocks (int): number of spectral gating blocks
210
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
211
+ uniform_drop (bool): true for uniform, false for linearly increasing drop path probability.
212
+ drop_rate (float): dropout rate
213
+ drop_path_rate (float): drop path (stochastic depth) rate
214
+ window_size: window size for long/short attention
215
+ dp_rank: dp rank for long/short attention
216
+ norm_layer: (nn.Module): normalization layer for attention blocks
217
+ checkpoint_layers: indicate which layers to use for checkpointing
218
+ rpe: Use relative position encoding in Long-Short attention blocks.
219
+ ensemble: Integer indicating ensemble size or None for deterministic model.
220
+ nglo: Number of (additional) global tokens.
221
+ """
222
+ super().__init__()
223
+ self.embed_dim = embed_dim
224
+ self.n_spectral_blocks = n_spectral_blocks
225
+ self._checkpoint_layers = checkpoint_layers or []
226
+ self.ensemble = ensemble
227
+ self.nglo = nglo
228
+
229
+ h = grid_size
230
+ w = h // 2 + 1
231
+
232
+ if uniform_drop:
233
+ _logger.info(f"Using uniform droppath with expect rate {drop_path_rate}.")
234
+ dpr = [drop_path_rate for _ in range(depth)]
235
+ else:
236
+ _logger.info(
237
+ f"Using linear droppath with expect rate {drop_path_rate * 0.5}."
238
+ )
239
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
240
+
241
+ self.blocks_spectral_gating = nn.ModuleList()
242
+ self.blocks_attention = nn.ModuleList()
243
+ for i in range(depth):
244
+ if i < n_spectral_blocks:
245
+ layer = BlockSpectralGating(
246
+ dim=embed_dim,
247
+ mlp_ratio=mlp_ratio,
248
+ drop=drop_rate,
249
+ drop_path=dpr[i],
250
+ norm_layer=norm_layer,
251
+ h=h,
252
+ w=w,
253
+ )
254
+ self.blocks_spectral_gating.append(layer)
255
+ else:
256
+ layer = BlockAttention(
257
+ dim=embed_dim,
258
+ num_heads=num_heads,
259
+ mlp_ratio=mlp_ratio,
260
+ drop=drop_rate,
261
+ drop_path=dpr[i],
262
+ norm_layer=norm_layer,
263
+ w=window_size,
264
+ dp_rank=dp_rank,
265
+ rpe=rpe,
266
+ adaLN=True if ensemble is not None else False,
267
+ nglo=nglo,
268
+ )
269
+ self.blocks_attention.append(layer)
270
+
271
+ self.apply(self._init_weights)
272
+
273
+ def forward(self, tokens: torch.Tensor) -> torch.Tensor:
274
+ """
275
+ Args:
276
+ tokens: Tensor of shape B, N, C for deterministic or BxE, N, C for ensemble forecasts.
277
+ Returns:
278
+ Tensor of same shape as input.
279
+ """
280
+ if self.ensemble:
281
+ BE, N, C = tokens.shape
282
+ noise = torch.randn(
283
+ size=(BE, N, C), dtype=tokens.dtype, device=tokens.device
284
+ )
285
+ else:
286
+ noise = None
287
+
288
+ for i, blk in enumerate(
289
+ chain(self.blocks_spectral_gating, self.blocks_attention)
290
+ ):
291
+ if i in self._checkpoint_layers:
292
+ tokens = checkpoint(blk, tokens, noise, use_reentrant=False)
293
+ else:
294
+ tokens = blk(tokens, noise)
295
+
296
+ return tokens
297
+
298
+ def _init_weights(self, m):
299
+ if isinstance(m, nn.Linear):
300
+ trunc_normal_(m.weight, std=0.02)
301
+ if isinstance(m, nn.Linear) and m.bias is not None:
302
+ nn.init.constant_(m.bias, 0)
303
+ elif isinstance(m, nn.LayerNorm):
304
+ nn.init.constant_(m.bias, 0)
305
+ nn.init.constant_(m.weight, 1.0)
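
The spectral gating blocks filter tokens in the 2-D Fourier domain: tokens are reshaped onto the spatial grid, transformed with rfft2, multiplied by a learned complex weight per frequency and channel, and transformed back. A self-contained sketch of that filtering step with illustrative sizes:

import math
import torch

B, N, C = 2, 64, 32          # 64 tokens form an 8 x 8 grid
h = int(math.sqrt(N))        # 8
w = h // 2 + 1               # 5 one-sided frequencies returned by rfft2
x = torch.randn(B, N, C)

# Learned complex filter, analogous to self.complex_weight above.
weight = torch.view_as_complex(torch.randn(h, w, C, 2) * 0.02)

grid = x.view(B, h, h, C)
spec = torch.fft.rfft2(grid, dim=(1, 2), norm="ortho")   # (B, h, w, C), complex
spec = spec * weight                                      # per-frequency gating
out = torch.fft.irfft2(spec, s=(h, h), dim=(1, 2), norm="ortho").reshape(B, N, C)
print(out.shape)   # torch.Size([2, 64, 32])
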
surya/models/transformer_ls.py ADDED
@@ -0,0 +1,369 @@
1
+ # Copyright (c) 2021 NVIDIA CORPORATION. Licensed under the MIT license.
2
+ # Written by Chen Zhu during an internship at NVIDIA, [email protected]
3
+ import math
4
+
5
+ from torch import nn
6
+ import torch
7
+ from timm.models.layers import trunc_normal_
8
+ import torch.nn.functional as F
9
+
10
+
11
+ class AttentionLS(nn.Module):
12
+ """Implementation for long-short term attention.
13
+ Flexible options for using window attention, global token and dynamic projection.
14
+
15
+ Args:
16
+ dim: input and output feature dimension.
17
+ num_heads: number of attention heads.
18
+ qkv_bias: whether to use bias for the projection of query, key and values.
19
+ qk_scale: scale factor on query and key for numerical stability.
20
+ By default, set to square root of head dimensions.
21
+ attn_drop: dropout probability for attention matrix.
22
+ proj_drop: dropout probability for the final output.
23
+ rpe: whether to use relative position encoding.
24
+ nglo: number of global tokens (e.g., CLS).
25
+
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ dim,
31
+ num_heads=8,
32
+ qkv_bias=False,
33
+ qk_scale=None,
34
+ attn_drop=0.0,
35
+ proj_drop=0.0,
36
+ rpe=False,
37
+ nglo=1,
38
+ dp_rank=2,
39
+ w=2,
40
+ ):
41
+ super().__init__()
42
+ self.num_heads = num_heads
43
+ head_dim = dim // num_heads
44
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
45
+ self.scale = qk_scale or head_dim**-0.5
46
+
47
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
48
+ self.attn_drop = nn.Dropout(attn_drop)
49
+ self.proj = nn.Linear(dim, dim)
50
+ self.proj_drop = nn.Dropout(proj_drop)
51
+ self.nglo = nglo
52
+
53
+ # Equals to segment size (w) in the paper.
54
+ self.window_size = w
55
+ # Equals to r in the paper.
56
+ self.dp_rank = dp_rank
57
+
58
+ if self.dp_rank > 0:
59
+ self.to_dynamic_projection = nn.Linear(dim, dp_rank * num_heads)
60
+ # The LN of DualLN corresponding to dynamic projection
61
+ self.dual_ln_dp = nn.LayerNorm(dim)
62
+ # The LN of DualLN corresponding to all the tokens
63
+ self.dual_ln_full = nn.LayerNorm(dim)
64
+
65
+ # Adapted from ViL: https://github.com/microsoft/vision-longformer/blob/main/src/models/layers/longformer2d.py#L55-L100
66
+ # We only add RPE to window attention.
67
+ # Unnecessary to add bias for global tokens, since DualLN already adds biases.
68
+ self.rpe = rpe
69
+ if rpe:
70
+ # handle the border conditions...
71
+ w_pad = int(w * 0.5)
72
+ self.local_relative_position_bias_table = nn.Parameter(
73
+ torch.zeros(2 * (w + w_pad - 1) * (2 * w_pad + w + 1) + 1, num_heads)
74
+ )
75
+ trunc_normal_(self.local_relative_position_bias_table, std=0.02)
76
+
77
+ # get pair-wise relative position index
78
+ coords_h = torch.arange(-w_pad, w_pad + w)
79
+ coords_w = torch.arange(-w_pad, w_pad + w)
80
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, 2w, 2w
81
+ coords = (
82
+ coords.view(2, (w + w_pad * 2) ** 2).transpose(0, 1).unsqueeze(0)
83
+ ) # 1, 4w**2, 2
84
+ q_coords_hw = torch.arange(0, w)
85
+ q_coords = torch.stack(
86
+ torch.meshgrid([q_coords_hw, q_coords_hw])
87
+ ) # 2, w, w
88
+ q_coords = q_coords.view(2, w**2).transpose(0, 1).unsqueeze(1) # w**2, 1, 2
89
+ relative_coords = q_coords - coords
90
+ relative_coords += w_pad + w - 1 # shift to start from 0
91
+ relative_coords[:, :, 0] *= 2 * w_pad + w
92
+ relative_position_index = relative_coords.sum(-1) # w^2, 4w^2
93
+ self.register_buffer("relative_position_index", relative_position_index)
94
+
95
+ def forward(self, x, nx=None, ny=None):
96
+ B, N, C = x.shape
97
+ N_feat = N - self.nglo
98
+ self.img_size = int(math.sqrt(N)) if nx is None else nx
99
+ qkv = self.qkv(x)
100
+ # query, key, value
101
+ q, k, v = qkv.chunk(3, dim=2)
102
+ q = q.mul(self.scale)
103
+
104
+ # Layer norm on the projected keys and values
105
+ k = self.dual_ln_full(k)
106
+ v = self.dual_ln_full(v)
107
+
108
+ # output size: bsz x n_heads x seqlen x d
109
+ if self.nglo > 0:
110
+ q_cls, q = q[:, : self.nglo], q[:, self.nglo :]
111
+ k_cls, k = k[:, : self.nglo], k[:, self.nglo :]
112
+ v_cls, v = v[:, : self.nglo], v[:, self.nglo :]
113
+
114
+ q_cls = q_cls.reshape(
115
+ B, self.nglo, self.num_heads, C // self.num_heads
116
+ ).transpose(1, 2)
117
+ k_cls = k_cls.reshape(
118
+ B, self.nglo, self.num_heads, C // self.num_heads
119
+ ).transpose(1, 2)
120
+ v_cls = v_cls.reshape(
121
+ B, self.nglo, self.num_heads, C // self.num_heads
122
+ ).transpose(1, 2)
123
+
124
+ q = q.reshape(B, N_feat, self.num_heads, C // self.num_heads).transpose(1, 2)
125
+ k = k.reshape(B, N_feat, self.num_heads, C // self.num_heads).transpose(1, 2)
126
+ v = v.reshape(B, N_feat, self.num_heads, C // self.num_heads).transpose(1, 2)
127
+
128
+ # Long-range Attention (Dynamic Projection)
129
+ if self.dp_rank > 0:
130
+ # b x h x r x (l w)
131
+ # Compute the projection matrix (P_i in the paper)
132
+ c_scores = (
133
+ self.to_dynamic_projection(x[:, self.nglo :])
134
+ .transpose(1, 2)
135
+ .contiguous()
136
+ .view(B, self.num_heads, self.dp_rank, -1)
137
+ )
138
+ # c_scores = c_scores.softmax(dim=-1, dtype=torch.float32).to(x)
139
+ c_scores = c_scores.softmax(dim=-1).to(
140
+ x
141
+ ) # Changed when experimenting with mixed precision (Johannes S.)
142
+ # b x h x r x d
143
+ k_lms = c_scores.matmul(k)
144
+ k_lms = k_lms.transpose(1, 2).contiguous().view(B, self.dp_rank, -1)
145
+ k_lms = (
146
+ self.dual_ln_dp(k_lms)
147
+ .view(B, self.dp_rank, self.num_heads, -1)
148
+ .contiguous()
149
+ .permute(0, 2, 3, 1)
150
+ )
151
+ # b x h x (lw) x r
152
+ dots_all = q.matmul(k_lms)
153
+
154
+ if self.window_size > 0:
155
+ # Switch the order of dimensions if using window attention.
156
+ dots_all = self.group_dots(dots_all)
157
+ else:
158
+ dots_all = None
159
+
160
+ # Short-term Attention (Window Attention)
161
+ # In our window attention, each token attends to at most (4w^2) tokens.
162
+ if self.window_size > 0:
163
+ dots_win = self.compute_window_scores(q, k)
164
+ w2 = int(self.window_size * self.window_size)
165
+
166
+ if self.rpe:
167
+ w_pad = int(0.5 * self.window_size)
168
+ local_relative_position_bias = self.local_relative_position_bias_table[
169
+ self.relative_position_index.view(-1)
170
+ ].view(
171
+ 1, w2, (w_pad * 2 + self.window_size) ** 2, -1
172
+ ) # w^2, kv_nums,H
173
+ local_relative_position_bias = (
174
+ local_relative_position_bias.permute(0, 3, 1, 2)
175
+ .expand(B, -1, -1, -1)
176
+ .unsqueeze(2)
177
+ .unsqueeze(2)
178
+ )
179
+
180
+ dots_win += local_relative_position_bias
181
+ if dots_all is None:
182
+ dots_all = dots_win
183
+ else:
184
+ dots_all = torch.cat([dots_all, dots_win], dim=-1)
185
+
186
+ # Global token.
187
+ if self.nglo > 0:
188
+ # and compute the scores of queries on CLS
189
+ dots_q_cls = q.matmul(k_cls.transpose(-1, -2))
190
+
191
+ if self.window_size > 0:
192
+ dots_q_cls = self.group_dots(dots_q_cls)
193
+ dots_all = torch.cat([dots_all, dots_q_cls], dim=-1)
194
+
195
+ # attn = dots_all.softmax(dim=-1, dtype=torch.float32).to(x)
196
+ attn = dots_all.softmax(dim=-1).to(
197
+ x
198
+ ) # Changed when experimenting with mixed precision (Johannes S.)
199
+ attn = self.attn_drop(attn)
200
+ out = 0
201
+ if self.window_size > 0:
202
+ offset = max(0, self.dp_rank)
203
+ kv_group_size = self.window_size
204
+ total_win_size = max(1, self.window_size // 2) * 2 + kv_group_size
205
+ attn_win = attn[:, :, :, :, :, offset : offset + total_win_size**2]
206
+ out += self.compute_window_pv(attn_win, v)
207
+ attn = self.ungroup_dots(attn)
208
+
209
+ # attn will be b x h x lw x n_k from now on
210
+ if self.dp_rank > 0:
211
+ attn_lm = attn[:, :, :, : self.dp_rank]
212
+ v_lms = (
213
+ # c_scores.matmul(v.float())
214
+ c_scores.matmul(
215
+ v
216
+ ) # Changed when experimenting with mixed precision (Johannes S.)
217
+ .to(v)
218
+ .transpose(1, 2)
219
+ .contiguous()
220
+ .view(B, self.dp_rank, -1)
221
+ )
222
+ v_lms = (
223
+ self.dual_ln_dp(v_lms)
224
+ .view(B, self.dp_rank, self.num_heads, -1)
225
+ .contiguous()
226
+ .transpose(1, 2)
227
+ )
228
+
229
+ out += attn_lm.matmul(v_lms)
230
+
231
+ if self.nglo > 0:
232
+ attn_cls = attn[:, :, :, -self.nglo :]
233
+ out += attn_cls.matmul(
234
+ v_cls
235
+ ) # Changed. Was `.mul` instead of `.matmul`. (JWS)
236
+
237
+ # b x h x 1 x lw
238
+ cls_inner = q_cls.matmul(k_cls.transpose(-1, -2))
239
+ cls_dots = q_cls.matmul(
240
+ k.transpose(-1, -2)
241
+ ) # Changed. Was `out` instead of `k`. (JWS)
242
+ cls_dots = torch.cat([cls_inner, cls_dots], dim=-1)
243
+
244
+ # cls_dots = cls_dots.softmax(dim=-1, dtype=torch.float32).to(x)
245
+ cls_dots = cls_dots.softmax(dim=-1).to(
246
+ x
247
+ ) # Changed when experimenting with mixed precision (Johannes S.)
248
+ cls_next = cls_dots[:, :, :, self.nglo :].matmul(
249
+ v
250
+ ) # the post_cls variant # Changed. Was `out` instead of `v`. (JWS)
251
+ cls_next += cls_dots[:, :, :, : self.nglo].matmul(v_cls)
252
+
253
+ out = torch.cat([cls_next, out], dim=2)
254
+ out = out.transpose(1, 2).contiguous().view(B, N, -1)
255
+
256
+ # x = (attn @ v).transpose(1, 2).reshape(B, N, C)
257
+ out = self.proj(out)
258
+ out = self.proj_drop(out)
259
+ return out
260
+
261
+ def compute_window_scores(self, q, k):
262
+ """Compute the inner products for the window attention.
263
+ First, divide the query into non-overlapping windows.
264
+ Then, use torch.as_strided (implemented in self.get_overlapping_tiles) to create a view of the keys
265
+ that corresponds to the windows with at most 2x memory overhead.
266
+ Finally, compute the inner product.
267
+ """
268
+ # q: b h (l w) d
269
+ b, h, _, d = q.shape
270
+ side_size = max(self.window_size // 2, 1)
271
+ # q_group_size: segment size
272
+ kv_width = 2 * side_size + self.window_size # assuming q_stride=1
273
+ q_n_group = self.img_size // self.window_size
274
+ q_tiles = q.reshape(
275
+ b, h, q_n_group, self.window_size, q_n_group, self.window_size, d
276
+ ).permute(0, 1, 2, 4, 3, 5, 6)
277
+ # q_tiles: b x h x n_group x n_group x w^2 x d
278
+ q_tiles = q_tiles.contiguous().view(b, h, q_n_group, q_n_group, -1, d)
279
+
280
+ # k_tiles: b x h x n_group x n_group x 9w^2 x d
281
+ k_tiles = (
282
+ self.get_overlapping_tiles(k)
283
+ .contiguous()
284
+ .view(b, h, q_n_group, q_n_group, -1, d)
285
+ )
286
+ # dot_tiles: b x h x n_group x n_group x w^2 x 9w^2
287
+ dot_tiles = q_tiles.matmul(k_tiles.transpose(-1, -2))
288
+
289
+ # fill "-inf" into the zero-padding parts
290
+ dot_tiles = dot_tiles.view(b, h, q_n_group, q_n_group, -1, kv_width, kv_width)
291
+
292
+ dot_tiles[:, :, 0, :, :, :side_size].fill_(float("-inf"))
293
+ dot_tiles[:, :, -1, :, :, -side_size:].fill_(float("-inf"))
294
+ dot_tiles[:, :, :, 0, :, :, :side_size].fill_(float("-inf"))
295
+ dot_tiles[:, :, :, -1, :, :, -side_size:].fill_(float("-inf"))
296
+
297
+ dot_tiles = dot_tiles.view(b, h, q_n_group, q_n_group, -1, kv_width**2)
298
+ return dot_tiles
299
+
300
+ def get_overlapping_tiles(self, x):
301
+ """Get overlapping tiles in the 2D spatial domain, ensuring each query computes correlation with all neighbors"""
302
+ # x: b h (l w) d
303
+ b, h, _, d = x.shape
304
+ side_size = max(self.window_size // 2, 1)
305
+ total_size = 2 * side_size + self.window_size
306
+ kv_group_size = self.window_size
307
+ kv_width = self.img_size
308
+
309
+ x = x.view(b, h, kv_width, kv_width, d)
310
+ x = F.pad(x, [0, 0, side_size, side_size, side_size, side_size], value=0)
311
+
312
+ out_shape = [
313
+ b,
314
+ h,
315
+ kv_width // kv_group_size,
316
+ kv_width // kv_group_size,
317
+ total_size,
318
+ total_size,
319
+ d,
320
+ ]
321
+ in_stride = x.stride()
322
+ out_stride = [
323
+ in_stride[0],
324
+ in_stride[1],
325
+ in_stride[2] * kv_group_size,
326
+ in_stride[3] * kv_group_size,
327
+ in_stride[2],
328
+ in_stride[3],
329
+ in_stride[4],
330
+ ]
331
+
332
+ # note we ignored the boundary here
333
+ return x.as_strided(size=out_shape, stride=out_stride)
334
+
335
+ def compute_window_pv(self, attn, v):
336
+ """Compute the inner product of attention matrix and the values for the window attention."""
337
+ b, h, n_group, _, w2, n_k = attn.shape
338
+ d = v.shape[-1]
339
+ v_tiles = (
340
+ self.get_overlapping_tiles(v)
341
+ .contiguous()
342
+ .view(b, h, n_group, n_group, -1, d)
343
+ )
344
+
345
+ # b x h x n_group x n_group x w^2 x d
346
+ pv = attn.matmul(v_tiles)
347
+ # return: b x h x (lw) x d
348
+ ret = self.ungroup_dots(pv)
349
+
350
+ return ret
351
+
352
+ def group_dots(self, dots):
353
+ b, h = dots.shape[:2]
354
+ n_group = self.img_size // self.window_size
355
+ dots = dots.reshape(
356
+ b, h, n_group, self.window_size, n_group, self.window_size, -1
357
+ ).permute(0, 1, 2, 4, 3, 5, 6)
358
+ dots = dots.contiguous().view(
359
+ b, h, n_group, n_group, self.window_size * self.window_size, -1
360
+ )
361
+ return dots
362
+
363
+ def ungroup_dots(self, dots):
364
+ b, h, n_group, _, _, n_keys = dots.shape
365
+ dots = dots.reshape(
366
+ b, h, n_group, n_group, self.window_size, self.window_size, -1
367
+ ).permute(0, 1, 2, 4, 3, 5, 6)
368
+ dots = dots.contiguous().view(b, h, -1, n_keys)
369
+ return dots
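
The long-range branch of AttentionLS compresses the N keys and values into dp_rank dynamic-projection tokens per head before attending, which is what keeps the cost linear in sequence length. Below is a stripped-down sketch of that compression alone; window attention, global tokens and the DualLN layers are omitted.

import torch

B, H, N, d, r = 2, 4, 256, 16, 2    # batch, heads, tokens, head dim, dp_rank
q = torch.randn(B, H, N, d)
k = torch.randn(B, H, N, d)
v = torch.randn(B, H, N, d)

# Data-dependent projection scores, one row per low-rank slot (P_i in the paper).
c_scores = torch.randn(B, H, r, N).softmax(dim=-1)

k_low = c_scores @ k    # (B, H, r, d): compressed keys
v_low = c_scores @ v    # (B, H, r, d): compressed values

attn = (q @ k_low.transpose(-1, -2)).softmax(dim=-1)   # (B, H, N, r)
out = attn @ v_low                                      # (B, H, N, d)
print(out.shape)
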
surya/utils/__init__.py ADDED
File without changes
surya/utils/config.py ADDED
@@ -0,0 +1,311 @@
1
+ import os
2
+ from argparse import Namespace
3
+
4
+ import yaml
5
+
6
+
7
+ class DataConfig:
8
+ def __init__(
9
+ self,
10
+ train_data_path: str,
11
+ valid_data_path: str,
12
+ batch_size: int,
13
+ num_data_workers: int,
14
+ prefetch_factor: int,
15
+ time_delta_input_minutes: list[int],
16
+ n_input_timestamps: int | None = None,
17
+ pooling: int | None = None,
18
+ random_vert_flip: bool = False,
19
+ **kwargs,
20
+ ):
21
+ self.__dict__.update(kwargs)
22
+
23
+ self.train_data_path = train_data_path
24
+ self.valid_data_path = valid_data_path
25
+ self.batch_size = batch_size
26
+ self.num_data_workers = num_data_workers
27
+ self.prefetch_factor = prefetch_factor
28
+ self.time_delta_input_minutes = sorted(time_delta_input_minutes)
29
+ self.n_input_timestamps = n_input_timestamps
30
+ self.pooling = pooling
31
+ self.random_vert_flip = random_vert_flip
32
+
33
+ if self.n_input_timestamps is None:
34
+ self.n_input_timestamps = len(self.time_delta_input_minutes)
35
+
36
+ assert (
37
+ self.n_input_timestamps > 0
38
+ ), "Number of input timestamps must be greater than 0."
39
+ assert self.n_input_timestamps <= len(self.time_delta_input_minutes), (
40
+ f"Cannot sample {self.n_input_timestamps} from list of "
41
+ f"{self.time_delta_input_minutes} input timestamps."
42
+ )
43
+
44
+ def to_dict(self):
45
+ return self.__dict__
46
+
47
+ @staticmethod
48
+ def from_argparse(args: Namespace):
49
+ return DataConfig(**args.__dict__)
50
+
51
+ def __str__(self):
52
+ return (
53
+ f"Training index: {self.train_data_path}, "
54
+ f"Validation index: {self.valid_data_path}, "
55
+ )
56
+
57
+ def __repr__(self):
58
+ return (
59
+ f"Training index: {self.train_data_path}, "
60
+ f"Validation index: {self.valid_data_path}, "
61
+ )
62
+
63
+
64
+ class ModelConfig:
65
+ def __init__(
66
+ self,
67
+ # enc_num_layers: int,
68
+ # enc_num_heads: int,
69
+ # enc_embed_size: int,
70
+ # dec_num_layers: int,
71
+ # dec_num_heads: int,
72
+ # dec_embed_size: int,
73
+ # mask_ratio: float,
74
+ **kwargs,
75
+ ):
76
+ self.__dict__.update(kwargs)
77
+
78
+ # self.enc_num_layers = enc_num_layers
79
+ # self.enc_num_heads = enc_num_heads
80
+ # self.enc_embed_size = enc_embed_size
81
+ # self.dec_num_layers = dec_num_layers
82
+ # self.dec_num_heads = dec_num_heads
83
+ # self.dec_embed_size = dec_embed_size
84
+ # self.mlp_ratio = 0.0
85
+ # self.mask_ratio = mask_ratio
86
+
87
+ self.__dict__.update(kwargs)
88
+
89
+ def to_dict(self):
90
+ return self.__dict__
91
+
92
+ @staticmethod
93
+ def from_argparse(args: Namespace):
94
+ return ModelConfig(**args.__dict__)
95
+
96
+ @property
97
+ def encoder_d_ff(self):
98
+ return int(self.enc_embed_size * self.mlp_ratio)
99
+
100
+ @property
101
+ def decoder_d_ff(self):
102
+ return int(self.dec_embed_size * self.mlp_ratio)
103
+
104
+ def __str__(self):
105
+ return (
106
+ f"Input channels: {self.model.in_channels}, "
107
+ f"Encoder (L, H, E): {[self.enc_num_layers, self.enc_num_heads, self.enc_embed_size]}, "
108
+ f"Decoder (L, H, E): {[self.dec_num_layers, self.dec_num_heads, self.dec_embed_size]}"
109
+ )
110
+
111
+ def __repr__(self):
112
+ return (
113
+ f"Input channels: {self.model.in_channels}, "
114
+ f"Encoder (L, H, E): {[self.enc_num_layers, self.enc_num_heads, self.enc_embed_size]}, "
115
+ f"Decoder (L, H, E): {[self.dec_num_layers, self.dec_num_heads, self.dec_embed_size]}"
116
+ )
117
+
118
+
119
+ class OptimizerConfig:
120
+ def __init__(
121
+ self,
122
+ warm_up_steps: int,
123
+ max_epochs: int,
124
+ learning_rate: float,
125
+ min_lr: float,
126
+ ):
127
+ self.warm_up_steps = warm_up_steps
128
+ self.max_epochs = max_epochs
129
+ self.learning_rate = learning_rate
130
+ self.min_lr = min_lr
131
+
132
+ def to_dict(self):
133
+ return self.__dict__
134
+
135
+ @staticmethod
136
+ def from_argparse(args: Namespace):
137
+ return OptimizerConfig(**args.__dict__)
138
+
139
+ def __str__(self):
140
+ return (
141
+ f"Epochs: {self.max_epochs}, "
142
+ f"LR: {[self.learning_rate, self.min_lr]}, "
143
+ f"Warm up: {self.warm_up_steps},"
144
+ )
145
+
146
+ def __repr__(self):
147
+ return (
148
+ f"Epochs: {self.max_epochs}, "
149
+ f"LR: {[self.learning_rate, self.min_lr]}, "
150
+ f"Warm up: {self.warm_up_steps},"
151
+ )
152
+
153
+
154
+ class ExperimentConfig:
155
+ def __init__(
156
+ self,
157
+ job_id: str,
158
+ data_config: DataConfig,
159
+ model_config: ModelConfig,
160
+ optimizer_config: OptimizerConfig,
161
+ path_experiment: str,
162
+ parallelism: str,
163
+ from_checkpoint: str | None = None,
164
+ **kwargs,
165
+ ):
166
+ # additional experiment parameters used in downstream tasks
167
+ self.__dict__.update(kwargs)
168
+
169
+ self.job_id = job_id
170
+ self.data = data_config
171
+ self.model = model_config
172
+ self.optimizer = optimizer_config
173
+ self.path_experiment = path_experiment
174
+ self.from_checkpoint = from_checkpoint
175
+ self.parallelism = parallelism
176
+
177
+ assert self.model.in_channels == len(self.data.channels), (
178
+ f"Number of model input channels ({self.model.in_channels}) must be "
179
+ f"equal to number of input variables ({len(self.data.channels)})."
180
+ )
181
+ if self.model.time_embedding["type"] == "linear":
182
+ assert (
183
+ self.model.time_embedding["time_dim"] == self.data.n_input_timestamps
184
+ ), "Time dimension of linear embedding must be equal to number of input timestamps."
185
+ if self.rollout_steps > 0:
186
+ assert self.data.n_input_timestamps == len(
187
+ self.data.time_delta_input_minutes
188
+ ), "Rollout does not support randomly sampled input timestamps."
189
+
190
+ metrics_channels = []
191
+ for field1, value1 in self.metrics["train_metrics_config"].items():
192
+ for field2, value2 in self.metrics["train_metrics_config"][field1].items():
193
+ if field2 == "metrics":
194
+ for metric_definition in value2:
195
+ split_metric_definition = metric_definition.split(":")
196
+ channels = (
197
+ split_metric_definition[2]
198
+ if len(split_metric_definition) > 2
199
+ else None
200
+ )
201
+ if channels is not None:
202
+ metrics_channels = metrics_channels + channels.split("...")
203
+
204
+ for field1, value1 in self.metrics["validation_metrics_config"].items():
205
+ for field2, value2 in self.metrics["validation_metrics_config"][
206
+ field1
207
+ ].items():
208
+ if field2 == "metrics":
209
+ for metric_definition in value2:
210
+ split_metric_definition = metric_definition.split(":")
211
+ channels = (
212
+ split_metric_definition[2]
213
+ if len(split_metric_definition) > 2
214
+ else None
215
+ )
216
+ if channels is not None:
217
+ metrics_channels = metrics_channels + channels.replace(
218
+ "...", "&"
219
+ ).split("&")
220
+
221
+ assert set(metrics_channels).issubset(self.data.channels), (
222
+ f"{set(metrics_channels).difference(self.data.channels)} "
223
+ f"not part of data input channels."
224
+ )
225
+
226
+ assert self.parallelism in [
227
+ "ddp",
228
+ "fsdp",
229
+ ], 'Valid choices for `parallelism` are "ddp" and "fsdp".'
230
+
231
+ @property
232
+ def path_checkpoint(self) -> str:
233
+ if self.path_experiment == "":
234
+ return os.path.join(self.path_weights, "train", "checkpoint.pt")
235
+ else:
236
+ return os.path.join(
237
+ os.path.dirname(self.path_experiment),
238
+ "weights",
239
+ "train",
240
+ "checkpoint.pt",
241
+ )
242
+
243
+ @property
244
+ def path_weights(self) -> str:
245
+ return os.path.join(self.path_experiment, self.make_suffix_path(), "weights")
246
+
247
+ @property
248
+ def path_states(self) -> str:
249
+ return os.path.join(self.path_experiment, self.make_suffix_path(), "states")
250
+
251
+ def to_dict(self):
252
+ d = self.__dict__.copy()
253
+ d["model"] = self.model.to_dict()
254
+ d["data"] = self.data.to_dict()
255
+
256
+ return d
257
+
258
+ @staticmethod
259
+ def from_argparse(args: Namespace):
260
+ return ExperimentConfig(
261
+ data_config=DataConfig.from_argparse(args),
262
+ model_config=ModelConfig.from_argparse(args),
263
+ optimizer_config=OptimizerConfig.from_argparse(args),
264
+ **args.__dict__,
265
+ )
266
+
267
+ @staticmethod
268
+ def from_dict(params: dict):
269
+ return ExperimentConfig(
270
+ data_config=DataConfig(**params["data"]),
271
+ model_config=ModelConfig(**params["model"]),
272
+ optimizer_config=OptimizerConfig(**params["optimizer"]),
273
+ **params,
274
+ )
275
+
276
+ def make_folder_name(self) -> str:
277
+ param_folder = "wpt-c1-s1"
278
+ return param_folder
279
+
280
+ def make_suffix_path(self) -> str:
281
+ return os.path.join(self.job_id)
282
+
283
+ def __str__(self):
284
+ return (
285
+ f"ID: {self.job_id}, "
286
+ f"Epochs: {self.optimizer.max_epochs}, "
287
+ f"Batch size: {self.data.batch_size}, "
288
+ f"LR: {[self.optimizer.learning_rate, self.optimizer.min_lr]}, "
289
+ f"Warm up: {self.optimizer.warm_up_steps},"
290
+ f"DL workers: {self.data.num_data_workers},"
291
+ f"Parallelism: {self.parallelism}"
292
+ )
293
+
294
+ def __repr__(self):
295
+ return (
296
+ f"ID: {self.job_id}, "
297
+ f"Epochs: {self.optimizer.max_epochs}, "
298
+ f"Batch size: {self.data.batch_size}, "
299
+ f"LR: {[self.optimizer.learning_rate, self.optimizer.min_lr]}, "
300
+ f"Warm up: {self.optimizer.warm_up_steps},"
301
+ f"DL workers: {self.data.num_data_workers},"
302
+ f"Parallelism: {self.parallelism}"
303
+ )
304
+
305
+
306
+ def get_config(
307
+ config_path: str,
308
+ ) -> ExperimentConfig:
309
+ cfg = yaml.safe_load(open(config_path, "r"))
310
+ cfg["data"]["scalers"] = yaml.safe_load(open(cfg["data"]["scalers_path"], "r"))
311
+ return ExperimentConfig.from_dict(params=cfg)
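
The configuration classes validate their inputs at construction time; ExperimentConfig is normally assembled from a YAML file via get_config. A small sketch instantiating DataConfig directly, with placeholder paths and values:

from surya.utils.config import DataConfig

# Illustrative values only; real experiments read these from the YAML config.
data_cfg = DataConfig(
    train_data_path="index_train.csv",
    valid_data_path="index_valid.csv",
    batch_size=1,
    num_data_workers=2,
    prefetch_factor=2,
    time_delta_input_minutes=[-60, 0],
)
print(data_cfg.n_input_timestamps)   # defaults to len(time_delta_input_minutes) -> 2
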
surya/utils/data.py ADDED
@@ -0,0 +1,176 @@
1
+ from typing import Dict
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ from surya.datasets.transformations import Transformation, StandardScaler
7
+ from surya.utils.config import DataConfig
8
+ from surya.utils.misc import class_from_name, view_as_windows
9
+
10
+
11
+ def custom_collate_fn(batch):
12
+ """
13
+ Custom collate function for handling batches of data and metadata in a PyTorch DataLoader.
14
+
15
+ This function separately processes the data and metadata from the input batch.
16
+
17
+ - The `data_batch` is collated using PyTorch's `default_collate`. If collation fails due to incompatible data types,
18
+ the batch is returned as-is.
19
+
20
+ - The `metadata_batch` is assumed to be a dictionary, where each key corresponds to a list of values across the batch.
21
+ Each key is collated using `default_collate`. If collation fails for a particular key, the original list of values
22
+ is retained.
23
+
24
+ Example usage for accessing collated metadata:
25
+ - `collated_metadata['timestamps_input'][batch_idx][input_time]`
26
+ - `collated_metadata['timestamps_input'][batch_idx][rollout_step]`
27
+
28
+ Args:
29
+ batch (list of tuples): Each tuple contains (data, metadata), where:
30
+ - `data` is a tensor or other data structure used for training.
31
+ - `metadata` is a dictionary containing additional information.
32
+
33
+ Returns:
34
+ tuple: (collated_data, collated_metadata)
35
+ - `collated_data`: The processed batch of data.
36
+ - `collated_metadata`: The processed batch of metadata.
37
+ """
38
+
39
+ # Unpack batch into separate lists of data and metadata
40
+ data_batch, metadata_batch = zip(*batch)
41
+
42
+ # Attempt to collate the data batch using PyTorch's default collate function
43
+ try:
44
+ collated_data = torch.utils.data.default_collate(data_batch)
45
+ except TypeError:
46
+ # If default_collate fails (e.g., due to incompatible types), return the data batch as-is
47
+ collated_data = data_batch
48
+
49
+ # Handle metadata collation
50
+ if isinstance(metadata_batch[0], dict):
51
+ collated_metadata = {}
52
+ for key in metadata_batch[0].keys():
53
+ values = [d[key] for d in metadata_batch]
54
+ try:
55
+ # Attempt to collate values under the current key
56
+ collated_metadata[key] = torch.utils.data.default_collate(values)
57
+ except TypeError:
58
+ # If collation fails, keep the values as a list
59
+ collated_metadata[key] = values
60
+ else:
61
+ # If metadata is not a dictionary, try to collate it as a whole
62
+ try:
63
+ collated_metadata = torch.utils.data.default_collate(metadata_batch)
64
+ except TypeError:
65
+ # If collation fails, return metadata as-is
66
+ collated_metadata = metadata_batch
67
+
68
+ return collated_data, collated_metadata
69
+
70
+
71
+ def calc_num_windows(raw_size: int, win_size: int, stride: int) -> int:
72
+ return (raw_size - win_size) // stride + 1
73
+
74
+
75
+ def get_scalers_info(dataset) -> dict:
76
+ return {
77
+ k: (type(v).__module__, type(v).__name__, v.to_dict())
78
+ for k, v in dataset.scalers.items()
79
+ }
80
+
81
+
82
+ def build_scalers_pressure(info: dict) -> Dict[str, Transformation]:
83
+ ret_dict = {k: dict() for k in info.keys()}
84
+ for var_key, var_d in info.items():
85
+ for p_key, p_val in var_d.items():
86
+ ret_dict[var_key][p_key] = class_from_name(
87
+ p_val["base"], p_val["class"]
88
+ ).from_dict(p_val)
89
+ return ret_dict
90
+
91
+
92
+ def build_scalers(info: dict) -> Dict[str, Transformation]:
93
+ ret_dict = {k: None for k in info.keys()}
94
+ for p_key, p_val in info.items():
95
+ ret_dict[p_key]: StandardScaler = class_from_name(
96
+ p_val["base"], p_val["class"]
97
+ ).from_dict(p_val)
98
+ return ret_dict
99
+
100
+
101
+ def break_batch_5d(
102
+ data: list, lat_size: int, lon_size: int, time_steps: int
103
+ ) -> torch.Tensor:
104
+ """
105
+ data: list of samples, each sample is [C, T, L, H, W]
106
+ """
107
+ num_levels = data[0].shape[2]
108
+ num_vars = data[0].shape[0]
109
+ big_batch = np.stack(data, axis=0)
110
+ vw = view_as_windows(
111
+ big_batch,
112
+ [1, num_vars, time_steps, num_levels, lat_size, lon_size],
113
+ step=[1, num_vars, time_steps, num_levels, lat_size, lon_size],
114
+ ).squeeze()
115
+ # To check if it is correctly reshaping
116
+ # idx = 30
117
+ # (big_batch[0, :, idx:idx+2, :, 40:80, 40:80]-vw[idx//2, 1, 1]).sum()
118
+ vw = vw.reshape(-1, num_vars, time_steps, num_levels, lat_size, lon_size)
119
+ # How to test:
120
+ # (big_batch[0, :, :2, :, :40, :40] - vw[0]).sum()
121
+ # (big_batch[0, :, :2, :, :40, 40:80] - vw[1]).sum()
122
+ # (big_batch[0, :, :2, :, 40:80, :40] - vw[2]).sum()
123
+
124
+ # Need to move axis because Weather model is expecting [C, L, T, H, W] instead of [C, T, L, H, W]
125
+ vw = np.moveaxis(vw, 3, 2)
126
+ vw = torch.tensor(vw, dtype=torch.float32)
127
+ return vw
128
+
129
+
130
+ def break_batch_5d_aug(data: list, cfg: DataConfig, max_batch: int = 256) -> torch.Tensor:
131
+ num_levels = data[0].shape[2]
132
+ num_vars = data[0].shape[0]
133
+ big_batch = np.stack(data, axis=0)
134
+
135
+ y_step, x_step, t_step = (
136
+ cfg.patch_size_lat // 2,
137
+ cfg.patch_size_lon // 2,
138
+ cfg.patch_size_time // 2,
139
+ )
140
+ y_max = calc_num_windows(big_batch.shape[4], cfg.input_size_lat, y_step)
141
+ x_max = calc_num_windows(big_batch.shape[5], cfg.input_size_lon, x_step)
142
+ t_max = calc_num_windows(big_batch.shape[2], cfg.input_size_time, t_step)
143
+ max_batch = min(max_batch, y_max * x_max * t_max)
144
+
145
+ batch = np.empty(
146
+ (
147
+ max_batch,
148
+ num_vars,
149
+ cfg.input_size_time,
150
+ num_levels,
151
+ cfg.input_size_lat,
152
+ cfg.input_size_lon,
153
+ ),
154
+ dtype=np.float32,
155
+ )
156
+ for j, i in enumerate(np.random.permutation(np.arange(max_batch))):
157
+ t, y, x = np.unravel_index(
158
+ i,
159
+ (
160
+ t_max,
161
+ y_max,
162
+ x_max,
163
+ ),
164
+ )
165
+ batch[j] = big_batch[
166
+ :, # batch_id
167
+ :, # vars
168
+ t * t_step : t * t_step + cfg.input_size_time,
169
+ :, # levels
170
+ y * y_step : y * y_step + cfg.input_size_lat,
171
+ x * x_step : x * x_step + cfg.input_size_lon,
172
+ ]
173
+
174
+ batch = np.moveaxis(batch, 3, 2)
175
+ batch = torch.tensor(batch, dtype=torch.float32)
176
+ return batch
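The docstring of `custom_collate_fn` describes how tensors are stacked while non-collatable metadata falls back to plain lists. Below is a self-contained toy sketch; the `MiniDataset` is illustrative only, standing in for `HelioNetCDFDataset`, which likewise yields `(data, metadata)` pairs.

    import torch
    from torch.utils.data import Dataset, DataLoader
    from surya.utils.data import custom_collate_fn

    class MiniDataset(Dataset):
        """Tiny stand-in dataset returning (tensor, metadata-dict) pairs."""
        def __len__(self):
            return 8
        def __getitem__(self, idx):
            data = torch.randn(3, 16, 16)                          # stacked by default_collate
            metadata = {
                "timestamps_input": [f"2014-01-07T{idx:02d}:00"],  # strings stay as lists
                "index": idx,                                      # ints become a tensor
            }
            return data, metadata

    loader = DataLoader(MiniDataset(), batch_size=4, collate_fn=custom_collate_fn)
    batch_data, batch_meta = next(iter(loader))
    print(batch_data.shape)                  # torch.Size([4, 3, 16, 16])
    print(batch_meta["timestamps_input"])    # timestamps grouped across the batch
    print(batch_meta["index"])               # tensor([0, 1, 2, 3])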
surya/utils/distributed.py ADDED
@@ -0,0 +1,313 @@
1
+ import os
2
+ import random
3
+ from datetime import timedelta
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.backends.cudnn as cudnn
8
+ import torch.distributed as dist
9
+ import torch.nn as nn
10
+ from torch.distributed import checkpoint as dist_checkpoint
11
+ from torch.distributed import fsdp
12
+
13
+ import functools
14
+ import itertools
15
+
16
+ from torch.utils.data.distributed import DistributedSampler
17
+ from torch.utils.data import Dataset
18
+ from typing import Any, Dict, Optional
19
+
20
+ from surya.utils.schemas import TrainState
21
+
22
+
23
+ def init_dist(device: str, rank: int, world_size: int):
24
+ torch.distributed.init_process_group(
25
+ device,
26
+ init_method="env://",
27
+ world_size=world_size,
28
+ rank=rank,
29
+ timeout=timedelta(minutes=60),
30
+ )
31
+
32
+
33
+ def init_ddp(use_gpu: bool):
34
+ local_rank = int(os.environ["LOCAL_RANK"])
35
+ rank = int(os.environ["RANK"])
36
+ world_size = int(os.environ["WORLD_SIZE"])
37
+
38
+ if use_gpu:
39
+ assert (
40
+ torch.cuda.is_available()
41
+ ), "GPU requested but none was found in the system."
42
+
43
+ if use_gpu:
44
+ init_dist("nccl", rank, world_size)
45
+ torch.cuda.set_device(local_rank)
46
+ os.environ["TORCH_SHOW_CPP_STACKTRACES"] = str(1)
47
+ os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = str(1)
48
+ os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
49
+ cudnn.benchmark = True
50
+ else:
51
+ init_dist("gloo", rank, world_size)
52
+ return local_rank, rank
53
+
54
+
55
+ def set_global_seed(rank):
56
+ random.seed(42 + rank)
57
+ torch.cuda.manual_seed(42 + rank)
58
+ torch.manual_seed(42 + rank)
59
+ np.random.seed(42 + rank)
60
+
61
+
62
+ def is_dist_avail_and_initialized():
63
+ if not dist.is_available():
64
+ return False
65
+ if not dist.is_initialized():
66
+ return False
67
+ return True
68
+
69
+
70
+ def get_world_size():
71
+ if not is_dist_avail_and_initialized():
72
+ return 1
73
+ return dist.get_world_size()
74
+
75
+
76
+ def get_rank():
77
+ if not is_dist_avail_and_initialized():
78
+ return 0
79
+ return dist.get_rank()
80
+
81
+
82
+ def is_main_process():
83
+ return get_rank() == 0
84
+
85
+
86
+ # def save_model_singular(model, *args, **kwargs):
87
+ # """Stream all model parameters to rank 0 on the CPU, then pass all
88
+ # other given arguments to `torch.save` to save the model, but only on
89
+ # the root process.
90
+ # """
91
+ # save_policy = fsdp.FullStateDictConfig(
92
+ # offload_to_cpu=True, rank0_only=True)
93
+ # with fsdp.FullyShardedDataParallel.state_dict_type(
94
+ # model,
95
+ # fsdp.StateDictType.FULL_STATE_DICT,
96
+ # save_policy,
97
+ # ):
98
+ # cpu_state = model.state_dict()
99
+ # # We do *not* want to write to the same location with multiple
100
+ # # processes at the same time.
101
+ # if is_root_process():
102
+ # torch.save(cpu_state, *args, **kwargs)
103
+
104
+
105
+ def save_model(model, save_dir):
106
+ """Obtain sharded model parameters from the GPU, then save the model
107
+ as a distributed checkpoint to the given directory. Saving a
108
+ distributed checkpoint means that the checkpoint will be split into
109
+ individual files, one for each process.
110
+ """
111
+ state_dict_config = fsdp.ShardedStateDictConfig(offload_to_cpu=False)
112
+ with fsdp.FullyShardedDataParallel.state_dict_type(
113
+ model,
114
+ fsdp.StateDictType.SHARDED_STATE_DICT,
115
+ state_dict_config,
116
+ ):
117
+ cp_state_dict = {"model": model.state_dict()}
118
+ dist_checkpoint.save_state_dict(
119
+ cp_state_dict,
120
+ dist_checkpoint.FileSystemWriter(save_dir),
121
+ )
122
+
123
+
124
+ def load_model(model, load_dir):
125
+ """Set the given model's state dictionary in-place from the given
126
+ distributed checkpoint directory.
127
+ """
128
+ state_dict_config = fsdp.ShardedStateDictConfig(offload_to_cpu=False)
129
+ with fsdp.FullyShardedDataParallel.state_dict_type(
130
+ model,
131
+ fsdp.StateDictType.SHARDED_STATE_DICT,
132
+ state_dict_config,
133
+ ):
134
+ cp_state_dict = {"model": model.state_dict()}
135
+ dist_checkpoint.load_state_dict(
136
+ cp_state_dict,
137
+ dist_checkpoint.FileSystemReader(load_dir),
138
+ )
139
+ model.load_state_dict(cp_state_dict["model"])
140
+
141
+
142
+ @functools.lru_cache(maxsize=None)
143
+ def is_root_process():
144
+ """Return whether this process is the root process."""
145
+ return torch.distributed.get_rank() == 0
146
+
147
+
148
+ # The reason we define this is that `torch.distributed` does not
149
+ # implement it; for the global rank, there's
150
+ # `torch.distributed.get_rank()`.
151
+ @functools.lru_cache(maxsize=None)
152
+ def get_local_rank():
153
+ """Return the local rank of this process."""
154
+ return int(os.getenv("LOCAL_RANK"))
155
+
156
+
157
+ def print0(*args, **kwargs):
158
+ """Print something only on the root process."""
159
+ if (not dist.is_initialized()) or is_root_process():
160
+ print(*args, **kwargs)
161
+
162
+
163
+ def save_model_singular(model, save_path, parallelism, *args, **kwargs):
164
+ """Stream all model parameters to rank 0 on the CPU, then pass all
165
+ other given arguments to `torch.save` to save the model, but only on
166
+ the root process.
167
+ """
168
+
169
+ match parallelism:
170
+ case "fsdp":
171
+ save_policy = fsdp.FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
172
+ with fsdp.FullyShardedDataParallel.state_dict_type(
173
+ model,
174
+ fsdp.StateDictType.FULL_STATE_DICT,
175
+ save_policy,
176
+ ):
177
+ cpu_state = model.state_dict()
178
+ # We do *not* want to write to the same location with multiple
179
+ # processes at the same time.
180
+ if is_main_process():
181
+ if not os.path.exists(os.path.dirname(save_path)):
182
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
183
+ torch.save(obj=cpu_state, f=save_path, *args, **kwargs)
184
+
185
+ case "ddp":
186
+ if is_main_process():
187
+ torch.save(obj=model.module.state_dict(), f=save_path, *args, **kwargs)
188
+ dist.barrier()
189
+ case _:
190
+ raise ValueError(
191
+ f'`parallelism` should be one of "ddp" and "fsdp". Got {parallelism}.'
192
+ )
193
+
194
+
195
+ def save_optim_singular(
196
+ model: nn.Module,
197
+ optimizer: torch.optim.Optimizer,
198
+ save_path: str,
199
+ parallelism: str = "fsdp",
200
+ ):
201
+ match parallelism:
202
+ case "fsdp":
203
+ optim_state_dict_config = fsdp.FullOptimStateDictConfig(
204
+ offload_to_cpu=True, rank0_only=True
205
+ )
206
+
207
+ with fsdp.FullyShardedDataParallel.state_dict_type(
208
+ model,
209
+ fsdp.StateDictType.FULL_STATE_DICT,
210
+ optim_state_dict_config=optim_state_dict_config,
211
+ ):
212
+ optim_state_dict = fsdp.FullyShardedDataParallel.optim_state_dict(
213
+ model, optimizer
214
+ )
215
+
216
+ if is_main_process():
217
+ if not os.path.exists(os.path.dirname(save_path)):
218
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
219
+ checkpoint = {
220
+ "optimizer_state_dict": optim_state_dict,
221
+ }
222
+ torch.save(checkpoint, f=save_path)
223
+ case "ddp":
224
+ if is_main_process():
225
+ optim_state_dict = optimizer.state_dict()
226
+ if not os.path.exists(os.path.dirname(save_path)):
227
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
228
+ torch.save(obj=optim_state_dict, f=save_path)
229
+ dist.barrier()
230
+ case _:
231
+ raise ValueError(
232
+ f'`parallelism` should be one of "ddp" and "fsdp". Got {parallelism}.'
233
+ )
234
+
235
+
236
+ def collect_optim_singular(
237
+ model: nn.Module, optimizer: torch.optim.Optimizer, parallelism: str = "fsdp"
238
+ ) -> dict:
239
+ optim_state_dict = {}
240
+ match parallelism:
241
+ case "fsdp":
242
+ optim_state_dict_config = fsdp.FullOptimStateDictConfig(
243
+ offload_to_cpu=True, rank0_only=True
244
+ )
245
+
246
+ with fsdp.FullyShardedDataParallel.state_dict_type(
247
+ model,
248
+ fsdp.StateDictType.FULL_STATE_DICT,
249
+ optim_state_dict_config=optim_state_dict_config,
250
+ ):
251
+ optim_state_dict = fsdp.FullyShardedDataParallel.optim_state_dict(
252
+ model, optimizer
253
+ )
254
+
255
+ case "ddp":
256
+ if is_main_process():
257
+ optim_state_dict = optimizer.state_dict()
258
+ dist.barrier()
259
+ case _:
260
+ raise ValueError(
261
+ f'`parallelism` should be one of "ddp" and "fsdp". Got {parallelism}.'
262
+ )
263
+
264
+ return optim_state_dict
265
+
266
+
267
+ def save_state_singular(states: TrainState, save_path, *args, **kwargs):
268
+ """Stream all model parameters to rank 0 on the CPU, then pass all
269
+ other given arguments to `torch.save` to save paramters, but only on
270
+ the root process.
271
+ """
272
+ if is_main_process():
273
+ if not os.path.exists(os.path.dirname(save_path)):
274
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
275
+ torch.save(obj=states, f=save_path, *args, **kwargs)
276
+ dist.barrier()
277
+
278
+
279
+ class StatefulDistributedSampler(DistributedSampler):
280
+ _YIELDED = "yielded"
281
+
282
+ def __init__(
283
+ self,
284
+ dataset: Dataset,
285
+ num_replicas: Optional[int] = None,
286
+ rank: Optional[int] = None,
287
+ shuffle: bool = True,
288
+ seed: int = 0,
289
+ drop_last: bool = False,
290
+ ) -> None:
291
+ super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
292
+ self.yielded = 0
293
+ self.next_yielded = None
294
+
295
+ def __iter__(self):
296
+ self.yielded = 0
297
+ if self.next_yielded is not None:
298
+ self.yielded = self.next_yielded
299
+ self.next_yielded = None
300
+ it = super().__iter__()
301
+ for idx in itertools.islice(it, self.yielded, None):
302
+ self.yielded += 1
303
+ yield idx
304
+
305
+ def state_dict(self) -> Dict[str, Any]:
306
+ return {self._YIELDED: self.yielded}
307
+
308
+ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
309
+ if self._YIELDED not in state_dict:
310
+ raise ValueError("Invalid state_dict")
311
+ if state_dict[self._YIELDED] < 0:
312
+ raise ValueError("Cannot load state_dict with negative yielded value")
313
+ self.next_yielded = state_dict[self._YIELDED]
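To see how these helpers are meant to compose, here is a sketch of a bare-bones DDP entry point launched with `torchrun --nproc_per_node=N train_toy.py`. The toy model and checkpoint path are illustrative; the repository's actual training script is not part of this commit.

    import torch
    import torch.nn as nn
    from torch.nn.parallel import DistributedDataParallel as DDP

    from surya.utils.distributed import (
        init_ddp,
        set_global_seed,
        save_model_singular,
        print0,
    )

    def main():
        use_gpu = torch.cuda.is_available()
        # init_ddp reads RANK / LOCAL_RANK / WORLD_SIZE, which torchrun sets.
        local_rank, rank = init_ddp(use_gpu=use_gpu)
        set_global_seed(rank)

        device = torch.device(f"cuda:{local_rank}" if use_gpu else "cpu")
        model = nn.Linear(16, 4).to(device)
        model = DDP(model, device_ids=[local_rank] if use_gpu else None)

        # ... forward/backward/optimizer steps would go here ...

        print0("Writing checkpoint from the main process only.")
        save_model_singular(model, "checkpoints/toy.pt", parallelism="ddp")

    if __name__ == "__main__":
        main()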
surya/utils/log.py ADDED
@@ -0,0 +1,110 @@
1
+ import functools
2
+ import logging
3
+ import os
4
+ import sys
5
+ from time import time
6
+ from packaging.version import Version
7
+ import wandb
8
+ from typing import Dict, Optional, Any
9
+
10
+
11
+ if Version(wandb.__version__) < Version("0.20.0"):
12
+ WANDB_USE_SYNC = True
13
+ else:
14
+ WANDB_USE_SYNC = False
15
+
16
+
17
+ def log(
18
+ run,
19
+ data: Dict[str, Any],
20
+ step: Optional[int] = None,
21
+ commit: Optional[bool] = None,
22
+ sync: Optional[bool] = None,
23
+ ) -> None:
24
+ if run is not None:
25
+ # Note: wandb changed the .log API with version 0.20.0.
26
+ # This includes: "Removed no-op sync argument from wandb.Run::log function"
27
+ # We didn't test whether sync has any function here. But since we did
28
+ # all our development with it, let's keep it here for now.
29
+ # See https://github.com/wandb/wandb/releases/tag/v0.20.0
30
+ if WANDB_USE_SYNC:
31
+ run.log(data, step, commit, sync)
32
+ else:
33
+ run.log(data, step, commit)
34
+ else:
35
+ print(data)
36
+
37
+
38
+ # See: https://github.com/microsoft/Swin-Transformer/blob/main/logger.py
39
+ # See: https://github.com/Meituan-AutoML/Twins/blob/main/logger.py
40
+ def create_logger(output_dir: str, dist_rank: int, name: str) -> logging.Logger:
41
+ # create logger
42
+ logger = logging.getLogger(name)
43
+ logger.setLevel(logging.DEBUG)
44
+ logger.propagate = False
45
+
46
+ # create formatter
47
+ fmt = "[%(asctime)s %(name)s]: %(levelname)s %(message)s"
48
+
49
+ # create console handlers
50
+ if name.endswith("main"):
51
+ console_handler = logging.StreamHandler(sys.stdout)
52
+ console_handler.setLevel(logging.INFO)
53
+ console_handler.setFormatter(
54
+ logging.Formatter(fmt=fmt, datefmt="%Y-%m-%d %H:%M:%S")
55
+ )
56
+ logger.addHandler(console_handler)
57
+
58
+ # create file handlers
59
+ file_handler = logging.FileHandler(
60
+ os.path.join(output_dir, f"{name}.log"), mode="a"
61
+ )
62
+ file_handler.setLevel(logging.DEBUG)
63
+ file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt="%Y-%m-%d %H:%M:%S"))
64
+ logger.addHandler(file_handler)
65
+
66
+ return logger
67
+
68
+
69
+ def log_decorator(logger, _func=None):
70
+ def log_decorator_info(func):
71
+ @functools.wraps(func)
72
+ def log_decorator_wrapper(*args, **kwargs):
73
+ """Create a list of the positional arguments passed to function.
74
+ - Using repr() for string representation for each argument. repr() is similar to str() only
75
+ difference being it prints with a pair of quotes and if we calculate a value we get more
76
+ precise value than str().
77
+ """
78
+
79
+ # py_file_caller = getframeinfo(stack()[1][0])
80
+
81
+ local_rank = os.environ.get("LOCAL_RANK", default=None)
82
+ rank = os.environ.get("LOCAL_RANK", default=None)
83
+
84
+ try:
85
+ """log return value from the function"""
86
+ start_time = time()
87
+ value = func(*args, **kwargs)
88
+ if local_rank is None or rank is None:
89
+ logger.info(
90
+ f"Function '{func.__name__}' - Execution time: {(time() - start_time):.1f} seconds."
91
+ )
92
+ else:
93
+ logger.info(
94
+ f"Function '{func.__name__}' - Execution time: {(time() - start_time):.1f} "
95
+ f"seconds on rank {os.environ['RANK']} and local_rank {os.environ['LOCAL_RANK']}."
96
+ )
97
+ except Exception as err:
98
+ logger.error(f"Exception: {err}")
99
+ raise
100
+ return value
101
+
102
+ # Return the pointer to the function
103
+ return log_decorator_wrapper
104
+
105
+ # Decorator was called with arguments, so return a decorator function that can read and return a function
106
+ if _func is None:
107
+ return log_decorator_info
108
+ # Decorator was called without arguments, so apply the decorator to the function immediately
109
+ else:
110
+ return log_decorator_info(_func)
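A short sketch of the logging helpers in use (the `logs` directory and logger name are illustrative). Note that `create_logger` only attaches a stdout handler when the logger name ends in "main", so per-rank loggers write to their own files without flooding the console.

    import os
    from surya.utils.log import create_logger, log_decorator

    os.makedirs("logs", exist_ok=True)
    logger = create_logger(output_dir="logs", dist_rank=0, name="rank0_main")

    @log_decorator(logger)
    def slow_sum(n: int) -> int:
        return sum(range(n))

    logger.info("Starting toy run.")
    print(slow_sum(1_000_000))   # the decorator logs the call's execution time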
surya/utils/misc.py ADDED
@@ -0,0 +1,90 @@
1
+ import numbers
2
+ from logging import Logger
3
+ from time import time
4
+
5
+ import numpy as np
6
+ import torch
7
+ from numpy.lib.stride_tricks import as_strided
8
+ from torch.utils.data import DataLoader
9
+
10
+
11
+ def view_as_windows(arr_in: np.ndarray, window_shape, step=1) -> np.ndarray:
12
+ """Rolling window view of the input n-dimensional array.
13
+ Windows are overlapping views of the input array, with adjacent windows
14
+ shifted by a single row or column (or an index of a higher dimension).
15
+
16
+ Ref: https://github.com/scikit-image/scikit-image/blob/5e74a4a3a5149a8a14566b81a32bb15499aa3857/skimage/util/shape.py#L97-L247
17
+ Parameters
18
+ """
19
+
20
+ # -- basic checks on arguments
21
+ if not isinstance(arr_in, np.ndarray):
22
+ raise TypeError("`arr_in` must be a numpy ndarray")
23
+
24
+ ndim = arr_in.ndim
25
+
26
+ if isinstance(window_shape, numbers.Number):
27
+ window_shape = (window_shape,) * ndim
28
+ if not (len(window_shape) == ndim):
29
+ raise ValueError("`window_shape` is incompatible with `arr_in.shape`")
30
+
31
+ if isinstance(step, numbers.Number):
32
+ if step < 1:
33
+ raise ValueError("`step` must be >= 1")
34
+ step = (step,) * ndim
35
+ if len(step) != ndim:
36
+ raise ValueError("`step` is incompatible with `arr_in.shape`")
37
+
38
+ arr_shape = np.array(arr_in.shape)
39
+ window_shape = np.array(window_shape, dtype=arr_shape.dtype)
40
+
41
+ if ((arr_shape - window_shape) < 0).any():
42
+ raise ValueError("`window_shape` is too large")
43
+
44
+ if ((window_shape - 1) < 0).any():
45
+ raise ValueError("`window_shape` is too small")
46
+
47
+ # -- build rolling window view
48
+ slices = tuple(slice(None, None, st) for st in step)
49
+ window_strides = np.array(arr_in.strides)
50
+
51
+ indexing_strides = arr_in[slices].strides
52
+
53
+ win_indices_shape = (
54
+ (np.array(arr_in.shape) - np.array(window_shape)) // np.array(step)
55
+ ) + 1
56
+
57
+ new_shape = tuple(list(win_indices_shape) + list(window_shape))
58
+ strides = tuple(list(indexing_strides) + list(window_strides))
59
+
60
+ arr_out = as_strided(arr_in, shape=new_shape, strides=strides)
61
+ return arr_out
62
+
63
+
64
+ def class_from_name(module_name: str, class_name: str) -> object:
65
+ # load the module, will raise ImportError if module cannot be loaded
66
+ m = __import__(module_name, globals(), locals(), [class_name])
67
+ # get the class, will raise AttributeError if class cannot be found
68
+ c = getattr(m, class_name)
69
+ return c
70
+
71
+
72
+ @torch.no_grad()
73
+ def throughput(data_loader: DataLoader, model: torch.nn.Module, logger: Logger):
74
+ model.eval()
75
+
76
+ for idx, (images, _) in enumerate(data_loader):
77
+ images = images.cuda(non_blocking=True)
78
+ batch_size = images.shape[0]
79
+ for i in range(50):
80
+ model(images)
81
+ torch.cuda.synchronize()
82
+ logger.info("throughput averaged with 30 times")
83
+ tic1 = time()
84
+ for i in range(30):
85
+ model(images)
86
+ torch.cuda.synchronize()
87
+ tic2 = time()
88
+ logger.info(
89
+ f"batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}"
90
+ )
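`view_as_windows` is the building block behind `break_batch_5d`; it produces a strided, zero-copy view of tiles. A minimal 2-D illustration:

    import numpy as np
    from surya.utils.misc import view_as_windows

    arr = np.arange(16).reshape(4, 4)
    # Non-overlapping 2x2 tiles: a 2x2 grid of 2x2 patches.
    windows = view_as_windows(arr, window_shape=(2, 2), step=(2, 2))
    print(windows.shape)    # (2, 2, 2, 2)
    print(windows[0, 1])    # top-right patch: [[2, 3], [6, 7]]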
surya/utils/schemas.py ADDED
@@ -0,0 +1,14 @@
1
+ from typing import TypedDict, Dict, Any
2
+ import torch
3
+
4
+
5
+ class TrainState(TypedDict):
6
+ dataloader: torch.utils.data.DataLoader
7
+ optimizer: Dict[str, Any]
8
+ scheduler: Dict[str, Any]
9
+ sampler: Any # Changed from torch.utils.data.sampler to Any
10
+ profiler: bool
11
+ epoch: int
12
+ iteration: int
13
+ loss: float
14
+ wandb_state: int
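`TrainState` is the TypedDict that `save_state_singular` in surya/utils/distributed.py persists. A sketch of assembling one; the concrete objects and the meaning of `wandb_state` are assumptions, not defined by this diff.

    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from surya.utils.schemas import TrainState

    dataset = TensorDataset(torch.randn(8, 3), torch.randn(8, 1))
    loader = DataLoader(dataset, batch_size=4)
    model = torch.nn.Linear(3, 1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

    state: TrainState = {
        "dataloader": loader,
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "sampler": None,           # e.g. a StatefulDistributedSampler.state_dict()
        "profiler": False,
        "epoch": 0,
        "iteration": 0,
        "loss": float("inf"),
        "wandb_state": 0,
    }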