# Spaces: Running on Zero
import torch | |
def expand_t(t, x):
    """Reshape the time vector ``t`` so it broadcasts against ``x``.

    Args:
        t: [bsz,], time vector
        x: [bsz, ...], data point

    Returns:
        A view of ``t`` whose first dimension is ``t.size(0)`` followed by
        one singleton dimension for every non-leading dimension of ``x``.
    """
    # One size-1 axis per trailing dimension of x.
    trailing_ones = (1,) * (x.dim() - 1)
    return t.view((t.shape[0],) + trailing_ones)
def randn_tensor(shape, noise_repeat, device, dtype=torch.float32):
    """Sample Gaussian noise in which only ``noise_repeat`` rows are independent.

    A tensor of shape ``(noise_repeat, *shape[1:])`` is drawn from a standard
    normal distribution and then tiled along the first dimension until it has
    ``shape[0]`` rows, so row ``i`` equals row ``i % noise_repeat``.

    Args:
        shape: target shape (tuple, list or ``torch.Size``) of any rank >= 1;
            ``shape[0]`` is the batch size.
        noise_repeat: number of independent noise rows; must be positive and
            divide the batch size.
        device: device passed to ``torch.randn``.
        dtype: dtype passed to ``torch.randn``.

    Returns:
        Tensor of shape ``shape``.

    Raises:
        ValueError: if ``noise_repeat`` is not positive, or the batch size is
            not divisible by ``noise_repeat``.
    """
    if noise_repeat <= 0:
        raise ValueError(f"Noise repeat ({noise_repeat}) must be a positive integer")
    bsz = shape[0]
    if bsz % noise_repeat != 0:
        raise ValueError(f"Batch size ({bsz}) must be divisible by noise repeat ({noise_repeat})")
    # tuple() lets list-like shapes be concatenated with the leading dimension.
    tail = tuple(shape[1:])
    base = torch.randn((noise_repeat,) + tail, device=device, dtype=dtype)
    # Tensor.repeat needs one factor per dimension: tile the batch axis only.
    return base.repeat(bsz // noise_repeat, *([1] * len(tail)))
def rotate_half(x):
    """Rotate the last dimension of ``x`` by half.

    The last dimension is split at ``x.shape[-1] // 2``; the result is the
    negated second part followed by the first part.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embeddings to the query and key tensors.

    Args:
        q: query tensor.
        k: key tensor.
        cos: cosine table; a singleton dimension is inserted at
            ``unsqueeze_dim`` before it is multiplied with ``q`` and ``k``.
        sin: sine table, handled the same way as ``cos``.
        unsqueeze_dim: position at which ``cos`` and ``sin`` are unsqueezed.

    Returns:
        Tuple ``(q_embed, k_embed)`` where each is
        ``t * cos + rotate_half(t) * sin``.
    """
    cos_b, sin_b = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

    def _rotate(t):
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head ``n_rep`` times along the head dimension.

    Gives the same result as
    ``torch.repeat_interleave(hidden_states, repeats=n_rep, dim=1)``: the
    input goes from (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_key_value_heads * n_rep, seqlen, head_dim). When ``n_rep`` is
    1 the input tensor itself is returned.
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold
        # it into the head axis.
        expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
        return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
    return hidden_states
def identity(input: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Return ``input`` unchanged; extra positional and keyword arguments are ignored."""
    return input
def rms_norm(
    input: torch.Tensor,
    normalized_shape: "int | torch.Size",
    eps: float = 1e-6,
) -> torch.Tensor:
    """Apply RMS normalization (no learnable scale) over the trailing dimensions.

    The computation runs in float32 and the result is cast back to the dtype
    of ``input``.

    Args:
        input: tensor to normalize.
        normalized_shape: shape of the trailing dimensions to normalize over.
            Only its length is used; an int is treated as a single trailing
            dimension.
        eps: value added to the mean of squares before the reciprocal square
            root.

    Returns:
        Tensor with the same shape and dtype as ``input``.
    """
    num_dims = 1 if isinstance(normalized_shape, int) else len(normalized_shape)
    orig_dtype = input.dtype
    x = input.to(torch.float32)
    # Mean of squares over the flattened trailing dims ...
    mean_sq = x.flatten(-num_dims).pow(2).mean(dim=-1)
    # ... with singleton axes restored so it broadcasts against x.
    mean_sq = mean_sq[(...,) + (None,) * num_dims]
    return (x * torch.rsqrt(mean_sq + eps)).to(orig_dtype)
def layer_norm(
    input: torch.Tensor,
    normalized_shape: "int | torch.Size",
    eps: float = 1e-6,
) -> torch.Tensor:
    """Apply layer normalization (no learnable affine) over the trailing dimensions.

    The computation runs in float32 and the result is cast back to the dtype
    of ``input``. The variance is the biased (population) variance.

    Args:
        input: tensor to normalize.
        normalized_shape: shape of the trailing dimensions to normalize over.
            Only its length is used; an int is treated as a single trailing
            dimension.
        eps: value added to the variance before the reciprocal square root.

    Returns:
        Tensor with the same shape and dtype as ``input``.
    """
    num_dims = 1 if isinstance(normalized_shape, int) else len(normalized_shape)
    orig_dtype = input.dtype
    x = input.to(torch.float32)
    # Index that restores the reduced trailing dims as singleton axes.
    restore = (...,) + (None,) * num_dims
    mean = x.flatten(-num_dims).mean(dim=-1)[restore]
    # Centre once and reuse for both the variance and the output.
    centered = x - mean
    variance = centered.flatten(-num_dims).pow(2).mean(dim=-1)[restore]
    return (centered * torch.rsqrt(variance + eps)).to(orig_dtype)