import torch


def expand_t(t, x):
    """Reshape the time vector ``t`` so it broadcasts against ``x``.

    Args:
        t: [bsz,], time vector
        x: [bsz, ...], data point

    Returns:
        ``t`` viewed as ``[bsz, 1, ..., 1]`` with the same number of
        dimensions as ``x``.
    """
    dims = [1] * (x.dim() - 1)
    return t.view(t.size(0), *dims)


def randn_tensor(shape, noise_repeat, device, dtype=torch.float32):
    """Sample Gaussian noise that is tiled along the batch dimension.

    Only ``noise_repeat`` independent rows are drawn; that group of rows is
    then repeated ``bsz // noise_repeat`` times along dimension 0, so row
    ``i`` equals row ``i + noise_repeat``.

    Args:
        shape: requested output shape ``(bsz, ...)``; any sequence of ints.
        noise_repeat: number of independent rows to draw.
        device: device passed to ``torch.randn``.
        dtype: dtype passed to ``torch.randn``.

    Returns:
        Tensor of shape ``shape``.

    Raises:
        ValueError: if ``bsz`` is not divisible by ``noise_repeat``.
    """
    # Normalise so lists and torch.Size can be concatenated with a tuple below.
    shape = tuple(shape)
    bsz = shape[0]
    if bsz % noise_repeat != 0:
        raise ValueError(f"Batch size ({bsz}) must be divisible by noise repeat ({noise_repeat})")
    base = torch.randn((noise_repeat,) + shape[1:], device=device, dtype=dtype)
    # Tensor.repeat needs one factor per tensor dimension: tile only dim 0 and
    # leave every trailing dimension untouched, whatever the rank of ``shape``.
    trailing = [1] * (len(shape) - 1)
    return base.repeat(bsz // noise_repeat, *trailing)


def rotate_half(x):
    """Rotates half the hidden dims of the input.

    The last dimension is split into halves ``(x1, x2)`` and the result is
    ``cat((-x2, x1))`` along that dimension.
    """
    half = x.shape[-1] // 2
    x1 = x[..., :half]
    x2 = x[..., half:]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embedding to the query and key tensors.

    Args:
        q: query tensor.
        k: key tensor.
        cos: cosine part of the rotary embedding.
        sin: sine part of the rotary embedding.
        unsqueeze_dim: dimension along which ``cos`` and ``sin`` are
            unsqueezed so that they can broadcast against ``q`` and ``k``.

    Returns:
        Tuple ``(q_embed, k_embed)`` of the rotated query and key tensors.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
    The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        # Nothing to repeat: hand back the input tensor itself.
        return hidden_states
    # Insert a new axis after the head axis and expand it to n_rep, then fold
    # it into the head axis so each head appears n_rep times consecutively.
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def identity(input: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Return ``input`` unchanged; extra positional/keyword arguments are ignored."""
    return input


def rms_norm(
    input: torch.Tensor,
    normalized_shape: torch.Size,
    eps: float = 1e-6,
) -> torch.Tensor:
    """Root-mean-square normalisation over the trailing dimensions, without scale.

    Args:
        input: tensor to normalise.
        normalized_shape: only its length is used; it selects how many
            trailing dimensions of ``input`` are normalised together.
        eps: value added to the mean square before the reciprocal square root.

    Returns:
        Normalised tensor with the dtype of ``input``; the arithmetic is
        carried out in float32.
    """
    dtype = input.dtype
    n_dims = len(normalized_shape)
    # Index that re-appends the reduced dimensions as size-1 axes.
    keep = (...,) + (None,) * n_dims
    x = input.to(torch.float32)
    variance = x.flatten(-n_dims).pow(2).mean(dim=-1)[keep]
    x = x * torch.rsqrt(variance + eps)
    return x.to(dtype)


def layer_norm(
    input: torch.Tensor,
    normalized_shape: torch.Size,
    eps: float = 1e-6,
) -> torch.Tensor:
    """Layer normalisation over the trailing dimensions, without scale or bias.

    Args:
        input: tensor to normalise.
        normalized_shape: only its length is used; it selects how many
            trailing dimensions of ``input`` are normalised together.
        eps: value added to the variance before the reciprocal square root.

    Returns:
        Normalised tensor with the dtype of ``input``; the arithmetic is
        carried out in float32.
    """
    dtype = input.dtype
    n_dims = len(normalized_shape)
    # Index that re-appends the reduced dimensions as size-1 axes.
    keep = (...,) + (None,) * n_dims
    x = input.to(torch.float32)
    mean = x.flatten(-n_dims).mean(dim=-1)[keep]
    # Centre once and reuse for both the variance and the final output.
    centered = x - mean
    variance = centered.flatten(-n_dims).pow(2).mean(dim=-1)[keep]
    return (centered * torch.rsqrt(variance + eps)).to(dtype)