Sin2pi committed (verified)
Commit 5c4028e · 1 Parent(s): 2b25a15

Update README.md

Files changed (1):
  1. README.md +0 -812

README.md CHANGED
@@ -296,815 +296,3 @@ MaxFactor is a custom PyTorch optimizer with adaptive learning rates and special
 
  ** this model deviates in a lot of ways from standard transformer models.
 
-
-```python
-import os
-import math
-import warnings
-import logging
-from itertools import chain
-import torch
-import torch.nn.functional as F
-from torch import nn, Tensor
-from tensordict import TensorDict
-from typing import Optional, Dict, Union, List, Tuple
-import numpy as np
-from functools import partial
-from datetime import datetime
-from transformers.trainer_seq2seq import Seq2SeqTrainer
-from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
-from echoutils import *
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-dtype = torch.float32
-warnings.filterwarnings("ignore")
-logging.basicConfig(level=logging.ERROR)
-
-class rotary(nn.Module):
-    def __init__(self, dims, head, max_ctx=1500, radii=False, debug: List[str] = [], use_pbias=False, axial=False, spec_shape=None):
-
-        super(rotary, self).__init__()
-        self.use_pbias = use_pbias
-        self.dims = dims
-        self.head = head
-        self.head_dim = dims // head
-        self.radii = radii
-        self.debug = debug
-        self.counter = 0
-        self.last_theta = None
-        self.axial = axial
-
-        self.bias = nn.Parameter(torch.zeros(max_ctx, dims // 2), requires_grad=True if use_pbias else False)
-        theta = (torch.tensor(10000, device=device, dtype=dtype))
-        self.theta = nn.Parameter(theta, requires_grad=True)
-        self.theta_values = []
-
-        if axial and spec_shape is not None:
-            time_frames, freq_bins = spec_shape
-            self.time_frames = time_frames
-            self.freq_bins = freq_bins
-
-            time_theta = 50.0
-            time_freqs = 1.0 / (time_theta ** (torch.arange(0, dims, 4)[:(dims // 4)].float() / dims))
-            self.register_buffer('time_freqs', time_freqs)
-
-            freq_theta = 100.0
-            freq_freqs = 1.0 / (freq_theta ** (torch.arange(0, dims, 4)[:(dims // 4)].float() / dims))
-            self.register_buffer('freq_freqs', freq_freqs)
-
-    def pitch_bias(self, f0):
-        if f0 is None:
-            return None
-        f0_flat = f0.squeeze().float()
-        f0_norm = (f0_flat - f0_flat.mean()) / (f0_flat.std() + 1e-8)
-        f0_sim = torch.exp(-torch.cdist(f0_norm.unsqueeze(1),
-                                        f0_norm.unsqueeze(1)))
-        return f0_sim.unsqueeze(0).unsqueeze(0)
-
-    def theta_freqs(self, theta):
-        if theta.dim() == 0:
-            theta = theta.unsqueeze(0)
-        freq = (theta.unsqueeze(-1) / 220.0) * 700 * (
-            torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)),
-                self.head_dim // 2, device=theta.device, dtype=theta.dtype) / 2595) - 1) / 1000
-        return freq
-
-    def _apply_radii(self, freqs, f0, ctx):
-        if self.radii and f0 is not None:
-            radius = f0.to(device, dtype)
-            L = radius.shape[0]
-            if L != ctx:
-                F = L / ctx
-                idx = torch.arange(ctx, device=f0.device)
-                idx = (idx * F).long().clamp(0, L - 1)
-                radius = radius[idx]
-                return torch.polar(radius.unsqueeze(-1), freqs), radius
-            else:
-                return torch.polar(radius.unsqueeze(-1), freqs), radius
-        else:
-            return torch.polar(torch.ones_like(freqs), freqs), None
-
-    def check_f0(self, f0, f0t, ctx):
-        if f0 is not None and f0.shape[1] == ctx:
-            return f0
-        elif f0t is not None and f0t.shape[1] == ctx:
-            return f0t
-        else:
-            return None
-
-    def axial_freqs(self, ctx):
-        if not self.axial:
-            return None
-        time_frames = self.time_frames
-        freq_bins = self.freq_bins
-
-        t = torch.arange(ctx, device=device, dtype=dtype)
-        t_x = (t % time_frames).float()
-        t_y = torch.div(t, time_frames, rounding_mode='floor').float()
-        freqs_x = torch.outer(t_x, self.time_freqs)
-        freqs_y = torch.outer(t_y, self.freq_freqs)
-        freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
-        freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
-        return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
-
-    def forward(self, x=None, en=None, f=None, layer=None) -> Tensor:
-        ctx = x
-        f0 = en.get("f0") if en is not None else None
-        f0t = en.get("f0t") if en is not None else None
-
-        f0 = self.check_f0(f0, f0t, ctx)
-        if f0 is not None:
-            if f0.dim() == 2:
-                f0 = f0.squeeze(0)
-            theta = f0 + self.theta
-        else:
-            theta = self.theta
-        freqs = self.theta_freqs(theta)
-        t = torch.arange(ctx, device=device, dtype=dtype)
-        freqs = t[:, None] * freqs
-        freqs, radius = self._apply_radii(freqs, f0, ctx)
-
-        if self.axial and f == "spectrogram":
-            freqs_2d = self.axial_freqs(ctx)
-            if freqs_2d is not None:
-                return freqs_2d.unsqueeze(0)
-
-        if "radius" in self.debug and self.counter == 10:
-            print(f" [{layer}] [Radius] {radius.shape if radius is not None else None} {radius.mean() if radius is not None else None} [Theta] {theta.mean() if theta is not None else None} [f0] {f0.shape if f0 is not None else None} [Freqs] {freqs.shape} {freqs.mean():.2f} [ctx] {ctx}")
-        self.counter += 1
-        return freqs.unsqueeze(0)
-
-    @staticmethod
-    def apply_rotary(x, freqs):
-        x1 = x[..., :freqs.shape[-1]*2]
-        x2 = x[..., freqs.shape[-1]*2:]
-        orig_shape = x1.shape
-        if x1.ndim == 2:
-            x1 = x1.unsqueeze(0)
-        x1 = x1.float().reshape(*x1.shape[:-1], -1, 2).contiguous()
-        x1 = torch.view_as_complex(x1) * freqs
-        x1 = torch.view_as_real(x1).flatten(-2)
-        x1 = x1.view(orig_shape)
-        return torch.cat([x1.type_as(x), x2], dim=-1)
-
-class MultiheadA(nn.Module):
-
-    rbf = False
-    def __init__(self, dims: int, head: int, rotary_emb: bool = True,
-                 zero_val: float = 1e-7, minz: float = 1e-8, maxz: float = 1e-6, debug: List[str] = [], optim_attn=False, use_pbias=False):
-        super(MultiheadA, self).__init__()
-
-        self.dims = dims
-        self.head = head
-        self.head_dim = dims // head
-        self.debug = debug
-        self.counter = 0
-        self.use_pbias = use_pbias
-
-        self.q = nn.Linear(dims, dims).to(device, dtype)
-        self.k = nn.Linear(dims, dims, bias=False).to(device, dtype)
-        self.v = nn.Linear(dims, dims).to(device, dtype)
-        self.o = nn.Linear(dims, dims).to(device, dtype)
-
-        self.pad_token = 0
-        self.rotary_emb = rotary_emb
-        self.minz = minz
-        self.maxz = maxz
-        self.zero_val = zero_val
-        self.optim_attn = optim_attn
-        self.fzero = nn.Parameter(torch.tensor(zero_val, device=device, dtype=dtype), requires_grad=False)
-
-        if rotary_emb:
-            self.rope = rotary(
-                dims=dims,
-                head=head,
-                debug=debug,
-                radii=False,
-                )
-        else:
-            self.rope = None
-
-    def cos_sim(self, q: Tensor, k: Tensor, v: Tensor, mask) -> Tensor:
-        q_norm = torch.nn.functional.normalize(q, dim=-1, eps=1e-12)
-        k_norm = torch.nn.functional.normalize(k, dim=-1, eps=1e-12)
-        qk_cosine = torch.matmul(q_norm, k_norm.transpose(-1, -2))
-        qk_cosine = qk_cosine + mask
-        weights = F.softmax(qk_cosine, dim=-1)
-        out = torch.matmul(weights, v)
-        return out
-
-    def rbf_scores(self, q, k, rbf_sigma=1.0, rbf_ratio=0.0):
-        scale = (self.dims // self.head) ** -0.25
-        dot_scores = torch.matmul(q, k.transpose(-1, -2)) * scale
-        if rbf_ratio <= 0.0:
-            return dot_scores
-        q_norm = q.pow(2).sum(dim=-1, keepdim=True)
-        k_norm = k.pow(2).sum(dim=-1, keepdim=True)
-        qk = torch.matmul(q, k.transpose(-1, -2))
-        dist_sq = q_norm + k_norm.transpose(-1, -2) - 2 * qk
-        rbf_scores = torch.exp(-dist_sq / (2 * rbf_sigma**2))
-        return (1 - rbf_ratio) * dot_scores + rbf_ratio * rbf_scores
-
-    def forward(self, x: Tensor, xa=None, mask=None, en=None, layer=None, f=None) -> tuple:
-
-        x = x.to(device, dtype)
-        if xa is not None:
-            xa = xa.to(device, dtype)
-        scale = (self.dims // self.head) ** -0.25
-
-        z = default(xa, x).to(device, dtype)
-        q = self.q(x)
-        k = self.k(z)
-        v = self.v(z)
-
-        if self.rotary_emb:
-            q = q.view(*q.shape[:2], self.head, -1).permute(0, 2, 1, 3)
-            k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
-            v = v.view(*v.shape[:2], self.head, -1).permute(0, 2, 1, 3)
-            q2 = q.shape[2]
-            k2 = k.shape[2]
-
-            q = self.rope.apply_rotary(q, (self.rope(x=q2, en=en, f=f, layer=layer)))
-            k = self.rope.apply_rotary(k, (self.rope(x=k2, en=en, f=f, layer=layer)))
-        else:
-            q = q.view(*q.shape[:2], self.head, -1).permute(0, 2, 1, 3)
-            k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
-            v = v.view(*v.shape[:2], self.head, -1).permute(0, 2, 1, 3)
-
-        qk = (q * scale) @ (k * scale).transpose(-1, -2)
-
-        if self.rbf:
-            qk = self.rbf_scores(q * scale, k * scale, rbf_sigma=1.0, rbf_ratio=0.3)
-        if self.use_pbias:
-            pbias = self.rope.pitch_bias(f0=en.get("f0", None) if en is not None else None)
-            if pbias is not None:
-                qk = qk + pbias[:, :, :q2, :q2]
-
-        token_ids = k[:, :, :, 0]
-        zscale = torch.ones_like(token_ids)
-        fzero = torch.clamp(F.softplus(self.fzero), self.minz, self.maxz)
-        zscale[token_ids.float() == self.pad_token] = fzero
-
-        if mask is not None:
-            if mask.dim() == 4:
-                mask = mask[0, 0]
-            mask = mask[:q2, :k2] if xa is not None else mask[:q2, :q2]
-            qk = qk + mask * zscale.unsqueeze(-2).expand(qk.shape)
-
-        qk = qk * zscale.unsqueeze(-2)
-        w = F.softmax(qk, dim=-1).to(q.dtype)
-        wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
-
-        if "multihead" in self.debug and self.counter % 100 == 0:
-            print(f"MHA: q={q.shape}, k={k.shape}, v={v.shape} - {qk.shape}, wv shape: {wv.shape}")
-        self.counter += 1
-        return self.o(wv), qk
-
-    @staticmethod
-    def split(X: Tensor) -> (Tensor, Tensor):
-        half_dim = X.shape[-1] // 2
-        return X[..., :half_dim], X[..., half_dim:]
-
-class t_gate(nn.Module):
-    def __init__(self, dims, num_types=4, enabled=True):
-        super().__init__()
-        self.enabled = enabled
-        self.gate_projections = nn.ModuleList([
-            nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            for _ in range(num_types)])
-        self.type_classifier = nn.Sequential(
-            Linear(dims, num_types),
-            nn.Softmax(dim=-1))
-    def forward(self, x):
-        if not self.enabled:
-            return None
-        type_probs = self.type_classifier(x)
-        gates = torch.stack([gate(x) for gate in self.gate_projections], dim=-1)
-        comb_gate = torch.sum(gates * type_probs.unsqueeze(2), dim=-1)
-        return comb_gate
-
-class m_gate(nn.Module):
-    def __init__(self, dims, mem_size=64, enabled=True):
-        super().__init__()
-        self.enabled = enabled
-        if enabled:
-            self.m_key = nn.Parameter(torch.randn(mem_size, dims))
-            self.m_val = nn.Parameter(torch.randn(mem_size, 1))
-            self.gate_proj = nn.Sequential(Linear(dims, dims//2), nn.SiLU(), Linear(dims//2, 1))
-
-    def forward(self, x):
-        if not self.enabled:
-            return None
-        d_gate = torch.sigmoid(self.gate_proj(x))
-        attention = torch.matmul(x, self.m_key.transpose(0, 1))
-        attention = F.softmax(attention / math.sqrt(x.shape[-1]), dim=-1)
-        m_gate = torch.matmul(attention, self.m_val)
-        m_gate = torch.sigmoid(m_gate)
-        return 0.5 * (d_gate + m_gate)
-
-class c_gate(nn.Module):
-    def __init__(self, dims, enabled=True):
-        super().__init__()
-        self.enabled = enabled
-        if enabled:
-            self.s_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            self.w_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            self.p_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            self.e_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            self.ph_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-            self.integ = Linear(dims*5, dims)
-
-    def forward(self, x, features):
-        if not self.enabled:
-            return None
-        s_feat = features.get("spectrogram", x)
-        w_feat = features.get("waveform", x)
-        p_feat = features.get("pitch", x)
-        e_feat = features.get("envelope", x)
-        ph_feat = features.get("phase", x)
-        s = self.s_gate(x) * s_feat
-        w = self.w_gate(x) * w_feat
-        p = self.p_gate(x) * p_feat
-        e = self.e_gate(x) * e_feat
-        ph = self.ph_gate(x) * ph_feat
-        comb = torch.cat([s, w, p, e, ph], dim=-1)
-        return self.integ(comb)
-
-class mlp_gate(nn.Module):
-    def __init__(self, dims, head, enabled=True, one_shot=True):
-        super().__init__()
-        self.enabled = enabled
-        if enabled:
-            self.gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-
-    def forward(self, x, xa=None, f=None):
-        if not self.enabled:
-            return None
-        return self.gate(x)
-
-class Residual(nn.Module):
-    _seen = set()
-    def __init__(self, ctx, dims, head, act, debug: List[str] = [],
-                 tgate=True, mgate=False, cgate=False, mem_size=512, features=None, one_shot=False):
-        super().__init__()
-
-        self.dims = dims
-        self.head = head
-        self.ctx = ctx
-        self.head_dim = dims // head
-        self.features = features
-        self.debug = debug
-        self.counter = 0
-        self.dropout = 0.01
-        self.one_shot = one_shot
-
-        self.blend = nn.Parameter(torch.tensor(0.5))
-        act_fn = get_activation(act)
-        self.attn = MultiheadA(dims, head, rotary_emb=True, debug=debug)
-        self.curiosity = curiosity(dims, head)
-
-        if not any([tgate, mgate, cgate]):
-            self.mlp_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-        else:
-            self.mlp_gate = None
-
-        mlp = dims * 4
-        self.mlp = nn.Sequential(Linear(dims, mlp), act_fn, Linear(mlp, dims))
-
-        self.t_gate = t_gate(dims=dims, num_types=4*2, enabled=tgate)
-        self.m_gate = m_gate(dims=dims, mem_size=mem_size, enabled=mgate)
-        self.c_gate = c_gate(dims=dims, enabled=cgate)
-        self.mlp_gate = mlp_gate(dims=dims, head=head, enabled=not any([tgate, mgate, cgate]), one_shot=True)
-
-        self.lna = RMSNorm(dims)
-        self.lnb = RMSNorm(dims)
-        self.lnc = RMSNorm(dims)
-
-    def forward(self, x, xa=None, mask=None, en=None, layer=None, f=None) -> Tensor:
-
-        b = torch.sigmoid(self.blend)
-        ax = x + self.attn(self.lna(x), xa=xa, mask=mask, en=en, layer=layer, f=f)[0]
-        bx = b * ax + (1 - b) * x
-        cx = self.lnb(bx)
-        dx = self.mlp(cx)
-        # fall back to the memory/MLP gates when the type gate is disabled
-        ex = self.t_gate(cx)
-        if ex is None:
-            ex = default(self.m_gate(cx), self.mlp_gate(cx))
-        fx = x + ex + dx
-        gx = self.lnc(fx)
-        return gx
-
-class OneShot(nn.Module):
-    def __init__(self, dims: int, head: int, scale: float = 0.3):
-        super().__init__()
-        self.head = head
-        self.hdim = dims // head
-        self.scale = scale
-        self.q_proj = Linear(dims, dims)
-        self.k_proj = Linear(dims, dims)
-
-    def forward(self, x: Tensor, guide: Tensor, f=None) -> Tensor | None:
-        B, Q, _ = x.shape
-        K = guide.size(1)
-        q = self.q_proj(x).view(B, Q, self.head, self.hdim).transpose(1, 2)
-        k = self.k_proj(guide).view(B, K, self.head, self.hdim).transpose(1, 2)
-        bias = (q @ k.transpose(-1, -2)) * self.scale / math.sqrt(self.hdim)
-        return bias
-
-class curiosity(nn.Module):
-    def __init__(self, d, h, bias=True):
-        super().__init__()
-        self.h = h
-        self.dh = d // h
-        self.qkv = nn.Linear(d, d * 3, bias=bias)
-        self.qkv_aux = nn.Linear(d, d * 3, bias=bias)
-        self.o = nn.Linear(d, d, bias=bias)
-        self.g = nn.Parameter(torch.zeros(h))
-
-    def split(self, x):
-        b, t, _ = x.shape
-        return x.view(b, t, self.h, self.dh).transpose(1, 2)
-
-    def merge(self, x):
-        b, h, t, dh = x.shape
-        return x.transpose(1, 2).contiguous().view(b, t, h * dh)
-
-    def forward(self, x, xa, mask=None):
-        q, k, v = self.qkv(x).chunk(3, -1)
-        qa, ka, va = self.qkv_aux(xa).chunk(3, -1)
-        q, k, v = map(self.split, (q, k, v))
-        qa, ka, va = map(self.split, (qa, ka, va))
-        dots = (q @ k.transpose(-2, -1)) / self.dh**0.5
-        dots_aux = (q @ ka.transpose(-2, -1)) / self.dh**0.5
-        if mask is not None: dots = dots.masked_fill(mask, -9e15)
-        p = dots.softmax(-1)
-        pa = dots_aux.softmax(-1)
-        h_main = p @ v
-        h_aux = pa @ va
-        g = torch.sigmoid(self.g).view(1, -1, 1, 1)
-        out = self.merge(h_main * (1 - g) + h_aux * g)
-        return self.o(out)
-
-class PositionalEncoding(nn.Module):
-    def __init__(self, dims, ctx):
-        super(PositionalEncoding, self).__init__()
-        self.dims = dims
-        self.ctx = ctx
-        self.pe = self.get_positional_encoding(max_ctx=ctx)
-
-    def get_positional_encoding(self, max_ctx):
-        pe = torch.zeros(max_ctx, self.dims)
-        position = torch.arange(0, max_ctx, dtype=torch.float32).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.dims, 2, dtype=torch.float32)
-            * (-math.log(10000.0) / self.dims)
-            )
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0)
-        return pe.to(device)
-
-    def forward(self, x):
-        ctx = x.size(1)
-        pe = self.pe[:, :ctx, :]
-        x = x * math.sqrt(self.dims)
-        x = x + pe
-        return x
-
-class FEncoder(nn.Module):
-    def __init__(self, mels, dims, head, layer, kernel_size, act, stride=1, use_rope=False, spec_shape=None, debug=[]):
-        super().__init__()
-
-        self.head = head
-        self.head_dim = dims // head
-        self.dropout = 0.01
-        self.use_rope = use_rope
-        self.dims = dims
-        self.debug = debug
-        act_fn = get_activation(act)
-        self.attend_pitch = False
-
-        if self.attend_pitch:
-            self.q, self.k, self.v, self.o, self.scale = qkv_init(dims, head)
-            self.mlp = nn.Sequential(
-                nn.Linear(dims, dims),
-                nn.ReLU(),
-                nn.Linear(dims, dims),
-                )
-        else:
-            self.q, self.k, self.v, self.o, self.scale = None, None, None, None, None
-            self.mlp = None
-
-        self.encoder = nn.Sequential(
-            Conv1d(mels, dims, kernel_size=3, stride=1, padding=1), act_fn,
-            Conv1d(dims, dims, kernel_size=3, stride=1, padding=1), act_fn,
-            Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)
-
-        if use_rope:
-            if spec_shape is not None:
-                self.rope = rotary(dims=dims, head=head, radii=False, debug=[], use_pbias=False, axial=False, spec_shape=spec_shape)
-        else:
-            self.rope = None
-            self.positional = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
-        self.norm = RMSNorm(dims)
-
-    def apply_rope_to_features(self, x, en=None, f=None, layer="audio"):
-        batch, ctx, dims = x.shape
-        x = x.view(batch, ctx, self.head, self.head_dim).permute(0, 2, 1, 3)
-        freqs = self.rope(ctx, en=en, f=f, layer=layer)
-        x = self.rope.apply_rotary(x, freqs)
-        x = x.permute(0, 2, 1, 3).contiguous().view(batch, ctx, dims)
-
-        return x
-
-    def forward(self, x: Tensor, en=None, f=None, layer=None):
-        x = self.encoder(x).permute(0, 2, 1)
-        if self.use_rope:
-            x = self.apply_rope_to_features(x, en=en, f=f, layer=layer)
-        else:
-            x = x + self.positional(x.shape[1], x.shape[-1], 10000).to(device, dtype)
-
-        if self.mlp is not None:
-            x = self.mlp(x)
-
-        if self.attend_pitch:
-            xa = en["input_ids"]
-            if xa is not None:
-                q, k, v = create_qkv(self.q, self.k, self.v, x=xa, xa=x, head=self.head)
-                out, _ = calculate_attention(q, k, v, mask=None, temperature=1.0, is_causal=True)
-                out = self.o(out)
-                x = x + out
-
-        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
-        x = self.norm(x)
-        return x
-
-class WEncoder(nn.Module):
-    def __init__(self, input_dims, dims, head, layer, kernel_size, act, use_rope=False, debug=[], spec_shape=None):
-        super().__init__()
-
-        self.head = head
-        self.head_dim = dims // head
-        self.dropout = 0.01
-        self.use_rope = use_rope
-        self.dims = dims
-        self.debug = debug
-        act_fn = get_activation(act)
-        self.target_length = None
-        self.encoder = nn.Sequential(
-            Conv1d(input_dims, dims//4, kernel_size=15, stride=4, padding=7), act_fn,
-            Conv1d(dims//4, dims//2, kernel_size=7, stride=2, padding=3), act_fn,
-            Conv1d(dims//2, dims, kernel_size=5, stride=2, padding=2), act_fn)
-
-        if use_rope:
-            if spec_shape is not None:
-                self.rope = rotary(dims=dims, head=head, radii=False, debug=[], use_pbias=False, axial=False, spec_shape=spec_shape)
-        else:
-            self.rope = None
-            self.positional = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
-        self.norm = RMSNorm(dims)
-
-    def apply_rope_to_features(self, x, en=None, f=None, layer="audio"):
-        batch, ctx, dims = x.shape
-        x = x.view(batch, ctx, self.head, self.head_dim).permute(0, 2, 1, 3)
-        freqs = self.rope(ctx, en=en, f=f, layer=layer)
-        x = self.rope.apply_rotary(x, freqs)
-        x = x.permute(0, 2, 1, 3).contiguous().view(batch, ctx, dims)
-        return x
-
-    def forward(self, x: Tensor, en=None, f=None, layer=None):
-        x = self.encoder(x).permute(0, 2, 1)
-        if self.target_length and x.shape[1] != self.target_length:
-            x = F.adaptive_avg_pool1d(x.transpose(1, 2), self.target_length).transpose(1, 2)
-        if self.use_rope:
-            x = self.apply_rope_to_features(x, en=en, f=f, layer=layer)
-        else:
-            x = x + self.positional(x.shape[1], x.shape[-1], 10000).to(device, dtype)
-        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
-
-        print(f"X: {x.shape} {f}") if "encoder" in self.debug else None
-        return self.norm(x)
-
-class PEncoder(nn.Module):
-    def __init__(self, input_dims, dims, head, layer, kernel_size, act, use_rope=True, debug=[], one_shot=False, spec_shape=None):
-        super().__init__()
-
-        self.head = head
-        self.head_dim = dims // head
-        self.dims = dims
-        self.dropout = 0.01
-        self.use_rope = use_rope
-        self.debug = debug
-        act_fn = get_activation(act)
-
-        self.encoder = nn.Sequential(
-            Conv1d(input_dims, dims, kernel_size=7, stride=1, padding=3), act_fn,
-            Conv1d(dims, dims, kernel_size=5, stride=1, padding=2), act_fn,
-            Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)
-
-        if use_rope:
-            self.rope = rotary(dims=dims, head=head, radii=False, debug=[], use_pbias=False, axial=False, spec_shape=spec_shape)
-        else:
-            self.rope = None
-            self.positional = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
-
-        self.norm = RMSNorm(dims)
-
-    def rope_to_feature(self, x, en=None, f="pitch", layer="PEncoder"):
-        batch, ctx, dims = x.shape
-        x = x.view(batch, ctx, self.head, self.head_dim).permute(0, 2, 1, 3)
-        freqs = self.rope(ctx, en=en, f=f, layer=layer)
-        x = self.rope.apply_rotary(x, freqs)
-        x = x.permute(0, 2, 1, 3).contiguous().view(batch, ctx, dims)
-        return x
-
-    def forward(self, x: Tensor, en=None, f="pitch", layer="PEncoder"):
-
-        if x.dim() == 2:
-            x = x.unsqueeze(0)
-
-        x = self.encoder(x).permute(0, 2, 1)
-        if self.use_rope:
-            x = self.rope_to_feature(x, en=en, f=f, layer=layer)
-        else:
-            x = x + self.positional(x.shape[1], x.shape[-1], 10000).to(device, dtype)
-        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
-        x = self.norm(x)
-        print(f"X: {x.shape} {f}") if "PEncoder" in self.debug else None
-        return x
-
-class theBridge(nn.Module):
-    def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int,
-                 debug: List[str], features: List[str], act: str = "gelu"):
-        super(theBridge, self).__init__()
-
-        tgate = True
-        mgate = False
-        cgate = False
-
-        self.debug = debug
-        self.counter = 0
-        self.dropout = 0.01
-        self.features = features
-        self.do_blend = "no_blend" not in self.debug
-        self.sequential = "sequential" in self.debug
-        self.layer = layer
-
-        self.token = nn.Embedding(vocab, dims, device=device, dtype=dtype)
-        self.positional = nn.Parameter(torch.empty(ctx, dims, device=device, dtype=dtype), requires_grad=True)
-        self.blend = nn.Parameter(torch.tensor(0.5, device=device, dtype=dtype), requires_grad=True)
-        self.norm = RMSNorm(dims)
-        self.sinusoid_pos = lambda length, dims, max_tscale: sinusoids(length, dims, 10000)
-        self.rotary = rotary(dims=dims, head=head, debug=debug, radii=False)
-
-        with torch.no_grad():
-            self.token.weight[0].zero_()
-
-        act_fn = get_activation(act)
-        if features == ["spectrogram", "waveform", "pitch"]:
-            cgate = True
-        else:
-            cgate = False
-
-        self.blockA = nn.ModuleDict()
-        self.blockA["waveform"] = nn.ModuleList(
-            [WEncoder(input_dims=1, dims=dims, head=head, layer=layer, kernel_size=11, act=act_fn)] +
-            [Residual(ctx=ctx, dims=dims, head=head, act=act_fn, tgate=tgate, mgate=mgate, cgate=cgate, debug=debug, features=features)
-            for _ in range(layer)] if "waveform" in features else None)
-
-        for feature_type in ["spectrogram", "aperiodic", "harmonic"]:
-            if feature_type in features:
-                self.blockA[feature_type] = nn.ModuleList(
-                    [FEncoder(mels=mels, dims=dims, head=head, layer=layer, kernel_size=3, act=act_fn)] +
-                    [Residual(ctx=ctx, dims=dims, head=head, act=act_fn, tgate=tgate, mgate=mgate, cgate=cgate, debug=debug, features=features) for _ in range(layer)] if feature_type in features else None)
-            else:
-                self.blockA[feature_type] = None
-
-        for feature_type in ["pitch", "phase"]:
-            if feature_type in features:
-                self.blockA[feature_type] = nn.ModuleList(
-                    [PEncoder(input_dims=1, dims=dims, head=head, layer=layer, kernel_size=9, act=act_fn)] +
-                    [Residual(ctx=ctx, dims=dims, head=head, act=act_fn, tgate=tgate, mgate=mgate, cgate=cgate, debug=debug, features=features) for _ in range(layer)] if feature_type in features else None)
-            else:
-                self.blockA[feature_type] = None
-
-        self.blockB = nn.ModuleList([
-            Residual(ctx=ctx, dims=dims, head=head, act=act_fn, tgate=tgate, mgate=mgate, cgate=cgate, debug=debug, features=features)
-            for _ in range(layer)])
-
-        self.modal = nn.ModuleList([
-            Residual(ctx=ctx, dims=dims, head=head, act=act_fn, tgate=tgate, mgate=mgate, cgate=cgate, debug=debug, features=features)
-            for _ in range(layer)])
-
-        mask = torch.tril(torch.ones(ctx, ctx), diagonal=0)
-        self.register_buffer("mask", mask, persistent=False)
-
-        self.norm = RMSNorm(dims)
-
-    def forward(self, x, xa, en, f, sequential=False) -> Tensor:
-        mask = self.mask[:x.shape[1], :x.shape[1]]
-        x = self.token(x.long()) + self.positional[:x.shape[1]]
-
-        out = {}
-        out["input_ids"] = x
-        out.update(en)
-
-        for b in chain(self.blockA[f] or []):
-            xa = b(x=xa, en=out, f=f, layer="en")
-
-        for b in chain(self.blockB or []):
-            x = b(x=x, xa=None, mask=mask, en=out, f=f, layer="dec")
-            y = b(x, xa=xa, mask=None, en=out, f=f, layer="cross")
-            if sequential:
-                x = y
-            else:
-                a = torch.sigmoid(self.blend)
-                x = a * y + (1 - a) * x
-        for b in self.modal:
-            xc = b(x=torch.cat([x, xa], dim=1), xa=None, mask=None, en=out, f=f, layer="modal")
-            xm = b(x=xc[:, :x.shape[1]], xa=xc[:, x.shape[1]:], mask=None, en=out, f=f, layer="modal")
-            if sequential:
-                x = xm
-            else:
-                a = torch.sigmoid(self.blend)
-                x = a * x + (1 - a) * xm
-
-        if self.counter < 1 and "encoder" in self.debug:
-            shapes = {k: v.shape for k, v in en.items()}
-            print(f"Step {self.counter}: mode: {list(en.keys())}: shapes: {shapes}")
-        self.counter += 1
-
-        x = self.norm(x)
-        x = x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
-
-        return x
-
-class Echo(nn.Module):
-    def __init__(self, param: Dimensions):
-        super().__init__()
-        self.param = param
-
-        self.processor = theBridge(
-            vocab=param.vocab,
-            mels=param.mels,
-            ctx=param.ctx,
-            dims=param.dims,
-            head=param.head,
-            layer=param.layer,
-            features=param.features,
-            act=param.act,
-            debug=param.debug,
-            )
-
-    def forward(self,
-        labels=None,
-        input_ids=None,
-        waveform: Optional[torch.Tensor]=None,
-        spectrogram: Optional[torch.Tensor]=None,
-        pitch: Optional[torch.Tensor]=None,
-        f0: Optional[torch.Tensor]=None,
-        f0t: Optional[torch.Tensor]=None,
-        harmonic: Optional[torch.Tensor]=None,
-        aperiodic: Optional[torch.Tensor]=None,
-        phase: Optional[torch.Tensor]=None,
-        ) -> Dict[str, Optional[torch.Tensor]]:
-
-        en = {}
-        if f0 is not None:
-            en["f0"] = f0
-        if f0t is not None:
-            en["f0t"] = f0t
-        if harmonic is not None:
-            en["harmonic"] = harmonic
-        if aperiodic is not None:
-            en["aperiodic"] = aperiodic
-        if phase is not None:
-            en["phase"] = phase
-        if pitch is not None:
-            en["pitch"] = pitch
-        if waveform is not None:
-            en["waveform"] = waveform
-        if spectrogram is not None:
-            en["spectrogram"] = spectrogram
-
-        x = input_ids
-        for f, xa in en.items():
-            logits = self.processor(x, xa, en, f)
-
-        loss = None
-        if labels is not None:
-            loss = F.cross_entropy(
-                logits.view(-1, logits.shape[-1]), labels.view(-1), ignore_index=0)
-
-        return {"logits": logits, "loss": loss}
-
-    @property
-    def device(self):
-        return next(self.parameters()).device
-    @property
-    def dtype(self):
-        return next(self.parameters()).dtype
-```
 
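For context, the block below is a minimal sketch of how the removed `Echo` model would be wired up, reconstructed from the constructor arguments visible in the diff above. The `Dimensions` config object (assumed here to come from `echoutils`), the concrete hyperparameter values, and the tensor shapes are illustrative assumptions, not part of the original README or this commit.

```python
# Hypothetical usage sketch of the removed Echo model (not part of this commit).
# Field names follow theBridge's constructor; concrete values are illustrative guesses.
import torch
from echoutils import Dimensions  # assumed location of the config dataclass

param = Dimensions(
    vocab=40000, mels=128, ctx=2048, dims=512, head=4, layer=4,
    act="gelu", features=["spectrogram"], debug=[],
)
model = Echo(param).to("cuda" if torch.cuda.is_available() else "cpu")

ids = torch.randint(1, param.vocab, (1, 32), device=model.device)   # decoder token ids
spec = torch.randn(1, param.mels, 300, device=model.device)         # (batch, mels, frames)

out = model(input_ids=ids, labels=ids, spectrogram=spec)
print(out["loss"], out["logits"].shape)
```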