Sin2pi committed on
Commit 80c70a4 · verified · 1 Parent(s): 00f642e

Update modelA.py

Files changed (1)
  1. modelA.py +563 -406
modelA.py CHANGED
@@ -8,10 +8,15 @@ import torchaudio
8
  import torch.nn.functional as F
9
  import torch.nn.init as init
10
  from torch import nn, Tensor
11
- from datasets import load_dataset, Audio
12
- from torch.utils.data import Dataset, DataLoader, random_split
 
13
  import numpy as np
14
- from typing import Optional, Dict, Union, List, Tuple
 
 
 
 
15
  import transformers
16
  from dataclasses import dataclass
17
  from opimizer import MaxFactor
@@ -20,22 +25,44 @@ torch.backends.cudnn.allow_tf32 = True
20
  torch.backends.cuda.matmul.allow_tf32 = True
21
  torch.set_float32_matmul_precision('high')
22
  transformers.utils.logging.set_verbosity_error()
 
23
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
  dtype = torch.float32
 
 
25
  logging.basicConfig(level=logging.ERROR)
26
27
  @dataclass
28
  class Dimensions:
29
  vocab: int
30
- text_ctx: int
31
- text_dims: int
32
- text_head: int
33
- text_idx: int
34
  mels: int
35
- aud_ctx: int
36
- aud_dims: int
37
- aud_head: int
38
- aud_idx: int
39
  act: str
40
  debug: List[str]
41
  cross_attn: bool
@@ -59,6 +86,132 @@ def get_generation_config(param):
59
  use_cache=False,
60
  return_timestamps=False)
61
62
  def dict_to(d, device, dtype=dtype):
63
  return {k: v.to(device, dtype) if isinstance(v, torch.Tensor) else v
64
  for k, v in d.items()}
@@ -100,17 +253,17 @@ class RMSNorm(nn.Module):
100
  self.eps = eps
101
  self.elementwise_affine = elementwise_affine
102
  if self.elementwise_affine:
103
- self.weight = nn.Parameter(torch.empty(self.normalized_shape))
104
  init.ones_(self.weight)
105
  else:
106
  self.register_parameter("weight", None)
107
  def forward(self, x):
108
- return F.rms_norm(x, self.normalized_shape, self.weight, self.eps)
109
 
110
  def LayerNorm(x: Tensor, normalized_shape: Union[int, Tensor, List, Tuple],
111
  weight: Optional[Tensor] = None, bias: Optional[Tensor] = None,
112
  eps: float = 1e-5) -> Tensor:
113
- return F.layer_norm(x, normalized_shape, weight, bias, eps)
114
 
115
  def get_device():
116
  return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -128,9 +281,8 @@ def sinusoids(length, channels, max_tscale=10000):
128
  scaled_t = torch.arange(length)[:, np.newaxis] * inv_tscales[np.newaxis, :]
129
  return torch.cat([torch.sin(scaled_t), torch.cos(scaled_t)], dim=1)
130
 
131
-
132
  class rotary(nn.Module):
133
- def __init__(self, dims, head, max_ctx=1500, theta=10000, radii=True, debug: List[str] = [], use_pbias=False):
134
  super(rotary, self).__init__()
135
 
136
  self.use_pbias = use_pbias
@@ -143,96 +295,17 @@ class rotary(nn.Module):
143
  self.counter = 0
144
  self.last_theta = None
145
 
146
- self.bias = nn.Parameter(torch.zeros(max_ctx, dims // 2))
147
- self.theta = nn.Parameter(torch.tensor(theta, device=device, dtype=dtype), requires_grad=True)
148
-
149
- # def theta_freqs(self, theta):
150
- # freq = (theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
151
- # freqs = nn.Parameter(torch.tensor(freq, device=device, dtype=dtype), requires_grad=True)
152
- # return freqs
153
-
154
- # def mel_geodesic_rotary(f0, theta):
155
- # mel_f0 = 1127.0 * torch.log(1.0 + f0 / 700.0)
156
- # fisher_info = torch.var(mel_f0) + 1e-8
157
- # adaptive_theta = theta * torch.sqrt(fisher_info)
158
- # freqs = self.theta_freqs(adaptive_theta)
159
- # return freqs
160
-
161
- # def compute_pitch_fisher_info(f0, window_size=10):
162
- # if f0.dim() == 1:
163
- # f0 = f0.unsqueeze(0)
164
- # mel_f0 = 1127.0 * torch.log(1.0 + f0 / 700.0)
165
- # fisher_info = torch.nn.functional.avg_pool1d(
166
- # mel_f0.unsqueeze(0),
167
- # kernel_size=window_size,
168
- # stride=1,
169
- # padding=window_size//2
170
- # ).squeeze(0)
171
- # fisher_info = (fisher_info - fisher_info.min()) / (fisher_info.max() - fisher_info.min() + 1e-8)
172
- # return fisher_info
173
-
174
- # def compute_advanced_fisher_info(f0, window_size=10):
175
- # mel_f0 = 1127.0 * torch.log(1.0 + f0 / 700.0)
176
- # local_mean = torch.nn.functional.avg_pool1d(
177
- # mel_f0.unsqueeze(0), window_size, 1, window_size//2
178
- # ).squeeze(0)
179
-
180
- # local_var = torch.nn.functional.avg_pool1d(
181
- # (mel_f0 - local_mean).pow(2).unsqueeze(0),
182
- # window_size, 1, window_size//2
183
- # ).squeeze(0)
184
-
185
- # fisher_info = 1.0 / (local_var + 1e-8)
186
- # return fisher_info
187
-
188
- # def test_fisher_info(self, f0):
189
- # """Test Fisher information computation.""" # fisher_info = self.compute_pitch_fisher_info(f0)
190
-
191
- # print(f"f0 range: {f0.min():.1f} - {f0.max():.1f}")
192
- # print(f"Fisher info range: {fisher_info.min():.3f} - {fisher_info.max():.3f}")
193
- # print(f"Fisher info mean: {fisher_info.mean():.3f}")
194
-
195
- # # Visualize: high Fisher info = meaningful pitch changes
196
- # return fisher_info
197
-
198
- # def forward(self, x=None, enc=None, layer=None, feature_type="audio"):
199
-
200
- # if f0 is not None:
201
- # # Compute Fisher information
202
- # fisher_info = self.compute_pitch_fisher_info(f0)
203
-
204
- # # Use Fisher info to weight pitch influence
205
- # f0_weighted = f0 * fisher_info
206
-
207
- # # Apply to both theta and radius
208
- # f0_mean = f0_weighted.mean()
209
- # theta = f0_mean + self.theta
210
-
211
- # if self.radii:
212
- # radius = f0_weighted.to(device, dtype)
213
-
214
-
215
-
216
- def theta_freqs(self, theta):
217
- freq = (theta / 220.0) * 700 * (torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)), self.dim // 2, device=device, dtype=dtype) / 2595) - 1) / 1000
218
- freqs = nn.Parameter(torch.tensor(freq, device=device, dtype=dtype), requires_grad=True)
219
- return freqs
220
 
221
- def mel_scale_scalar(freq: float) -> float:
222
  return 1127.0 * math.log(1.0 + freq / 700.0)
223
 
224
- def mel_scale(freq: Tensor) -> Tensor:
225
  return 1127.0 * (1.0 + freq / 700.0).log()
226
 
227
- def return_f0(self, f0=None):
228
- if f0 is not None:
229
- self.f0 = f0
230
- self.update_base(f0)
231
- return f0.squeeze(0).to(device, dtype)
232
- elif hasattr(self, 'f0') and self.f0 is not None:
233
- return self.f0.squeeze(0).to(device, dtype)
234
- return None
235
-
236
  def pitch_bias(self, f0):
237
  if f0 is None:
238
  return None
@@ -242,9 +315,31 @@ class rotary(nn.Module):
242
  f0_norm.unsqueeze(1)))
243
  return f0_sim.unsqueeze(0).unsqueeze(0)
244
245
 
246
  def forward(self, x=None, enc=None, layer=None, feature_type="audio") -> Tensor:
247
  f0 = enc.get("f0") if enc is not None else None
 
248
  if isinstance(x, int):
249
  ctx = x
250
  elif isinstance(x, torch.Tensor) and x.ndim == 2:
@@ -252,46 +347,33 @@ class rotary(nn.Module):
252
  elif isinstance(x, torch.Tensor) and x.ndim == 3:
253
  batch, ctx, dims = x.shape
254
  else:
255
- batch, head, ctx, head_dim = x.shape
256
- t = torch.arange(ctx, device=device, dtype=dtype)
257
-
258
- if f0 is not None and f0.dim() == 2:
259
- if f0.shape[0] == 1:
260
- f0 = f0.squeeze(0)
261
- else:
262
- f0 = f0.view(-1)
263
 
264
  if f0 is not None:
265
- f0_mean = f0.mean()
266
- theta = f0_mean + self.theta
 
267
  else:
268
  theta = self.theta
269
 
270
  freqs = self.theta_freqs(theta)
271
-
272
- freqs = t[:, None] * freqs[None, :]
273
-
274
  if self.radii and f0 is not None:
275
  radius = f0.to(device, dtype)
276
- L = radius.shape[0]
277
- if L != ctx:
278
- F = L / ctx
279
- idx = torch.arange(ctx, device=f0.device)
280
- idx = (idx * F).long().clamp(0, L - 1)
281
- radius = radius[idx]
282
- freqs = torch.polar(radius.unsqueeze(-1).expand_as(freqs), freqs)
283
  else:
284
- freqs = torch.polar(torch.ones_like(freqs), freqs)
285
-
286
- if "radius" in self.debug and self.counter % 100 == 0:
287
- theta_value = theta.item() if isinstance(theta, torch.Tensor) else theta
288
- print(f" [{layer}] [Radius] {radius.shape} {radius.mean():.2f} [Theta] {theta_value:.2f} [f0] {f0.shape if f0 is not None else None} [Freqs] {freqs.shape} {freqs.mean():.2f} [ctx] {ctx}")
289
 
290
- if "theta" in self.debug and self.counter % 100 == 0:
291
- if self.last_theta is None or abs(self.last_theta - theta.item()) > 1.0:
292
- self.last_theta = theta.item()
293
- print(f"[Theta] {self.last_theta:.2f}")
294
-
 
 
295
  self.counter += 1
296
  return freqs.unsqueeze(0)
297
 
@@ -308,12 +390,11 @@ class rotary(nn.Module):
308
  x1 = x1.view(orig_shape)
309
  return torch.cat([x1.type_as(x), x2], dim=-1)
310
 
311
-
312
  class MultiheadA(nn.Module):
313
- _seen = set()
314
  rbf = False
315
  def __init__(self, dims: int, head: int, rotary_emb: bool = True,
316
- zero_val: float = 1e-4, minz: float = 1e-6, maxz: float = 1e-3, debug: List[str] = [], optim_attn=False, use_pbias=False):
317
  super(MultiheadA, self).__init__()
318
 
319
  self.dims = dims
@@ -345,8 +426,29 @@ class MultiheadA(nn.Module):
345
  )
346
  else:
347
  self.rope = None
348
 
349
- def forward(self, x: Tensor, xa: Tensor = None, mask: Tensor = None, enc = None, layer = None, feature_type="audio", need_weights=True) -> tuple:
350
 
351
  x = x.to(device, dtype)
352
  if xa is not None:
@@ -365,8 +467,8 @@ class MultiheadA(nn.Module):
365
  q2 = q.shape[2]
366
  k2 = k.shape[2]
367
 
368
- q = self.rope.apply_rotary(q, (self.rope(x=q2, enc=enc, layer=layer)))
369
- k = self.rope.apply_rotary(k, (self.rope(x=k2, enc=enc, layer=layer)))
370
  else:
371
  q = q.view(*q.shape[:2], self.head, -1).permute(0, 2, 1, 3)
372
  k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
@@ -374,14 +476,25 @@ class MultiheadA(nn.Module):
374
 
375
  qk = (q * scale) @ (k * scale).transpose(-1, -2)
376
377
  token_ids = k[:, :, :, 0]
378
  zscale = torch.ones_like(token_ids)
379
  fzero = torch.clamp(F.softplus(self.fzero), self.minz, self.maxz)
380
  zscale[token_ids.float() == self.pad_token] = fzero
381
 
382
  if mask is not None:
383
- mask = mask[:q2, :q2]
384
- qk = qk + mask.unsqueeze(0).unsqueeze(0) * zscale.unsqueeze(-2).expand(qk.shape)
385
  qk = qk * zscale.unsqueeze(-2)
386
  w = F.softmax(qk, dim=-1).to(q.dtype)
387
  wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
@@ -392,8 +505,9 @@ class MultiheadA(nn.Module):
392
  return self.o(wv), qk
393
 
394
  class t_gate(nn.Module):
395
- def __init__(self, dims, num_types=4):
396
  super().__init__()
 
397
  self.gate_projections = nn.ModuleList([
398
  nn.Sequential(Linear(dims, 1), nn.Sigmoid())
399
  for _ in range(num_types)])
@@ -401,19 +515,25 @@ class t_gate(nn.Module):
401
  Linear(dims, num_types),
402
  nn.Softmax(dim=-1))
403
  def forward(self, x):
 
 
404
  type_probs = self.type_classifier(x)
405
  gates = torch.stack([gate(x) for gate in self.gate_projections], dim=-1)
406
  comb_gate = torch.sum(gates * type_probs.unsqueeze(2), dim=-1)
407
  return comb_gate
408
 
409
  class m_gate(nn.Module):
410
- def __init__(self, dims, mem_size=64):
411
  super().__init__()
412
- self.m_key = nn.Parameter(torch.randn(mem_size, dims))
413
- self.m_val = nn.Parameter(torch.randn(mem_size, 1))
414
- self.gate_proj = nn.Sequential(Linear(dims, dims//2), nn.SiLU(), Linear(dims//2, 1))
415
-
 
 
416
  def forward(self, x):
 
 
417
  d_gate = torch.sigmoid(self.gate_proj(x))
418
  attention = torch.matmul(x, self.m_key.transpose(0, 1))
419
  attention = F.softmax(attention / math.sqrt(x.shape[-1]), dim=-1)
@@ -422,16 +542,20 @@ class m_gate(nn.Module):
422
  return 0.5 * (d_gate + m_gate)
423
 
424
  class c_gate(nn.Module):
425
- def __init__(self, dims):
426
  super().__init__()
427
- self.s_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
428
- self.w_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
429
- self.p_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
430
- self.e_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
431
- self.ph_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
432
- self.integ = Linear(dims*5, dims)
 
 
433
 
434
  def forward(self, x, features):
 
 
435
  s_feat = features.get("spectrogram", x)
436
  w_feat = features.get("waveform", x)
437
  p_feat = features.get("pitch", x)
@@ -445,9 +569,21 @@ class c_gate(nn.Module):
445
  comb = torch.cat([s, w, p, e, ph], dim=-1)
446
  return self.integ(comb)
447
448
  class Residual(nn.Module):
449
  _seen = set()
450
- def __init__(self, ctx, dims, head, act, cross_attn=True, debug: List[str] = [],
451
  tgate=True, mgate=False, cgate=False, mem_size=512, features=None):
452
  super().__init__()
453
 
@@ -455,80 +591,44 @@ class Residual(nn.Module):
455
  self.head = head
456
  self.ctx = ctx
457
  self.head_dim = dims // head
458
- self.cross_attn = cross_attn
459
  self.features = features
460
  self.debug = debug
461
  self.counter = 0
462
  self.dropout = 0.01
463
-
464
- self.t_gate = tgate
465
- self.m_gate = mgate
466
- self.c_gate = cgate
467
- self.do_blend = "no_blend" not in self.debug
468
  self.blend = nn.Parameter(torch.tensor(0.5))
469
- self.skip_gates = True if "skip_gates" in self.debug else False
470
-
471
- act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(),
472
- "tanh": nn.Tanh(), "swish": nn.SiLU(), "tanhshrink": nn.Tanhshrink(),
473
- "softplus": nn.Softplus(), "softshrink": nn.Softshrink(),
474
- "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
475
- act_fn = act_map.get(act, nn.GELU())
476
-
477
- self.attna = MultiheadA(dims, head, rotary_emb=True, debug=debug)
478
- self.attnb = (MultiheadA(dims, head, rotary_emb=True, debug=debug) if cross_attn else None)
479
 
480
  mlp = dims * 4
481
  self.mlp = nn.Sequential(Linear(dims, mlp), act_fn, Linear(mlp, dims))
482
 
483
- self.t_gate = t_gate(dims=dims, num_types=4) if tgate else None
484
- self.m_gate = m_gate(dims=dims, mem_size=mem_size) if mgate else None
485
- self.c_gate = c_gate(dims=dims) if cgate else None
 
486
 
487
  self.lna = RMSNorm(dims)
488
- self.lnb = RMSNorm(dims) if cross_attn else None
489
  self.lnc = RMSNorm(dims)
490
 
491
- if not any([t_gate, m_gate, c_gate]):
492
- self.mlp_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
493
-
494
  def forward(self, x, xa=None, mask=None, enc=None, layer=None, feature_type="audio") -> Tensor:
495
-
496
- x = x + self.attna(self.lna(x), xa=None, mask=mask, enc=enc, layer=layer)[0]
497
- xb = x
498
- if self.attnb and xa is not None:
499
- x = x + self.attnb(self.lnb(x), xa=xa, mask=None, enc=enc, layer=layer)[0]
500
-
501
- if self.do_blend:
502
- b = torch.sigmoid(self.blend)
503
- x = b * xb + (1 - b) * x
504
 
505
- if self.skip_gates:
506
- x = x + self.mlp(self.lnc(x))
507
- else:
508
- normx = self.lnc(x)
509
- mlp_out = self.mlp(normx)
510
-
511
- if self.t_gate:
512
- gate = self.t_gate(normx)
513
- x = x + gate * mlp_out
514
-
515
- elif self.m_gate:
516
- gate = self.m_gate(normx)
517
- x = x + gate * mlp_out
518
 
519
- elif self.c_gate:
520
- gate_output = self.c_gate(normx, self.features)
521
- x = x + gate_output
522
-
523
- else:
524
- if hasattr(self, 'mlp_gate'):
525
- mlp_gate = self.mlp_gate(normx)
526
- x = x + mlp_gate * mlp_out
527
- else:
528
- x = x + mlp_out
529
-
530
- return x
531
-
532
  class FEncoder(nn.Module):
533
  def __init__(self, input_dims, dims, head, layer, kernel_size, act, stride=1, use_rope=False, spec_shape=None):
534
  super().__init__()
@@ -539,8 +639,7 @@ class FEncoder(nn.Module):
539
  self.use_rope = use_rope
540
  self.dims = dims
541
 
542
- act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "tanh": nn.Tanh(), "swish": nn.SiLU(), "tanhshrink": nn.Tanhshrink(), "softplus": nn.Softplus(), "softshrink": nn.Softshrink(), "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
543
- act_fn = act_map.get(act, nn.GELU())
544
 
545
  self.encoder = nn.Sequential(
546
  Conv1d(input_dims, dims, kernel_size=kernel_size, stride=stride, padding=kernel_size//2), act_fn,
@@ -551,11 +650,13 @@ class FEncoder(nn.Module):
551
  if spec_shape is not None:
552
  self.rope = rotary(
553
  dims=self.head_dim,
 
554
  use_2d_axial=True,
555
  spec_shape=spec_shape, debug=[])
556
  else:
557
  self.rope = rotary(
558
  dims=self.head_dim,
 
559
  use_2d_axial=False, debug=[])
560
  else:
561
  self.rope = None
@@ -569,7 +670,7 @@ class FEncoder(nn.Module):
569
  feature_type = "spectrogram"
570
  batch, ctx, dims = x.shape
571
  x = x.view(batch, ctx, self.head, self.head_dim).permute(0, 2, 1, 3)
572
- if feature_type == "spectrogram" and hasattr(self.rope, 'use_2d_axial') and self.rope.use_2d_axial:
573
  rope_freqs = self.rope(ctx, layer=layer, input_type="spectrogram")
574
  else:
575
  rope_freqs = self.rope(ctx, layer=layer, input_type="audio")
@@ -597,8 +698,7 @@ class WEncoder(nn.Module):
597
  self.use_rope = use_rope
598
  self.dims = dims
599
 
600
- act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "tanh": nn.Tanh(), "swish": nn.SiLU(), "tanhshrink": nn.Tanhshrink(), "softplus": nn.Softplus(), "softshrink": nn.Softshrink(), "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
601
- act_fn = act_map.get(act, nn.GELU())
602
 
603
  self.downsample = nn.Sequential(
604
  Conv1d(input_dims, dims//8, kernel_size=15, stride=8, padding=7), act_fn,
@@ -611,8 +711,8 @@ class WEncoder(nn.Module):
611
  if use_rope:
612
  self.rope = rotary(
613
  dims=self.head_dim,
614
- use_2d_axial=False,
615
- theta=50.0, debug=[])
616
  else:
617
  self.rope = None
618
  self.positional = lambda length: sinusoids(length, dims)
@@ -649,8 +749,7 @@ class PEncoder(nn.Module):
649
  self.use_rope = use_rope
650
  self.dims = dims
651
 
652
- act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "tanh": nn.Tanh(), "swish": nn.SiLU(), "tanhshrink": nn.Tanhshrink(), "softplus": nn.Softplus(), "softshrink": nn.Softshrink(), "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
653
- act_fn = act_map.get(act, nn.GELU())
654
 
655
  self.encoder = nn.Sequential(
656
  Conv1d(input_dims, dims//4, kernel_size=7, stride=8, padding=3), act_fn,
@@ -660,8 +759,8 @@ class PEncoder(nn.Module):
660
  if use_rope:
661
  self.rope = rotary(
662
  dims=self.head_dim,
663
- use_2d_axial=False,
664
- theta=100.0, debug=[])
665
  else:
666
  self.rope = None
667
  self.positional = lambda length: sinusoids(length, dims)
@@ -687,10 +786,10 @@ class PEncoder(nn.Module):
687
  x = self.norm(x)
688
  return x
689
 
690
- class AudioEncoder(nn.Module):
691
  _seen = set()
692
- def __init__(self, mels: int, ctx: int, dims: int, head: int, layer: int, debug: List[str], features: List[str], act: str = "gelu"):
693
- super(AudioEncoder, self).__init__()
694
 
695
  self.dims = dims
696
  self.head = head
@@ -700,9 +799,12 @@ class AudioEncoder(nn.Module):
700
  self.counter = 0
701
  self.features = features
702
  self.dropout = 0.01
 
 
703
 
704
- act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "tanh": nn.Tanh(), "swish": nn.SiLU(),"tanhshrink": nn.Tanhshrink(), "softplus": nn.Softplus(), "softshrink": nn.Softshrink(), "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
705
- act_fn = act_map.get(act, nn.GELU())
 
706
 
707
  if features == ["spectrogram", "waveform", "pitch"]:
708
  cgate=True
@@ -737,80 +839,55 @@ class AudioEncoder(nn.Module):
737
  if "phase" in features else None),
738
  })
739
 
740
- def forward(self, enc, layer="encoder"):
741
- enc = dict_to(enc, device, dtype)
742
- out = {}
743
- out.update(enc)
744
-
745
- for f in self.features:
746
- if f in enc and f in self.blocks:
747
- x = enc[f]
748
- for block in self.blocks[f]:
749
- x = block(x, enc=enc, layer=layer)
750
- out[f] = x
751
-
752
- return out
753
-
754
- class TextDecoder(nn.Module):
755
- def __init__(self, vocab: int, ctx: int, dims: int, head: int, layer: int, cross_attn: bool,
756
- debug: List[str], features: List[str]):
757
- super(TextDecoder, self).__init__()
758
-
759
- self.ctx = ctx
760
- self.dims = dims
761
- self.head = head
762
- self.head_dim = dims // head
763
- self.debug = debug
764
- self.counter = 0
765
- self.dropout = 0.01
766
- self.features = features
767
- self.do_blend = "no_blend" not in self.debug
768
- self.sequential = "sequential" in self.debug
769
-
770
- self.token = nn.Embedding(num_embeddings=vocab, embedding_dim=dims)
771
- with torch.no_grad():
772
- self.token.weight[0].zero_()
773
- self.positional = nn.Parameter(data=torch.empty(ctx, dims), requires_grad=True)
774
-
775
  self.block = nn.ModuleList([
776
- Residual(ctx=ctx, dims=dims, head=head, act="gelu", cross_attn=cross_attn, debug=debug, features=features)
777
- for _ in range(layer)])
778
-
779
- self.blocks = nn.ModuleDict({
780
- f: nn.ModuleList([Residual(ctx=ctx, dims=dims, head=head, act="gelu", cross_attn=cross_attn, debug=debug, features=features)
781
- for _ in range(layer)]) for f in features})
782
 
783
- self.blend = nn.ParameterDict({f: nn.Parameter(torch.tensor(0.5)) for f in features})
784
  self.ln_dec = RMSNorm(dims)
785
 
 
 
 
 
 
 
 
786
  mask = torch.tril(torch.ones(ctx, ctx), diagonal=0)
787
  self.register_buffer("mask", mask, persistent=False)
788
 
789
- def forward(self, x, enc, order=None, layer='decoder') -> Tensor:
 
790
 
791
- if order is None:
792
- order = self.features
793
-
794
- mask = self.mask[:x.shape[1], :x.shape[1]]
795
  x = self.token(x) + self.positional[:x.shape[1]]
796
  x = F.dropout(x, p=self.dropout, training=self.training)
797
-
 
799
  for block in self.block:
 
800
  x = block(x, xa=None, mask=mask, enc=None, layer=layer)
801
 
802
- for f in order:
803
  if f in enc:
804
- xa = enc[f]
805
- for block in self.blocks[f]:
806
- out = block(x=x, xa=xa, mask=None, enc=None, layer=layer)
807
-
808
  if self.sequential:
809
  x = out
810
  else:
811
- a = torch.sigmoid(self.blend[f])
812
  x = a * out + (1 - a) * x
813
-
814
 
815
  x = self.ln_dec(x)
816
  return x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
@@ -820,38 +897,28 @@ class Echo(nn.Module):
820
  super().__init__()
821
  self.param = param
822
 
823
- self.encoder = AudioEncoder(
 
824
  mels=param.mels,
825
- ctx=param.aud_ctx,
826
- dims=param.aud_dims,
827
- head=param.aud_head,
828
- layer=param.aud_idx,
829
- act=param.act,
830
  debug=param.debug,
831
  features=param.features,
 
832
  )
833
 
834
- self.decoder = TextDecoder(
835
- vocab=param.vocab,
836
- ctx=param.text_ctx,
837
- dims=param.text_dims,
838
- head=param.text_head,
839
- layer=param.text_idx,
840
- cross_attn=param.cross_attn,
841
- debug=param.debug,
842
- features=param.features,
843
- )
844
-
845
  def forward(self,
846
  labels=None,
847
- waveform: Optional[torch.Tensor]=None,
848
  input_ids=None,
849
- spectrogram: torch.Tensor=None,
 
850
  pitch: Optional[torch.Tensor]=None,
851
  f0: Optional[torch.Tensor]=None,
852
  envelope: Optional[torch.Tensor]=None,
853
  phase: Optional[torch.Tensor]=None,
854
- ) -> Dict[str, torch.Tensor]:
855
 
856
  encoder_inputs = {}
857
  if spectrogram is not None:
@@ -866,9 +933,10 @@ class Echo(nn.Module):
866
  encoder_inputs["phase"] = phase
867
  if f0 is not None:
868
  encoder_inputs["f0"] = f0
 
 
869
 
870
- encoder_outputs = self.encoder(encoder_inputs)
871
- logits = self.decoder(input_ids, encoder_outputs)
872
 
873
  loss = None
874
  if labels is not None:
@@ -888,7 +956,7 @@ class Echo(nn.Module):
888
  std = 0.02
889
  self.init_counts = {
890
  "Linear": 0, "Conv1d": 0, "LayerNorm": 0, "RMSNorm": 0,
891
- "Conv2d": 0, "SEBlock": 0, "TextDecoder": 0, "AudioEncoder": 0,
892
  "Residual": 0, "MultiheadA": 0, "MultiheadB - Cross Attention": 0,
893
  "MultiheadC": 0, "MultiheadD": 0, "FEncoder": 0,
894
  "WEncoder": 0, "PEncoder": 0}
@@ -914,12 +982,9 @@ class Echo(nn.Module):
914
  nn.init.zeros_(module.bias)
915
  self.init_counts["Conv2d"] += 1
916
  elif isinstance(module, MultiheadA):
917
-
918
  self.init_counts["MultiheadA"] += 1
919
- elif isinstance(module, TextDecoder):
920
- self.init_counts["TextDecoder"] += 1
921
- elif isinstance(module, AudioEncoder):
922
- self.init_counts["AudioEncoder"] += 1
923
  elif isinstance(module, Residual):
924
  self.init_counts["Residual"] += 1
925
 
@@ -957,10 +1022,11 @@ class Echo(nn.Module):
957
  encoder_inputs["phase"] = phase
958
  if f0 is not None:
959
  encoder_inputs["f0"] = f0
960
- encoder_outputs = self.encoder(encoder_inputs)
961
  for i in range(max_length - 1):
962
  with torch.no_grad():
963
- logits = self.decoder(ids, encoder_outputs)
 
964
  next_token_logits = logits[:, -1, :]
965
  if i < min_length:
966
  next_token_logits[:, eos_token_id] = 0
@@ -985,10 +1051,9 @@ class Echo(nn.Module):
985
  })
986
  return Config()
987
 
988
-
989
- def setup_tokenizer(token: str, local_tokenizer_path: str = "./"):
990
  from tokenizers import Tokenizer
991
- tokenizer = Tokenizer.from_file(f"{local_tokenizer_path}/tokenizer.json")
992
  orig_encode = tokenizer.encode
993
  def enc(text, add_special_tokens=True):
994
  ids = orig_encode(text).ids
@@ -1005,6 +1070,11 @@ def setup_tokenizer(token: str, local_tokenizer_path: str = "./"):
1005
  ids = ids[1:]
1006
  while ids and ids[-1] in [0, 2]:
1007
  ids = ids[:-1]
1008
  results.append(tokenizer.decode(ids))
1009
  return results
1010
 
@@ -1019,95 +1089,165 @@ def setup_tokenizer(token: str, local_tokenizer_path: str = "./"):
1019
  tokenizer.eos_token_id = 2
1020
  return tokenizer
1021
 
1022
- def extract_features(batch, tokenizer, sample_rate=16000, n_mels=128, n_fft=1024, hop_length=256):
1023
  audio = batch["audio"]
1024
- waveform = torch.tensor(audio["array"]).float()
1025
- if waveform.dim() == 2:
1026
- waveform = waveform.mean(dim=0)
1027
-
1028
- # mel_spectrogram = transform(wav)
1029
- # log_mel = torch.clamp(mel_spectrogram, min=1e-10).log10()
1030
- # log_mel = torch.maximum(log_mel, log_mel.max() - 8.0)
1031
- # spec = (log_mel + 4.0) / 4.0
1032
- # spec = torch.tensor(spec)
1033
-
1034
- mel = torchaudio.transforms.MelSpectrogram(
1035
- sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
1036
- )
1037
- spec = mel(waveform)
1038
- spec = torch.clamp(spec, min=1e-10).log10()
1039
- spec = torch.tensor(spec) if not isinstance(spec, torch.Tensor) else spec
1040
- wav_np = waveform.numpy().astype(np.float64)
 
1041
  f0, t = pw.dio(wav_np, sample_rate, frame_period=hop_length/sample_rate*1000)
1042
  f0 = pw.stonemask(wav_np, f0, t, sample_rate)
1043
- f0 = torch.from_numpy(f0).float()
1044
- transcription = batch.get("sentence", batch.get("transcription", ""))
1045
- input_ids = tokenizer.encode(transcription)
 
1046
  return {
1047
  "spectrogram": spec,
1048
  "f0": f0,
1049
- "input_ids": input_ids,
1050
- "labels": input_ids,
 
1051
  }
1052
 
1053
- def prepare_datasets(tokenizer, token: str, sample_rate=16000, n_mels=128, n_fft=1024, hop_length=256):
1054
- raw_train = load_dataset(
1055
- "google/fleurs", "en_us", token=token, split="train[:1000]", trust_remote_code=True
1056
- )
1057
- raw_test = load_dataset(
1058
- "google/fleurs", "en_us", token=token, split="test[:100]", trust_remote_code=True
1059
- )
1060
- raw_train = raw_train.cast_column("audio", Audio(sampling_rate=sample_rate))
1061
- raw_test = raw_test.cast_column("audio", Audio(sampling_rate=sample_rate))
1062
- train_dataset = raw_train.map(
1063
- lambda x: extract_features(x, tokenizer, sample_rate, n_mels, n_fft, hop_length),
1064
- remove_columns=raw_train.column_names
1065
- )
1066
- test_dataset = raw_test.map(
1067
- lambda x: extract_features(x, tokenizer, sample_rate, n_mels, n_fft, hop_length),
1068
- remove_columns=raw_test.column_names
1069
- )
1070
- return train_dataset, test_dataset
1071
 
1072
  @dataclass
1073
  class DataCollator:
1074
  tokenizer: Any
1075
 
1076
  def __call__(self, features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
 
 
 
 
1077
  pad_token_id = getattr(self.tokenizer, 'pad_token_id', 0)
1078
  bos_token_id = getattr(self.tokenizer, 'bos_token_id', 1)
1079
  eos_token_id = getattr(self.tokenizer, 'eos_token_id', 2)
1080
 
1081
- # Gather and pad spectrograms and f0
1082
- specs = [f["spectrogram"] for f in features]
1083
- f0s = [f["f0"] for f in features]
1084
- specs = [torch.tensor(s) if not isinstance(s, torch.Tensor) else s for s in specs]
1085
- f0s = [torch.tensor(f0) if not isinstance(f0, torch.Tensor) else f0 for f0 in f0s]
1086
- max_spec_len = max(s.shape[-1] for s in specs)
1087
- max_f0_len = max(f0.shape[-1] for f0 in f0s)
1088
- padded_specs = torch.stack([
1089
- torch.nn.functional.pad(s, (0, max_spec_len - s.shape[-1])) for s in specs
1090
- ])
1091
- padded_f0s = torch.stack([
1092
- torch.nn.functional.pad(f0, (0, max_f0_len - f0.shape[-1])) for f0 in f0s
1093
- ])
1094
-
1095
- input_ids_list = [f["input_ids"] for f in features]
1096
- # Ensure all are lists, not tensors
1097
- input_ids_list = [ids.tolist() if isinstance(ids, torch.Tensor) else ids for ids in input_ids_list]
1098
- max_len = max(len(ids) for ids in input_ids_list)
1099
- # Add BOS to input_ids, EOS to labels, pad both to max_len+1
1100
- input_ids = [[bos_token_id] + ids + [pad_token_id] * (max_len - len(ids)) for ids in input_ids_list]
1101
- labels = [ids + [eos_token_id] + [pad_token_id] * (max_len - len(ids)) for ids in input_ids_list]
1102
- input_ids = torch.tensor(input_ids, dtype=torch.long)
1103
- labels = torch.tensor(labels, dtype=torch.long)
1104
-
1105
- return {
1106
- "spectrogram": padded_specs,
1107
- "f0": padded_f0s,
1108
- "input_ids": input_ids,
1109
- "labels": labels,
1110
- }
1111
 
1112
  def levenshtein(reference_words, hypothesis_words):
1113
  m, n = len(reference_words), len(hypothesis_words)
@@ -1137,7 +1277,7 @@ def wer_batch(references, hypotheses):
1137
  total_words += len(ref_words)
1138
  return (total_errors / total_words) * 100 if total_words > 0 else 0.0
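Illustrative sketch (not from the diff): a quick sanity check of the WER helpers above, assuming the full levenshtein/wer_batch bodies from modelA.py are in scope.
refs = ["the cat sat on the mat"]
hyps = ["the cat sat on mat"]
print(wer_batch(refs, hyps))  # one deleted word over six reference words -> ~16.67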
1139
 
1140
- def compute_metrics(pred, tokenizer=None, model=None, print_pred=False, num_samples=0):
1141
  pred_ids = pred.predictions
1142
  label_ids = pred.label_ids
1143
  if isinstance(pred_ids, tuple):
@@ -1146,21 +1286,25 @@ def compute_metrics(pred, tokenizer=None, model=None, print_pred=False, num_samp
1146
  if not isinstance(pred_ids, torch.Tensor):
1147
  pred_ids = torch.tensor(pred_ids)
1148
  pred_ids = pred_ids.argmax(dim=-1)
 
1149
  pred_ids = pred_ids.tolist()
1150
  label_ids = label_ids.tolist()
1151
  pad_token_id = tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else 0
1152
  label_ids = [[pad_token_id if token == -100 else token for token in seq] for seq in label_ids]
1153
- def strip_trailing(seq, pad_token_id):
1154
- while seq and seq[-1] == pad_token_id:
1155
- seq = seq[:-1]
1156
- return seq
1157
- pred_ids = [strip_trailing(seq, pad_token_id) for seq in pred_ids]
1158
- label_ids = [strip_trailing(seq, pad_token_id) for seq in label_ids]
1159
  if print_pred:
1160
  for i in range(min(num_samples, len(pred_ids))):
1161
- print(f"Pred: '{tokenizer.batch_decode([pred_ids[i]])[0]}'")
1162
- print(f"Label: '{tokenizer.batch_decode([label_ids[i]])[0]}'")
 
1163
  print("-" * 40)
 
1164
  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
1165
  label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
1166
  wer = wer_batch(label_str, pred_str)
@@ -1170,9 +1314,9 @@ def compute_metrics(pred, tokenizer=None, model=None, print_pred=False, num_samp
1170
  else:
1171
  trainable_params = 0.0
1172
  efficiency_score = 0.0
 
1173
  return {
1174
  "wer": float(wer),
1175
- "trainable_params_M": float(trainable_params),
1176
  "efficiency_score": float(efficiency_score),
1177
  }
1178
 
@@ -1183,10 +1327,13 @@ def main():
1183
  tokenizer = setup_tokenizer(token)
1184
  train_dataset, test_dataset = prepare_datasets(tokenizer, token)
1185
  param = Dimensions(
1186
- mels=128, aud_ctx=1500, aud_head=4, aud_dims=512, aud_idx=4,
1187
- vocab=40000, text_ctx=512, text_head=4, text_dims=512, text_idx=4,
1188
- act="swish", debug={"radius"}, cross_attn=True, features=["spectrogram"]
1189
- )
 
 
 
1190
  model = Echo(param).to('cuda')
1191
  print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
1192
  print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
@@ -1202,7 +1349,6 @@ def main():
1202
  logging_steps=10,
1203
  logging_dir=log_dir,
1204
  eval_strategy="steps",
1205
-
1206
  save_strategy="steps",
1207
  report_to=["tensorboard"],
1208
  push_to_hub=False,
@@ -1214,17 +1360,28 @@ def main():
1214
  batch_eval_metrics=False,
1215
  )
1216
  from functools import partial
1217
- metrics_fn = partial(compute_metrics, print_pred=True, num_samples=2, tokenizer=tokenizer, model=model)
1218
  trainer = Seq2SeqTrainer(
1219
  args=training_args,
1220
  model=model,
1221
- train_dataset=train_dataset,
1222
- eval_dataset=test_dataset,
1223
- data_collator=DataCollator(tokenizer=tokenizer),
1224
  compute_metrics=metrics_fn,
 
1225
  )
1226
  model.init_weights()
1227
  trainer.train()
1228
 
1229
  if __name__ == "__main__":
1230
- main()
 
 
8
  import torch.nn.functional as F
9
  import torch.nn.init as init
10
  from torch import nn, Tensor
11
+
12
+ import matplotlib.pyplot as plt
13
+ from typing import Optional, Dict, Union, List, Tuple, Any
14
  import numpy as np
15
+ from functools import partial
16
+ from datetime import datetime
17
+ from datasets import load_dataset, Audio
18
+ from transformers.trainer_seq2seq import Seq2SeqTrainer
19
+ from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
20
  import transformers
21
  from dataclasses import dataclass
22
  from opimizer import MaxFactor
 
25
  torch.backends.cuda.matmul.allow_tf32 = True
26
  torch.set_float32_matmul_precision('high')
27
  transformers.utils.logging.set_verbosity_error()
28
+
29
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
30
  dtype = torch.float32
31
+
32
+ warnings.filterwarnings("ignore")
33
  logging.basicConfig(level=logging.ERROR)
34
 
35
+ PATH = 'E:/hf'
36
+ os.environ['HF_HOME'] = PATH
37
+ os.environ['HF_DATASETS_CACHE'] = PATH
38
+ os.environ['TORCH_HOME'] = PATH
39
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
40
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
41
+
42
+ def get_activation(act: str) -> nn.Module:
43
+ """Get activation function by name."""
44
+ act_map = {
45
+ "gelu": nn.GELU(),
46
+ "relu": nn.ReLU(),
47
+ "sigmoid": nn.Sigmoid(),
48
+ "tanh": nn.Tanh(),
49
+ "swish": nn.SiLU(),
50
+ "tanhshrink": nn.Tanhshrink(),
51
+ "softplus": nn.Softplus(),
52
+ "softshrink": nn.Softshrink(),
53
+ "leaky_relu": nn.LeakyReLU(),
54
+ "elu": nn.ELU()
55
+ }
56
+ return act_map.get(act, nn.GELU())
57
+
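Illustrative sketch (not from the diff): the new get_activation factory replaces the act_map dictionaries previously duplicated in each encoder; unknown names fall back to GELU.
act_fn = get_activation("swish")      # nn.SiLU()
fallback = get_activation("unknown")  # nn.GELU()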
58
  @dataclass
59
  class Dimensions:
60
  vocab: int
61
+ ctx: int
62
+ dims: int
63
+ head: int
64
+ layer: int
65
  mels: int
 
 
 
 
66
  act: str
67
  debug: List[str]
68
  cross_attn: bool
 
86
  use_cache=False,
87
  return_timestamps=False)
88
 
89
+ def plot_waveform(x=None, w=None, p=None, per=None, sample_idx=0, sr=16000, hop_length=160,
90
+ title="", markers=None, marker_labels=None,
91
+ show_voiced_regions=True, show_energy=False):
92
+ num_plots = sum([x is not None, w is not None, p is not None, per is not None])
93
+ if num_plots == 0:
94
+ raise ValueError("No data to plot. Please provide at least one input tensor.")
95
+ t_spans = []
96
+
97
+ if w is not None:
98
+ w_np = w[sample_idx].detach().cpu().numpy()
99
+ if w_np.ndim > 1:
100
+ w_np = w_np.squeeze()
101
+ t_spans.append(len(w_np) / sr)
102
+ if x is not None:
103
+ x_np = x[sample_idx].detach().cpu().numpy()
104
+ if x_np.shape[0] < x_np.shape[1]:
105
+ x_np = x_np.T
106
+ t_spans.append(x_np.shape[0] * hop_length / sr)
107
+ if p is not None:
108
+ p_np = p[sample_idx].detach().cpu().numpy()
109
+ if p_np.ndim > 1:
110
+ p_np = p_np.squeeze()
111
+ t_spans.append(len(p_np) * hop_length / sr)
112
+ if per is not None:
113
+ per_np = per[sample_idx].detach().cpu().numpy()
114
+ if per_np.ndim > 1:
115
+ per_np = per_np.squeeze()
116
+ t_spans.append(len(per_np) * hop_length / sr)
117
+ max_t = max(t_spans) if t_spans else 0
118
+ fig, axs = plt.subplots(num_plots, 1, figsize=(14, 4*num_plots), sharex=True)
119
+ if num_plots == 1:
120
+ axs = [axs]
121
+ if show_voiced_regions and per is not None:
122
+ per_np = per[sample_idx].detach().cpu().numpy()
123
+ if per_np.ndim > 1:
124
+ per_np = per_np.squeeze()
125
+ t_per = np.arange(len(per_np)) * hop_length / sr
126
+ threshold = 0.5
127
+ for ax in axs:
128
+ for i in range(len(per_np)-1):
129
+ if per_np[i] > threshold:
130
+ ax.axvspan(t_per[i], t_per[i+1], color='lightblue', alpha=0.2, zorder=0)
131
+ cu_ax = 0
132
+ if w is not None:
133
+ w_np = w[sample_idx].detach().cpu().numpy()
134
+ if w_np.ndim > 1:
135
+ w_np = w_np.squeeze()
136
+ t = np.arange(len(w_np)) / sr
137
+ axs[cu_ax].plot(t, w_np, color="tab:blue")
138
+
139
+ if show_energy:
140
+ frame_length = hop_length
141
+ hop_length_energy = hop_length // 2
142
+ energy = []
143
+ for i in range(0, len(w_np)-frame_length, hop_length_energy):
144
+ frame = w_np[i:i+frame_length]
145
+ energy.append(np.sqrt(np.mean(frame**2)))
146
+ energy = np.array(energy)
147
+ energy = energy / np.max(energy) * 0.8 * max(abs(w_np.min()), abs(w_np.max()))
148
+ t_energy = np.arange(len(energy)) * hop_length_energy / sr
149
+ axs[cu_ax].plot(t_energy, energy, color="red", alpha=0.7, label="Energy")
150
+ axs[cu_ax].legend(loc='upper right')
151
+ axs[cu_ax].set_title("Waveform")
152
+ axs[cu_ax].set_ylabel("Amplitude")
153
+ axs[cu_ax].set_xlim([0, max_t])
154
+ axs[cu_ax].grid(True, axis='x', linestyle='--', alpha=0.3)
155
+ cu_ax += 1
156
+
157
+ if x is not None:
158
+ x_np = x[sample_idx].detach().cpu().numpy()
159
+ if x_np.shape[0] < x_np.shape[1]:
160
+ x_np = x_np.T
161
+ axs[cu_ax].imshow(x_np.T, aspect="auto", origin="lower", cmap="magma",
162
+ extent=[0, x_np.shape[0]*hop_length/sr, 0, x_np.shape[1]])
163
+ axs[cu_ax].set_title("Spectrogram")
164
+ axs[cu_ax].set_ylabel("Mel Bin")
165
+ axs[cu_ax].set_xlim([0, max_t])
166
+ axs[cu_ax].grid(True, axis='x', linestyle='--', alpha=0.3)
167
+ cu_ax += 1
168
+
169
+ if p is not None:
170
+ p_np = p[sample_idx].detach().cpu().numpy()
171
+ if p_np.ndim > 1:
172
+ p_np = p_np.squeeze()
173
+ t_p = np.arange(len(p_np)) * hop_length / sr
174
+ axs[cu_ax].plot(t_p, p_np, color="tab:green")
175
+ axs[cu_ax].set_title("Pitch")
176
+ axs[cu_ax].set_ylabel("Frequency (Hz)")
177
+ axs[cu_ax].set_xlim([0, max_t])
178
+ axs[cu_ax].grid(True, axis='both', linestyle='--', alpha=0.3)
179
+ axs[cu_ax].set_ylim([0, min(1000, p_np.max() * 1.2)])
180
+ cu_ax += 1
181
+
182
+ if per is not None:
183
+ per_np = per[sample_idx].detach().cpu().numpy()
184
+ if per_np.ndim > 1:
185
+ per_np = per_np.squeeze()
186
+ t_per = np.arange(len(per_np)) * hop_length / sr
187
+ axs[cu_ax].plot(t_per, per_np, color="tab:red")
188
+ axs[cu_ax].set_title("Period (Voice Activity)")
189
+ axs[cu_ax].set_ylabel("periodocity")
190
+ axs[cu_ax].set_xlim([0, max_t])
191
+ axs[cu_ax].grid(True, axis='both', linestyle='--', alpha=0.3)
192
+ axs[cu_ax].set_ylim([-0.05, 1.05])
193
+ axs[cu_ax].axhline(y=0.5, color='k', linestyle='--', alpha=0.3)
194
+
195
+ if markers is not None:
196
+ for i, t in enumerate(markers):
197
+ label = marker_labels[i] if marker_labels and i < len(marker_labels) else None
198
+ for ax in axs:
199
+ ax.axvline(x=t, color='k', linestyle='-', alpha=0.7, label=label if i == 0 else None)
200
+ if marker_labels:
201
+ axs[0].legend(loc='upper right', fontsize='small')
202
+ axs[-1].set_xlabel("t (s)")
203
+ fig.suptitle(title, fontsize=16)
204
+ plt.tight_layout(rect=[0, 0, 1, 0.97]) # type: ignore
205
+ plt.show()
206
+ return fig
207
+
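Illustrative sketch (not from the diff): calling plot_waveform with dummy batch-first tensors; the shapes below are assumptions chosen to match the [sample_idx] indexing used above.
w = torch.randn(1, 32000)          # 2 s of audio at 16 kHz
x = torch.randn(1, 128, 200)       # mel spectrogram, [batch, mels, frames]
p = torch.rand(1, 200) * 300       # per-frame pitch in Hz
fig = plot_waveform(x=x, w=w, p=p, sr=16000, hop_length=160, title="sample")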
208
+ def valid(default_value, *items):
209
+ """Get first non-None item"""
210
+ for item in items:
211
+ if item is not None:
212
+ return item
213
+ return default_value
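Illustrative sketch (not from the diff): valid() returns the first non-None item, falling back to its first argument.
assert valid(0, None, 5) == 5
assert valid(0, None, None) == 0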
214
+
215
  def dict_to(d, device, dtype=dtype):
216
  return {k: v.to(device, dtype) if isinstance(v, torch.Tensor) else v
217
  for k, v in d.items()}
 
253
  self.eps = eps
254
  self.elementwise_affine = elementwise_affine
255
  if self.elementwise_affine:
256
+ self.weight = nn.Parameter(torch.empty(self.normalized_shape)) # type: ignore
257
  init.ones_(self.weight)
258
  else:
259
  self.register_parameter("weight", None)
260
  def forward(self, x):
261
+ return F.rms_norm(x, self.normalized_shape, self.weight, self.eps) # type: ignore
262
 
263
  def LayerNorm(x: Tensor, normalized_shape: Union[int, Tensor, List, Tuple],
264
  weight: Optional[Tensor] = None, bias: Optional[Tensor] = None,
265
  eps: float = 1e-5) -> Tensor:
266
+ return F.layer_norm(x, normalized_shape, weight, bias, eps) # type: ignore
267
 
268
  def get_device():
269
  return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
281
  scaled_t = torch.arange(length)[:, np.newaxis] * inv_tscales[np.newaxis, :]
282
  return torch.cat([torch.sin(scaled_t), torch.cos(scaled_t)], dim=1)
283
 
 
284
  class rotary(nn.Module):
285
+ def __init__(self, dims, head, max_ctx=1500, radii=True, debug: List[str] = [], use_pbias=False):
286
  super(rotary, self).__init__()
287
 
288
  self.use_pbias = use_pbias
 
295
  self.counter = 0
296
  self.last_theta = None
297
 
298
+ self.bias = nn.Parameter(torch.zeros(max_ctx, dims // 2), requires_grad=True if use_pbias else False)
299
+ theta = (torch.tensor(10000, device=device, dtype=dtype))
300
+ self.theta = nn.Parameter(theta, requires_grad=True)
301
+ self.theta_values = []
302
 
303
+ def mel_scale_scalar(self, freq: float) -> float:
304
  return 1127.0 * math.log(1.0 + freq / 700.0)
305
 
306
+ def mel_scale(self, freq: Tensor) -> Tensor:
307
  return 1127.0 * (1.0 + freq / 700.0).log()
308
309
  def pitch_bias(self, f0):
310
  if f0 is None:
311
  return None
 
315
  f0_norm.unsqueeze(1)))
316
  return f0_sim.unsqueeze(0).unsqueeze(0)
317
 
318
+ def theta_freqs(self, theta):
319
+ if theta.dim() == 0:
320
+ theta = theta.unsqueeze(0)
321
+ freq = (theta.unsqueeze(-1) / 220.0) * 700 * (
322
+ torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)),
323
+ self.dim // 2, device=theta.device, dtype=theta.dtype) / 2595) - 1) / 1000
324
+
325
+ return freq
326
+
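Illustrative note (not from the diff): theta_freqs lays the dim // 2 rotary frequencies out on a mel-like curve and scales the whole bank by theta / 220, so a larger theta (e.g. per-frame f0 added to the base) raises every frequency proportionally. Roughly, for bin k of K = dim // 2:
# freq_k = (theta / 220) * 700 * (10 ** (k / (K - 1) * log10(1 + 8000 / 700)) - 1) / 1000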
327
+ def _apply_radii(self, freqs, f0, ctx):
328
+ if self.radii and f0 is not None:
329
+ radius = f0.to(device, dtype)
330
+ L = radius.shape[0]
331
+ if L != ctx:
332
+ F = L / ctx
333
+ idx = torch.arange(ctx, device=f0.device)
334
+ idx = (idx * F).long().clamp(0, L - 1)
335
+ radius = radius[idx]
336
+ return torch.polar(radius.unsqueeze(-1), freqs)
337
+ else:
338
+ return torch.polar(torch.ones_like(freqs), freqs)
339
 
340
  def forward(self, x=None, enc=None, layer=None, feature_type="audio") -> Tensor:
341
  f0 = enc.get("f0") if enc is not None else None
342
+
343
  if isinstance(x, int):
344
  ctx = x
345
  elif isinstance(x, torch.Tensor) and x.ndim == 2:
 
347
  elif isinstance(x, torch.Tensor) and x.ndim == 3:
348
  batch, ctx, dims = x.shape
349
  else:
350
+ batch, head, ctx, head_dim = x.shape # type: ignore
351
 
352
  if f0 is not None:
353
+ if f0.dim() == 2:
354
+ f0 = f0.squeeze(0)
355
+ theta = f0 + self.theta
356
  else:
357
  theta = self.theta
358
 
359
  freqs = self.theta_freqs(theta)
360
+ t = torch.arange(ctx, device=device, dtype=dtype)
361
+ freqs = t[:, None] * freqs
362
+
363
  if self.radii and f0 is not None:
364
  radius = f0.to(device, dtype)
365
+ freqs = torch.polar(radius.unsqueeze(-1), freqs)
366
  else:
367
+ radius = torch.ones_like(freqs)
368
+ freqs = torch.polar(radius, freqs)
 
 
 
369
 
370
+ if "radius" in self.debug and self.counter == 10:
371
+ theta_value = theta.mean()
372
+ radius_shape = radius.shape if 'radius' in locals() else "N/A"
373
+ radius_mean = radius.mean() if 'radius' in locals() else 0.0
374
+ print(f" [{layer}] [Radius] {radius_shape} {radius_mean:.2f} [Theta] {theta_value:.2f} [f0] {f0.shape if f0 is not None else None} [Freqs] {freqs.shape} {freqs.mean():.2f} [ctx] {ctx}")
375
+ print(f" [{layer}] [Radius] {radius}")
376
+ # self.theta_values.append(theta.item())
377
  self.counter += 1
378
  return freqs.unsqueeze(0)
379
 
 
390
  x1 = x1.view(orig_shape)
391
  return torch.cat([x1.type_as(x), x2], dim=-1)
392
 
 
393
  class MultiheadA(nn.Module):
394
+
395
  rbf = False
396
  def __init__(self, dims: int, head: int, rotary_emb: bool = True,
397
+ zero_val: float = 1e-7, minz: float = 1e-8, maxz: float = 1e-6, debug: List[str] = [], optim_attn=False, use_pbias=False):
398
  super(MultiheadA, self).__init__()
399
 
400
  self.dims = dims
 
426
  )
427
  else:
428
  self.rope = None
429
+
430
+ def cos_sim(self, q: Tensor, k: Tensor, v: Tensor, mask) -> Tensor:
431
+ q_norm = torch.nn.functional.normalize(q, dim=-1, eps=1e-12)
432
+ k_norm = torch.nn.functional.normalize(k, dim=-1, eps=1e-12)
433
+ qk_cosine = torch.matmul(q_norm, k_norm.transpose(-1, -2))
434
+ qk_cosine = qk_cosine + mask
435
+ weights = F.softmax(qk_cosine, dim=-1)
436
+ out = torch.matmul(weights, v)
437
+ return out
438
+
439
+ def rbf_scores(self, q, k, rbf_sigma=1.0, rbf_ratio=0.0):
440
+ scale = (self.dims // self.head) ** -0.25
441
+ dot_scores = torch.matmul(q, k.transpose(-1, -2)) * scale
442
+ if rbf_ratio <= 0.0:
443
+ return dot_scores
444
+ q_norm = q.pow(2).sum(dim=-1, keepdim=True)
445
+ k_norm = k.pow(2).sum(dim=-1, keepdim=True)
446
+ qk = torch.matmul(q, k.transpose(-1, -2))
447
+ dist_sq = q_norm + k_norm.transpose(-1, -2) - 2 * qk
448
+ rbf_scores = torch.exp(-dist_sq / (2 * rbf_sigma**2))
449
+ return (1 - rbf_ratio) * dot_scores + rbf_ratio * rbf_scores
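Illustrative note (not from the diff): rbf_scores blends scaled dot-product scores with a Gaussian (RBF) kernel on the pairwise squared distance, roughly scores = (1 - r) * (q @ k.T) * scale + r * exp(-dist_sq / (2 * sigma**2)) with r = rbf_ratio and sigma = rbf_sigma; rbf_ratio=0.0 recovers plain dot-product attention, and the class-level rbf flag gates whether forward() uses it.
scores = attn.rbf_scores(q, k, rbf_sigma=1.0, rbf_ratio=0.3)  # attn is a MultiheadA instance (assumed)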
450
 
451
+ def forward(self, x: Tensor, xa: Optional[Tensor] = None, mask: Optional[Tensor] = None, enc = None, layer = None, feature_type="audio", need_weights=True) -> tuple:
452
 
453
  x = x.to(device, dtype)
454
  if xa is not None:
 
467
  q2 = q.shape[2]
468
  k2 = k.shape[2]
469
 
470
+ q = self.rope.apply_rotary(q, (self.rope(x=q2, enc=enc, layer=layer))) # type: ignore
471
+ k = self.rope.apply_rotary(k, (self.rope(x=k2, enc=enc, layer=layer))) # type: ignore
472
  else:
473
  q = q.view(*q.shape[:2], self.head, -1).permute(0, 2, 1, 3)
474
  k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
 
476
 
477
  qk = (q * scale) @ (k * scale).transpose(-1, -2)
478
 
479
+ if self.rbf:
480
+ qk = self.rbf_scores(q * scale, k * scale, rbf_sigma=1.0, rbf_ratio=0.3)
481
+ if self.use_pbias:
482
+ pbias = self.rope.pitch_bias(f0 = enc.get("f0", None) if enc is not None else None) # type: ignore
483
+ if pbias is not None:
484
+ qk = qk + pbias[:,:,:q2,:q2]
485
+
486
  token_ids = k[:, :, :, 0]
487
  zscale = torch.ones_like(token_ids)
488
  fzero = torch.clamp(F.softplus(self.fzero), self.minz, self.maxz)
489
  zscale[token_ids.float() == self.pad_token] = fzero
490
 
491
  if mask is not None:
492
+ # mask = mask[:q2, :q2]#torch.tril(torch.ones(q2, q2, device=q.device))
493
+ # audio_mask = torch.ones(q2, k2 - q2, device=q.device)
494
+ # mask = torch.cat([mask, audio_mask], dim=-1)
495
+ mask = mask.unsqueeze(0).unsqueeze(0)
496
+ qk = qk + mask * zscale.unsqueeze(-2).expand(qk.shape)
497
+
498
  qk = qk * zscale.unsqueeze(-2)
499
  w = F.softmax(qk, dim=-1).to(q.dtype)
500
  wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
 
505
  return self.o(wv), qk
506
 
507
  class t_gate(nn.Module):
508
+ def __init__(self, dims, num_types=4, enabled=True):
509
  super().__init__()
510
+ self.enabled = enabled
511
  self.gate_projections = nn.ModuleList([
512
  nn.Sequential(Linear(dims, 1), nn.Sigmoid())
513
  for _ in range(num_types)])
 
515
  Linear(dims, num_types),
516
  nn.Softmax(dim=-1))
517
  def forward(self, x):
518
+ if not self.enabled:
519
+ return None
520
  type_probs = self.type_classifier(x)
521
  gates = torch.stack([gate(x) for gate in self.gate_projections], dim=-1)
522
  comb_gate = torch.sum(gates * type_probs.unsqueeze(2), dim=-1)
523
  return comb_gate
524
 
525
  class m_gate(nn.Module):
526
+ def __init__(self, dims, mem_size=64, enabled=True):
527
  super().__init__()
528
+ self.enabled = enabled
529
+ if enabled:
530
+ self.m_key = nn.Parameter(torch.randn(mem_size, dims))
531
+ self.m_val = nn.Parameter(torch.randn(mem_size, 1))
532
+ self.gate_proj = nn.Sequential(Linear(dims, dims//2), nn.SiLU(), Linear(dims//2, 1))
533
+
534
  def forward(self, x):
535
+ if not self.enabled:
536
+ return None
537
  d_gate = torch.sigmoid(self.gate_proj(x))
538
  attention = torch.matmul(x, self.m_key.transpose(0, 1))
539
  attention = F.softmax(attention / math.sqrt(x.shape[-1]), dim=-1)
 
542
  return 0.5 * (d_gate + m_gate)
543
 
544
  class c_gate(nn.Module):
545
+ def __init__(self, dims, enabled=True):
546
  super().__init__()
547
+ self.enabled = enabled
548
+ if enabled:
549
+ self.s_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
550
+ self.w_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
551
+ self.p_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
552
+ self.e_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
553
+ self.ph_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
554
+ self.integ = Linear(dims*5, dims)
555
 
556
  def forward(self, x, features):
557
+ if not self.enabled:
558
+ return None
559
  s_feat = features.get("spectrogram", x)
560
  w_feat = features.get("waveform", x)
561
  p_feat = features.get("pitch", x)
 
569
  comb = torch.cat([s, w, p, e, ph], dim=-1)
570
  return self.integ(comb)
571
 
572
+ class mlp_gate(nn.Module):
573
+ def __init__(self, dims, enabled=True):
574
+ super().__init__()
575
+ self.enabled = enabled
576
+ if enabled:
577
+ self.gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
578
+
579
+ def forward(self, x):
580
+ if not self.enabled:
581
+ return None
582
+ return self.gate(x)
583
+
584
  class Residual(nn.Module):
585
  _seen = set()
586
+ def __init__(self, ctx, dims, head, act, debug: List[str] = [],
587
  tgate=True, mgate=False, cgate=False, mem_size=512, features=None):
588
  super().__init__()
589
 
 
591
  self.head = head
592
  self.ctx = ctx
593
  self.head_dim = dims // head
 
594
  self.features = features
595
  self.debug = debug
596
  self.counter = 0
597
  self.dropout = 0.01
598
+
 
 
 
 
599
  self.blend = nn.Parameter(torch.tensor(0.5))
600
+ act_fn = get_activation(act)
601
+ self.attn = MultiheadA(dims, head, rotary_emb=True, debug=debug)
602
+
603
+ if not any([tgate, mgate, cgate]):
604
+ self.mlp_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
605
+ else:
606
+ self.mlp_gate = None
 
 
 
607
 
608
  mlp = dims * 4
609
  self.mlp = nn.Sequential(Linear(dims, mlp), act_fn, Linear(mlp, dims))
610
 
611
+ self.t_gate = t_gate(dims=dims, num_types=4*2, enabled=tgate)
612
+ self.m_gate = m_gate(dims=dims, mem_size=mem_size, enabled=mgate)
613
+ self.c_gate = c_gate(dims=dims, enabled=cgate)
614
+ self.mlp_gate = mlp_gate(dims=dims, enabled=not any([tgate, mgate, cgate]))
615
 
616
  self.lna = RMSNorm(dims)
617
+ self.lnb = RMSNorm(dims)
618
  self.lnc = RMSNorm(dims)
619
 
 
 
 
620
  def forward(self, x, xa=None, mask=None, enc=None, layer=None, feature_type="audio") -> Tensor:
621
 
622
+ b = torch.sigmoid(self.blend)
623
+ ax = x + self.attn(self.lna(x), xa=xa, mask=mask, enc=enc, layer=layer)[0]
624
+ bx = b * ax + (1 - b) * x
625
+ cx = self.lnb(bx)
626
+ dx = self.mlp(cx)
627
+ ex = valid(self.mlp_gate(cx), self.t_gate(cx), self.m_gate(cx))  # first non-None gate output (t_gate, then m_gate), falling back to mlp_gate
628
+ fx = x + ex * dx  # gate scales the MLP output, as in the per-gate branches this forward replaces
629
+ gx = self.lnc(fx)
630
+ return gx
631
632
  class FEncoder(nn.Module):
633
  def __init__(self, input_dims, dims, head, layer, kernel_size, act, stride=1, use_rope=False, spec_shape=None):
634
  super().__init__()
 
639
  self.use_rope = use_rope
640
  self.dims = dims
641
 
642
+ act_fn = get_activation(act)
 
643
 
644
  self.encoder = nn.Sequential(
645
  Conv1d(input_dims, dims, kernel_size=kernel_size, stride=stride, padding=kernel_size//2), act_fn,
 
650
  if spec_shape is not None:
651
  self.rope = rotary(
652
  dims=self.head_dim,
653
+ head=self.head,
654
  use_2d_axial=True,
655
  spec_shape=spec_shape, debug=[])
656
  else:
657
  self.rope = rotary(
658
  dims=self.head_dim,
659
+ head=self.head,
660
  use_2d_axial=False, debug=[])
661
  else:
662
  self.rope = None
 
670
  feature_type = "spectrogram"
671
  batch, ctx, dims = x.shape
672
  x = x.view(batch, ctx, self.head, self.head_dim).permute(0, 2, 1, 3)
673
+ if feature_type == "spectrogram" and self.rope is not None:
674
  rope_freqs = self.rope(ctx, layer=layer, input_type="spectrogram")
675
  else:
676
  rope_freqs = self.rope(ctx, layer=layer, input_type="audio")
 
698
  self.use_rope = use_rope
699
  self.dims = dims
700
 
701
+ act_fn = get_activation(act)
 
702
 
703
  self.downsample = nn.Sequential(
704
  Conv1d(input_dims, dims//8, kernel_size=15, stride=8, padding=7), act_fn,
 
711
  if use_rope:
712
  self.rope = rotary(
713
  dims=self.head_dim,
714
+ head=self.head,
715
+ debug=[])
716
  else:
717
  self.rope = None
718
  self.positional = lambda length: sinusoids(length, dims)
 
749
  self.use_rope = use_rope
750
  self.dims = dims
751
 
752
+ act_fn = get_activation(act)
 
753
 
754
  self.encoder = nn.Sequential(
755
  Conv1d(input_dims, dims//4, kernel_size=7, stride=8, padding=3), act_fn,
 
759
  if use_rope:
760
  self.rope = rotary(
761
  dims=self.head_dim,
762
+ head=self.head,
763
+ debug=[])
764
  else:
765
  self.rope = None
766
  self.positional = lambda length: sinusoids(length, dims)
 
786
  x = self.norm(x)
787
  return x
788
 
789
+ class SpeechTransformer(nn.Module):
790
  _seen = set()
791
+ def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int, debug: List[str], features: List[str], act: str = "gelu"):
792
+ super(SpeechTransformer, self).__init__()
793
 
794
  self.dims = dims
795
  self.head = head
 
799
  self.counter = 0
800
  self.features = features
801
  self.dropout = 0.01
802
+ self.sequential = "sequential" in debug
803
+ act_fn = get_activation(act)
804
 
805
+ self.token = nn.Embedding(vocab, dims, device=device, dtype=dtype)
806
+ self.positional = nn.Parameter(torch.empty(ctx, dims, device=device, dtype=dtype), requires_grad=True)
807
+ self.register_buffer("audio_embedding", sinusoids(ctx, dims))
808
 
809
  if features == ["spectrogram", "waveform", "pitch"]:
810
  cgate=True
 
839
  if "phase" in features else None),
840
  })
841
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
842
  self.block = nn.ModuleList([
843
+ Residual(ctx=ctx, dims=dims, head=head, act="gelu", debug=debug, features=features)
844
+ for _ in range(layer)])
 
 
 
 
+        self.blend = nn.Parameter(torch.tensor(0.5))
         self.ln_dec = RMSNorm(dims)

+        def get_mask(text_ctx, aud_ctx):
+            mask = torch.tril(torch.ones(text_ctx, text_ctx, device=device), diagonal=0)
+            audio_mask = torch.ones(text_ctx, aud_ctx - text_ctx, device=device)
+            full_mask = torch.cat([mask, audio_mask], dim=-1)
+            return full_mask
+        self.register_buffer("mask_ax", get_mask(ctx, ctx), persistent=False)
+
         mask = torch.tril(torch.ones(ctx, ctx), diagonal=0)
         self.register_buffer("mask", mask, persistent=False)

+    def forward(self, enc, layer="encoder"):
+        enc = dict_to(enc, device, dtype)

+        x = enc.get("input_ids").long()

         x = self.token(x) + self.positional[:x.shape[1]]
         x = F.dropout(x, p=self.dropout, training=self.training)
+
+        out = {}
+        out.update(enc)
+
+        for f in self.features:
+            if f in enc and f in self.blocks:
+                xa = enc[f]
+                for block in self.blocks[f]:  # type: ignore
+                    xa = block(xa, enc=enc, layer=layer)
+                out[f] = xa
+                xa = xa + self.audio_embedding[:xa.shape[1]]

         for block in self.block:
+            mask = self.mask[:x.shape[1], :x.shape[1]]
             x = block(x, xa=None, mask=mask, enc=None, layer=layer)

+        for f in self.features:
             if f in enc:
+                mask = self.mask_ax[:x.shape[1], :xa.shape[1]]
+                for block in self.block:
+                    out = block(x, xa=xa, mask=mask, enc=None, layer=layer)

                 if self.sequential:
                     x = out
                 else:
+                    a = torch.sigmoid(self.blend)
                     x = a * out + (1 - a) * x

         x = self.ln_dec(x)
         return x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
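# Illustrative sketch (editorial, not part of this diff): how the blended decoder
# update above behaves, and a minimal call. The feature dict shape is assumed to
# match the DataCollator output further down; parameter values are taken from main().
# With blend initialised at 0.5, sigmoid(0.5) ~= 0.62, so the gated update is
#     x = 0.62 * cross_attn_out + 0.38 * x
# Example usage:
#     model = SpeechTransformer(vocab=40000, mels=128, ctx=2048, dims=512,
#                               head=4, layer=4, debug=[], features=["spectrogram"])
#     logits = model({"input_ids": ids, "spectrogram": spec})   # (batch, text_len, vocab)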
 
         super().__init__()
         self.param = param

+        self.SpeechTransformer = SpeechTransformer(
+            vocab=param.vocab,
             mels=param.mels,
+            ctx=param.ctx,
+            dims=param.dims,
+            head=param.head,
+            layer=param.layer,
             debug=param.debug,
             features=param.features,
+            act=param.act,
             )

     def forward(self,
         labels=None,
         input_ids=None,
+        waveform: Optional[torch.Tensor]=None,
+        spectrogram: Optional[torch.Tensor]=None,
         pitch: Optional[torch.Tensor]=None,
         f0: Optional[torch.Tensor]=None,
         envelope: Optional[torch.Tensor]=None,
         phase: Optional[torch.Tensor]=None,
+        ) -> Dict[str, Optional[torch.Tensor]]:

         encoder_inputs = {}
         if spectrogram is not None:

             encoder_inputs["phase"] = phase
         if f0 is not None:
             encoder_inputs["f0"] = f0
+        if input_ids is not None:
+            encoder_inputs["input_ids"] = input_ids

+        logits = self.SpeechTransformer(encoder_inputs)

         loss = None
         if labels is not None:
 
         std = 0.02
         self.init_counts = {
             "Linear": 0, "Conv1d": 0, "LayerNorm": 0, "RMSNorm": 0,
+            "Conv2d": 0, "SEBlock": 0, "SpeechTransformer": 0,
             "Residual": 0, "MultiheadA": 0, "MultiheadB - Cross Attention": 0,
             "MultiheadC": 0, "MultiheadD": 0, "FEncoder": 0,
             "WEncoder": 0, "PEncoder": 0}
 
                 nn.init.zeros_(module.bias)
                 self.init_counts["Conv2d"] += 1
             elif isinstance(module, MultiheadA):
                 self.init_counts["MultiheadA"] += 1
+            elif isinstance(module, SpeechTransformer):
+                self.init_counts["SpeechTransformer"] += 1
             elif isinstance(module, Residual):
                 self.init_counts["Residual"] += 1
 
 
             encoder_inputs["phase"] = phase
         if f0 is not None:
             encoder_inputs["f0"] = f0
+
         for i in range(max_length - 1):
             with torch.no_grad():
+                encoder_inputs["input_ids"] = ids
+                logits = self.SpeechTransformer(encoder_inputs)
                 next_token_logits = logits[:, -1, :]
                 if i < min_length:
                     next_token_logits[:, eos_token_id] = 0
 
             })
         return Config()

+def setup_tokenizer(token: str):
     from tokenizers import Tokenizer
+    tokenizer = Tokenizer.from_file("./tokenizer.json")
     orig_encode = tokenizer.encode
     def enc(text, add_special_tokens=True):
         ids = orig_encode(text).ids

             ids = ids[1:]
             while ids and ids[-1] in [0, 2]:
                 ids = ids[:-1]
+
+            if isinstance(ids, (torch.Tensor, np.ndarray)):
+                ids = ids.tolist()
             results.append(tokenizer.decode(ids))
         return results

     tokenizer.eos_token_id = 2
     return tokenizer
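# Illustrative sketch (editorial, not part of this diff): expected round-trip of the
# wrapped tokenizer, assuming ./tokenizer.json exists and pad=0, bos=1, eos=2 as set above.
#     tokenizer = setup_tokenizer(token="")
#     ids = tokenizer.encode("hello world")       # special tokens added by the patched encode
#     text = tokenizer.batch_decode([ids])[0]     # leading/trailing 0, 1, 2 stripped before decode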
 
+def load_wave(wave_data, sample_rate):
+    if isinstance(wave_data, str):
+        waveform, sr = torchaudio.load(uri=wave_data, normalize=False)
+    elif isinstance(wave_data, dict):
+        waveform = torch.tensor(data=wave_data["array"]).float()
+        sr = wave_data["sampling_rate"]
+    else:
+        raise TypeError("Invalid wave_data format.")
+
+    return waveform
+
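# Editorial note (sketch, not part of this diff): load_wave reads `sr` but never
# resamples to `sample_rate` before returning. A minimal fix, if resampling is wanted,
# using torchaudio.functional.resample:
#     if sr != sample_rate:
#         waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=sample_rate)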
+def extract_features(batch, tokenizer, sample_rate=16000, hop_length=256, **dataset_config):
+
     audio = batch["audio"]
+    sr = audio["sampling_rate"]
+    wav = load_wave(wave_data=audio, sample_rate=sr)
+
+    dataset_config = {
+        "hop_length": 256,
+        "f_min": 150,
+        "f_max": 2000,
+        "n_mels": 128,
+        "n_fft": 1024,
+        "sample_rate": 16000,
+        "pad_mode": "constant",
+        "center": True,
+        "power": 1.0,
+        "window_fn": torch.hann_window,
+        "mel_scale": "htk",
+        "norm": None,
+        "normalized": False}
+
+    transform = torchaudio.transforms.MelSpectrogram(
+        **dataset_config
+    )
+
+    mel_spectrogram = transform(wav)
+    log_mel = torch.clamp(mel_spectrogram, min=1e-10).log10()
+    log_mel = torch.maximum(log_mel, log_mel.max() - 8.0)
+    spec = (log_mel + 4.0) / 4.0
+    spec = torch.tensor(spec)
+    # batch["spectrogram"] = spec
+
+    wav_np = wav.numpy().astype(np.float64)
     f0, t = pw.dio(wav_np, sample_rate, frame_period=hop_length/sample_rate*1000)
     f0 = pw.stonemask(wav_np, f0, t, sample_rate)
+    f0 = torch.from_numpy(f0)
+
+    labels = tokenizer.encode(batch["transcription"])
+
     return {
         "spectrogram": spec,
         "f0": f0,
+        "labels": labels,
+        # "waveform": wav,
+        # "pitch": f0,
     }
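# Editorial note (sketch, not part of this diff): `pw` is assumed to be pyworld,
# imported near the top of the file. The spectrogram normalisation above follows the
# Whisper-style recipe:
#     log_mel = clamp(mel, min=1e-10).log10()
#     log_mel = max(log_mel, log_mel.max() - 8.0)   # cap dynamic range at 8 log10 units
#     spec    = (log_mel + 4.0) / 4.0               # roughly rescales values into [0, 1]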
 
+def prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, **dataset_config):
+
+    if sanity_check:
+        test = load_dataset(
+            "google/fleurs", "en_us", token=token, split="test[:10]", trust_remote_code=True
+        ).cast_column("audio", Audio(sampling_rate=sample_rate))
+
+        dataset = test.map(
+            lambda x: extract_features(x, tokenizer, **dataset_config),
+            remove_columns=test.column_names)
+        dataset = dataset.with_format(type="torch")
+        train_dataset = dataset
+        test_dataset = dataset
+    else:
+
+        cache_dir = "./processed_datasets"
+        os.makedirs(cache_dir, exist_ok=True)
+        cache_file_train = os.path.join(cache_dir, "train.arrow")
+        cache_file_test = os.path.join(cache_dir, "test.arrow")
+
+        if os.path.exists(cache_file_train) and os.path.exists(cache_file_test):
+            from datasets import Dataset
+            train_dataset = Dataset.load_from_disk(cache_file_train)
+            test_dataset = Dataset.load_from_disk(cache_file_test)
+            return train_dataset, test_dataset
+
+        def filter_func(x):
+            return (0 < len(x["transcription"]) < 512 and
+                    len(x["audio"]["array"]) > 0 and
+                    len(x["audio"]["array"]) < 1500 * 160)
+
+        raw_train = load_dataset(
+            "google/fleurs", "en_us", token=token, split="train[:1000]", trust_remote_code=True)
+        raw_test = load_dataset(
+            "google/fleurs", "en_us", token=token, split="test[:100]", trust_remote_code=True)
+
+        raw_train = raw_train.filter(filter_func)
+        raw_test = raw_test.filter(filter_func)
+
+        raw_train = raw_train.cast_column("audio", Audio(sampling_rate=sample_rate))
+        raw_test = raw_test.cast_column("audio", Audio(sampling_rate=sample_rate))
+
+        train_dataset = raw_train.map(
+            lambda x: extract_features(x, tokenizer, **dataset_config),
+            remove_columns=raw_train.column_names)
+        test_dataset = raw_test.map(
+            lambda x: extract_features(x, tokenizer, **dataset_config),
+            remove_columns=raw_test.column_names)
+
+        train_dataset.save_to_disk(cache_file_train)
+        test_dataset.save_to_disk(cache_file_test)
+    return train_dataset, test_dataset
 
 @dataclass
 class DataCollator:
     tokenizer: Any

     def __call__(self, features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
+        all_keys = set()
+        for f in features:
+            all_keys.update(f.keys())
+        batch = {}
         pad_token_id = getattr(self.tokenizer, 'pad_token_id', 0)
         bos_token_id = getattr(self.tokenizer, 'bos_token_id', 1)
         eos_token_id = getattr(self.tokenizer, 'eos_token_id', 2)

+        for key in all_keys:
+            if key == "labels":
+                labels_list = [f["labels"] for f in features]
+                max_len = max(len(l) for l in labels_list)
+                all_ids, all_labels = [], []
+                for label in labels_list:
+                    label_list = label.tolist() if isinstance(label, torch.Tensor) else label
+                    decoder_input = [bos_token_id] + label_list
+                    label_eos = label_list + [eos_token_id]
+                    input_len = max_len + 1 - len(decoder_input)
+                    label_len = max_len + 1 - len(label_eos)
+                    padded_input = decoder_input + [pad_token_id] * input_len
+                    padded_labels = label_eos + [pad_token_id] * label_len
+                    all_ids.append(padded_input)
+                    all_labels.append(padded_labels)
+                batch["input_ids"] = torch.tensor(all_ids, dtype=torch.long)
+                batch["labels"] = torch.tensor(all_labels, dtype=torch.long)
+
+            elif key in ["spectrogram", "waveform", "pitch", "f0", "envelope", "phase"]:
+
+                items = [f[key] for f in features if key in f]
+                items = [torch.tensor(item) if not isinstance(item, torch.Tensor) else item for item in items]
+                max_len = max(item.shape[-1] for item in items)
+                padded = []
+                for item in items:
+                    pad_width = max_len - item.shape[-1]
+                    if pad_width > 0:
+                        pad_item = F.pad(item, (0, pad_width), mode='constant', value=pad_token_id)
+                    else:
+                        pad_item = item
+                    padded.append(pad_item)
+                batch[key] = torch.stack(padded)
+                if key == "spectrogram":
+                    batch["spectrogram"] = batch[key]
+        return batch
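# Illustrative sketch (editorial, not part of this diff): minimal use of the collator
# with a PyTorch DataLoader, assuming `train_dataset` and `tokenizer` come from the
# helpers above.
#     from torch.utils.data import DataLoader
#     loader = DataLoader(train_dataset, batch_size=2,
#                         collate_fn=DataCollator(tokenizer=tokenizer))
#     batch = next(iter(loader))   # keys: input_ids, labels, spectrogram, f0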
 
 def levenshtein(reference_words, hypothesis_words):
     m, n = len(reference_words), len(hypothesis_words)

         total_words += len(ref_words)
     return (total_errors / total_words) * 100 if total_words > 0 else 0.0
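# Editorial note (not part of this diff): wer_batch returns word error rate as a
# percentage, i.e. 100 * total word-level edit distance / total reference words.
# Example: ref "the cat sat" vs hyp "the cat sat down" -> 1 insertion / 3 words = 33.3%.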
 
+def compute_metrics(pred, tokenizer=None, model=None, print_pred=False, num_samples=0, optimizer=None, scheduler=None):
     pred_ids = pred.predictions
     label_ids = pred.label_ids
     if isinstance(pred_ids, tuple):

     if not isinstance(pred_ids, torch.Tensor):
         pred_ids = torch.tensor(pred_ids)
     pred_ids = pred_ids.argmax(dim=-1)
+
     pred_ids = pred_ids.tolist()
     label_ids = label_ids.tolist()
     pad_token_id = tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else 0
     label_ids = [[pad_token_id if token == -100 else token for token in seq] for seq in label_ids]
+
     if print_pred:
         for i in range(min(num_samples, len(pred_ids))):
+
+            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=False)
+            label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
+
+            print(f"Pred tokens: {pred_ids[i]}")
+            print(f"Label tokens: {label_ids[i]}")
+            print(f"Pred: '{pred_str[i]}'")
+            print(f"Label: '{label_str[i]}'")
+
             print("-" * 40)
+
     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
     label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
     wer = wer_batch(label_str, pred_str)

     else:
         trainable_params = 0.0
         efficiency_score = 0.0
+
     return {
         "wer": float(wer),
         "efficiency_score": float(efficiency_score),
     }
 
 
     tokenizer = setup_tokenizer(token)
     train_dataset, test_dataset = prepare_datasets(tokenizer, token)
     param = Dimensions(
+        vocab=40000, ctx=2048, dims=512, head=4, layer=4,
+        mels=128, act="swish",
+        debug=[],
+        cross_attn=True,
+        features=["spectrogram"]
+        )
+
     model = Echo(param).to('cuda')
     print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
     print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

         logging_steps=10,
         logging_dir=log_dir,
         eval_strategy="steps",
         save_strategy="steps",
         report_to=["tensorboard"],
         push_to_hub=False,

         batch_eval_metrics=False,
     )
     from functools import partial
+    metrics_fn = partial(compute_metrics,
+        print_pred=True,
+        num_samples=1,
+        tokenizer=tokenizer, model=model)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00025, eps=1e-8, weight_decay=0.025, betas=(0.9, 0.999),
+        amsgrad=False, foreach=False, fused=False, capturable=False, differentiable=False, maximize=False)
+
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=training_args.max_steps, eta_min=1e-9, last_epoch=-1)
+
     trainer = Seq2SeqTrainer(
         args=training_args,
         model=model,
+        train_dataset=train_dataset,  # type: ignore
+        eval_dataset=test_dataset,  # type: ignore
+        data_collator=DataCollator(tokenizer=tokenizer),  # type: ignore
         compute_metrics=metrics_fn,
+        optimizers=(optimizer, scheduler)  # type: ignore
     )
     model.init_weights()
     trainer.train()
 
 if __name__ == "__main__":
+    main()
+