Sin2pi
/

asr-model

@@ -1,3 +1,4 @@
 import pyworld as pw
 import os
 import math, random
@@ -5,11 +6,8 @@ import warnings
 import logging
 import gzip
 import base64
-import re
-from einops import rearrange, repeat
 import torch
 import torchaudio
-import torchcrepe
 import torch.nn.functional as F
 import torch.nn.init as init
 from torch import nn, Tensor
@@ -23,21 +21,11 @@ from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
 import transformers
 import evaluate
 from dataclasses import dataclass
-from math import pi, log
-torch.backends.cudnn.allow_tf32 = True
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.set_float32_matmul_precision('high')
-transformers.utils.logging.set_verbosity_error()
 device = torch.device(device="cuda:0")
 dtype = torch.float32
-torch.set_default_dtype(dtype)
-warnings.filterwarnings("ignore")
-logging.basicConfig(level=logging.ERROR)
-tox = {"device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"), "dtype": torch.float32}
 extractor = None
 tokenizer = None
 optimizer = None
@@ -64,11 +52,6 @@ class Dimensions:
     features: List[str]
     f0_rotary: bool
-import numpy as np
-import matplotlib.pyplot as plt
 def plot_waveform(x=None, w=None, p=None, per=None, sample_idx=0, sr=16000, hop_length=160,
                                  title="", markers=None, marker_labels=None,
                                  show_voiced_regions=True, show_energy=False):
@@ -147,7 +130,6 @@ def plot_waveform(x=None, w=None, p=None, per=None, sample_idx=0, sr=16000, hop_
         axs[current_ax].set_ylabel("Mel Bin")
         axs[current_ax].set_xlim([0, max_time])
         axs[current_ax].grid(True, axis='x', linestyle='--', alpha=0.3)
-        # fig.colorbar(im, ax=axs[current_ax])
         current_ax += 1
     if p is not None:
@@ -265,6 +247,37 @@ class ParameterCycler:
             param.requires_grad = (x == self.current_idx)
             print(f"Parameter {x}: requires_grad={param.requires_grad}")
         self.current_idx = (self.current_idx + 1) % len(self.parameters)
 class rotary(nn.Module):
     _seen = set()
@@ -272,173 +285,64 @@ class rotary(nn.Module):
                  learned_radius=False, learned_theta=False, learned_pitch=False, debug: List[str] = [], use_pbias = False):
         super().__init__()
         self.use_pbias = use_pbias
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.dtype = torch.float32
         self.debug = debug
         self._counter = 0
-        self.dims = dims
         self.max_ctx = max_ctx
         self.radii = radii
         f0_factor = 0.5
-        self.learned_adaptation: bool = False
         pitch_scale = 1.0
         radius = 1
-        if self.learned_adaptation:
             self.f0_scale = nn.Parameter(torch.tensor(f0_factor, device=self.device, dtype=self.dtype), requires_grad=True)
         else:
             self.register_buffer('f0_scale', torch.tensor(f0_factor))
         self.theta = nn.Parameter(torch.tensor(theta, device=self.device, dtype=self.dtype), requires_grad=True)
         self.pitch_scale = nn.Parameter(torch.tensor(pitch_scale, device=self.device, dtype=self.dtype), requires_grad=True)
-        freqs = 1. / (theta ** (torch.arange(0, dims, 2)[:(dims // 2)].float() / dims))
         self.freqs = nn.Parameter(torch.tensor(freqs, device=self.device, dtype=self.dtype), requires_grad=True)
         self.radius = nn.Parameter(torch.ones(radius, device=self.device, dtype=self.dtype), requires_grad=True)
-        # self.cycler = ParameterCycler(parameters=[self.theta, self.pitch_scale, self.freqs])
-        # self.reset_parameters()
-    def get_pitch_bias(self, f0):
-        if f0 is None:
-            return None
-        f0_flat = f0.squeeze().float()
-        f0_norm = (f0_flat - f0_flat.mean()) / (f0_flat.std() + 1e-8)
-        f0_sim = torch.exp(-torch.cdist(f0_norm.unsqueeze(1),
-                                    f0_norm.unsqueeze(1)) * self.pitch_scale)
-        return f0_sim.unsqueeze(0).unsqueeze(0)
-    def add_to_rotary(self):
-        def get_sim(self, freqs):
-            real = freqs.real.squeeze(0)
-            imag = freqs.imag.squeeze(0)
-            vecs = torch.cat([real.unsqueeze(-2), imag.unsqueeze(-2)], dim=-1)
-            vecs = vecs.squeeze(-2)
-            return F.cosine_similarity(vecs.unsqueeze(1), vecs.unsqueeze(0), dim=-1)
-        def fwd_sim(self, x=None, f0=None):
-            freqs = self.forward(x, f0)
-            sim = get_sim(self, freqs)
-            return freqs, sim
-        rotary.get_sim = get_sim
-        rotary.fwd_sim = fwd_sim
-    def align_f0(self, f0, ctx):
-        b, l = f0.shape
-        if l == ctx:
-            return f0.squeeze(0).float()
-        frames_per_token = l / ctx
-        idx = torch.arange(ctx, device=self.device, dtype=torch.float32)
-        src_idx = (idx * frames_per_token).long().clamp(0, l-1)
-        batch_idx = torch.arange(b, device=self.device, dtype=torch.float32).unsqueeze(1)
-        f0 = f0[batch_idx, src_idx]
-        return f0.squeeze(0).float()
-    # def align_f0(self, f0, ctx):
-    #     b, l = f0.shape
-    #     if l == ctx:
-    #         return f0.squeeze(0).float()
-    #     frames = l / ctx
-    #     idx = torch.arange(ctx, device=f0.device)
-    #     f0 = (idx * frames).long()
-    #     # b_idx = torch.arange(b, device=f0.device).unsqueeze(1)
-    #     # f0 = f0[b_idx, idx.unsqueeze(0).expand(b, -1)]
-    #     return f0.squeeze(0).float()
-    def scale_f0(self, f0):
-        f0_min = f0.min(dim=1, keepdim=True)[0]
-        f0_max = f0.max(dim=1, keepdim=True)[0]
-        denom = f0_max - f0_min + 1e-8
-        normalized_f0 = (f0 - f0_min) / denom
-        normalized_f0 = torch.clamp(normalized_f0, 0.0, 1.0)
-        return normalized_f0
-    def process_f0(f0, threshold=0.05):
-        thresholded_f0 = torch.where(f0 < threshold, torch.zeros_like(f0), f0)
-        return thresholded_f0
-    def map_perceptual(self, f0_mean, theta=10000.0):
-        if f0_mean >= theta:
-            return torch.log(f0_mean / theta)
-        else:
-            return -torch.log(theta / f0_mean)
-    def linear_map(self, freq, min_freq=40.0, max_freq=400.0, target_max=10000.0):
-        mapped_freq = ((freq - min_freq) / (max_freq - min_freq)) * target_max
-        return mapped_freq
-    def log_map(self, freq, min_freq=40.0, max_freq=400.0, target_max=10000.0):
-        log_freq = torch.log(freq)
-        log_min_freq = torch.log(min_freq)
-        log_max_freq = torch.log(max_freq)
-        mapped_log_freq = ((log_freq - log_min_freq) / (log_max_freq - log_min_freq)) * torch.log(torch.tensor(target_max, device=self.device))
-        return mapped_log_freq
-    def get_f0_adapted_freqs(self, ctx, f0=None):
-        f0_min: float = 80.0,
-        f0_max: float = 500.0,
-        base_freq: float = 1.0,
-        positions = torch.arange(ctx, device=device, dtype=torch.float)
-        freqs = base_freq.clone()
-        if f0 is not None:
-            f0_norm = torch.clamp((f0 - f0_min) / (f0_max - f0_min), 0.0, 1.0)
-            freq_mod = torch.pow(torch.linspace(0.5, 1.5, self.dims//2, device=device),
-                                f0_norm.unsqueeze(-1) * self.f0_scale)
-            freqs = freqs * freq_mod
-        freqs = torch.outer(positions, freqs)
-        return torch.polar(torch.ones_like(freqs), freqs)
-    def forward(self, x=None, f0=None, layer=None) -> Tensor:
-        # self.cycler.toggle_requires_grad()
         if isinstance(x, int):
             ctx = x
         else:
             batch, ctx, dims = x.shape
         t = torch.arange(ctx, device=self.device).float()
-        if self.learned_adaptation:
-            freqs = self.get_f0_adapted_freqs(ctx, f0)
-            x_complex = torch.view_as_complex(
-                x.float().reshape(*x.shape[:-1], -1, 2).contiguous())
-            x_rotated = x_complex * freqs.unsqueeze(0).unsqueeze(0)
-            freqs = torch.view_as_real(x_rotated).flatten(3).type_as(x)
         if f0 is not None:
             f0_mean=f0.mean()+1e-8
-            pitch_scale=self.pitch_scale
-            theta=f0_mean*pitch_scale
-            freqs = 1.0 / (theta ** (torch.arange(0, self.dims, 2, device=self.device) / self.dims))
         else:
             freqs = self.freqs
         freqs = torch.einsum('i,j->ij', t, freqs)
         freqs = freqs.float()
-        if self.radii and f0 is not None:
-            radius = self.align_f0(f0, ctx)
-            # radius = torch.clamp(radius, min=50.0, max=500.0)  # Clamp to voice range
-            # radius = radius / 500.0  # Normalize to [0.1, 1.0] range
-            # radius = radius.float()
             radius = radius.float()
-            freqs = torch.polar(radius.unsqueeze(-1), freqs)
         else:
-            freqs = torch.polar(torch.ones_like(freqs), freqs.unsqueeze(0))
         if "rotary" in self.debug:
             if f0 is not None:
                 key = f"{self._counter}_{theta:.2f}"
                 if key not in rotary._seen:
                     if not hasattr(self, '_prev_f0_theta'):
                         self._prev_f0_theta = theta
-                        print(f"Step {self._counter}: Using raw F0 as theta: {theta:.2f} Hz")
                     elif abs(self._prev_f0_theta - theta) > 100.0:
-                        print(f"Step {self._counter}: Using raw F0 as theta: {theta:.2f} Hz")
-                        print(f"f0_mean: {f0_mean} Hz, freqs: {freqs.shape}, ctx: {ctx}, dims: {self.dims}, block: {layer}")
-                    if self.radii:
-                        print(f"radius: {radius} Hz, enc: {layer} Hz, ctx: {ctx}")
                         self._prev_f0_theta = theta
                     rotary._seen.add(key)
             self._counter += 1
@@ -474,55 +378,6 @@ class rotary(nn.Module):
                 x1 = torch.view_as_real(x1).flatten(-2)
                 return torch.cat([x1.type_as(x), x2], dim=-1)
-def optim_attn(q, k, v, mask=None, scale=None, pad_token=0, fzero_val=0.0001):
-    batch, heads, ctx, dims = q.shape
-    token_ids = k[:, :, :, 0]
-    is_padding = (token_ids.float() == pad_token).unsqueeze(-2)
-    log_scale_factor = -10.0
-    attn_mask = torch.zeros((batch, heads, ctx, ctx), device=q.device)
-    if mask is not None:
-        attn_mask = attn_mask + mask.unsqueeze(0).unsqueeze(0)
-    attn_mask = torch.where(is_padding,
-                            torch.tensor(log_scale_factor, device=q.device),
-                            attn_mask)
-    attn_output = torch.nn.functional.scaled_dot_product_attention(
-        q, k, v, attn_mask=attn_mask,
-        dropout_p=0.0, is_causal=False)
-    attn_output = attn_output.permute(0, 2, 1, 3).flatten(start_dim=2)
-    return attn_output
-def parallel_slice(self, q, k, v, mask=None):
-    batch, head, ctx, dims = q.shape
-    head_dim = self.head_dim
-    batch, ctx, dims = q.shape
-    ctx_len = k.shape[1]
-    head = dims // head_dim
-    scores = torch.zeros(batch, head, ctx, ctx_len, device=q.device)
-    for h in range(head):
-        start_idx = h * head_dim
-        end_idx = start_idx + head_dim
-        q_h = q[:, :, start_idx:end_idx]
-        k_h = k[:, :, start_idx:end_idx]
-        scores[:, h] = torch.bmm(q_h, k_h.transpose(1, 2)) / math.sqrt(head_dim)
-    if mask is not None:
-        scores = scores + mask.unsqueeze(0).unsqueeze(0)
-    attn_weights = F.softmax(scores, dim=-1)
-    output = torch.zeros_like(q)
-    for h in range(head):
-        start_idx = h * head_dim
-        end_idx = start_idx + head_dim
-        v_h = v[:, :, start_idx:end_idx]
-        output[:, :, start_idx:end_idx] = torch.bmm(attn_weights[:, h], v_h)
-    return output
 class MultiheadA(nn.Module):
     _seen = set()
     rbf = False
@@ -578,20 +433,19 @@ class MultiheadA(nn.Module):
         rbf_scores = torch.exp(-dist_sq / (2 * rbf_sigma**2))
         return (1 - rbf_ratio) * dot_scores + rbf_ratio * rbf_scores
-    def forward(self, x: Tensor, xa: Tensor = None, mask: Tensor = None,
-                return_attn: bool = False, f0: Tensor = None, layer = None) -> tuple:
-        batch, ctx, dims = x.shape
         scale = (self.dims // self.head) ** -0.25
         z = xa if xa is not None else x
         q = self.q(x).to(x.dtype)
         k = self.k(z).to(x.dtype)
         v = self.v(z).to(x.dtype)
         if self.rotary_emb:
-            qf = self.rope(q.size(1), f0=f0, layer=layer)
-            kf = self.rope(k.size(1), f0=f0, layer=layer)
             q = q.view(*q.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
@@ -605,18 +459,13 @@ class MultiheadA(nn.Module):
             k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             v = v.view(*v.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             batch, head, ctx, head_dim = q.shape
-        if self.optim_attn and not return_attn:
-            wv = optim_attn(q * scale, k * scale, v, mask=mask,
-                pad_token=self.pad_token, fzero_val=torch.clamp(F.softplus(self.fzero), self.minz, self.maxz).item())
-            return self.o(wv), None
         if self.rbf:
             qk = self.enhanced_attention_scores(q * scale, k * scale, rbf_sigma=1.0, rbf_ratio=0.3)
         qk = (q * scale) @ (k * scale).transpose(-1, -2)
-        if f0 is not None and self.rope.use_pbias:
-            pbias = self.rope.pbias(f0)
             if pbias is not None:
                 qk = qk + pbias[:,:,:q.shape[2],:q.shape[2]]
         token_ids = k[:, :, :, 0]
@@ -628,8 +477,6 @@ class MultiheadA(nn.Module):
             mask = mask[:q.shape[2], :q.shape[2]]
             qk = qk + mask.unsqueeze(0).unsqueeze(0) * zscale.unsqueeze(-2).expand(qk.shape)
         qk = qk * zscale.unsqueeze(-2)
-        if return_attn:
-            return qk, v
         w = F.softmax(qk, dim=-1).to(q.dtype)
         wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
@@ -736,12 +583,12 @@ class Residual(nn.Module):
         if not any([t_gate, m_gate, c_gate]):
             self.mlp_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
-    def forward(self, x, xa=None, mask=None, f0=None, mode=None, layer=None):
         bln = self.blend
-        x = x + self.attna(self.lna(x), mask=mask, f0=f0, layer=layer)[0]
         if self.attnb and xa is not None:
-            c = self.attnb(self.lnb(x), xa, f0=f0, mask=None, layer=layer)[0]
             b = torch.sigmoid(bln)
             x = b * x + (1 - b) * c
@@ -756,7 +603,7 @@ class Residual(nn.Module):
             gate = self.m_gate(normx)
             x = x + gate * mlp_out
-        elif self.c_gate and mode is not None:
             gate_output = self.c_gate(normx, self.features)
             x = x + gate_output
@@ -796,7 +643,7 @@ class PEncoder(nn.Module):
             Conv1d(dims//4, dims//2, kernel_size=5, stride=4, padding=2), act_fn,
             Conv1d(dims//2, dims, kernel_size=5, stride=5, padding=2),act_fn)
-    def forward(self, x, f0=None, layer=None):
         x = self.encoder(x).permute(0, 2, 1)
         x = x + self.positional(x.shape[1]).to(x.device, x.dtype)
         x = nn.functional.dropout(x, p=self.dropout, training=self.training)
@@ -825,7 +672,7 @@ class WEncoder(nn.Module):
         self.positional = lambda length: sinusoids(length, dims)
         self.norm = RMSNorm(dims)
-    def forward(self, x, f0=None, layer=None):
         x = self.downsample(x)
         x = self.encoder(x)
         x = x.permute(0, 2, 1)
@@ -852,13 +699,49 @@ class FEncoder(nn.Module):
         self.norm = RMSNorm(dims)
         self._norm = RMSNorm(dims)
-    def forward(self, x, f0=None, layer=None):
         x = self.encoder(x).permute(0, 2, 1)
         x = x + self.positional(x.shape[1]).to(x.device, x.dtype)
         x = nn.functional.dropout(x, p=self.dropout, training=self.training)
         x = self._norm(x)
         return x
 class AudioEncoder(nn.Module):
     _seen = set()
     def __init__(self, mels: int, ctx: int, dims: int, head: int, layer: int, debug: List[str], features: List[str],
@@ -882,8 +765,7 @@ class AudioEncoder(nn.Module):
         self.f0_rotary = f0_rotary
         self.rope = rotary(
-            dims=self.head_dim,
-           )
         act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "tanh": nn.Tanh(), "swish": nn.SiLU(),
         "tanhshrink": nn.Tanhshrink(), "softplus": nn.Softplus(), "softshrink": nn.Softshrink(), "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
@@ -920,35 +802,32 @@ class AudioEncoder(nn.Module):
             FEncoder(input_dims=1, dims=dims, head=head, layer=layer, kernel_size=9, act=act, stride=2)
             for _ in range(layer)])
-    def forward(self, x, f0=None, layer="encoder"):
         if self._counter < 1:
-            s = x.get("spectrogram")
-            w = x.get("waveform")
-            p = f0 if f0 is not None else x.get("pitch")
             plot_waveform(x=s, w=w, p=p, hop_length=128)
         enc = {}
-        # if f0 is not None:
-        #     f0 = self.f0(f0)
-                #self.rope(x=f, f0=f0, layer=layer)
-        for y in self.features:
-            if y in x and y in self.blocks:
-                f = x[y]
-                for block in self.blocks[y]:
-                    f = block(f, f0=f0, layer=layer)
-                enc[y] = f
         if "encoder" in self.debug and self._counter % 100 == 0:
-            names = list(x.keys())
-            shapes = {k: v.shape for k, v in x.items()}
             print(f"Step {self._counter}: mode: {names}")
             print(f"shapes: {shapes}")
             for name, param in self.named_parameters():
                 if param.requires_grad:
-                    print(f"🎛️ ENCODER LAYER {name}: grad_norm={param.median():.4f}")
         self._counter += 1
         return enc
@@ -992,8 +871,23 @@ class TextDecoder(nn.Module):
         mask = torch.tril(torch.ones(ctx, ctx), diagonal=0)
         self.register_buffer("mask", mask, persistent=False)
-    def forward(self, x, enc, order=None, layer='decoder') -> Tensor:
         bln = self.blend
         x = x.to(device)
         if order is None:
@@ -1001,13 +895,15 @@ class TextDecoder(nn.Module):
         mask = self.mask[:x.shape[1], :x.shape[1]]
         x = self.token(x) + self.positional[:x.shape[1]]
         x = F.dropout(x, p=self.dropout, training=self.training)
         for block in self.block:
-            x = block(x, xa=None, mask=mask, layer=layer)
         for f in order:
-            if f in enc:
-                xa = enc[f]
                 for block in self.blocks[f]:
-                    out = block(x=x, xa=xa, mask=None, layer=layer)
                 a = torch.sigmoid(bln[f])
                 x = a * out + (1 - a) * x
         x = self.ln_dec(x)
@@ -1015,9 +911,8 @@ class TextDecoder(nn.Module):
         if "decoder" in self.debug and self._counter % 100 == 0:
             for name, param in self.named_parameters():
                 if param.requires_grad:
-                    print(f"🎚️ DECODER LAYER {name}: grad_norm={param.median():.4f}")
         self._counter += 1
         return x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
 class Echo(nn.Module):
@@ -1075,7 +970,7 @@ class Echo(nn.Module):
         spectrogram: torch.Tensor=None,
         pitch: Optional[torch.Tensor]=None,
         f0: Optional[torch.Tensor]=None,
-        # f0d: Optional[torch.Tensor]=None,
         envelope: Optional[torch.Tensor]=None,
         phase: Optional[torch.Tensor]=None,
         ) -> Dict[str, torch.Tensor]:
@@ -1092,10 +987,12 @@ class Echo(nn.Module):
             encoder_inputs["envelope"] = envelope
         if phase is not None:
             encoder_inputs["phase"] = phase
-        # if f0 is not None:
-        #     encoder_inputs["f0"] = f0
-        encoder_outputs = self.encoder(encoder_inputs, f0=f0, layer="encoder")
         logits = self.decoder(input_ids, encoder_outputs)
         loss = None
@@ -1167,7 +1064,7 @@ class Echo(nn.Module):
                 print(f"{module_type}: {count}")
     def register_gradient_hooks(self):
-        """Add this method to your Echo model class"""
         for name, param in self.named_parameters():
             if param.requires_grad:
                 if "encoder" in name:
@@ -1175,643 +1072,23 @@ class Echo(nn.Module):
                 elif "decoder" in name:
                     param.register_hook(lambda grad, n=name: self._print_decoder_grad(n, grad))
-        print("📊 Gradient debugging hooks registered")
         return self
     def _print_encoder_grad(self, name, grad):
         if grad is not None and self.count == 10:
             norm = grad.median().item()
-            print(f"🎛️ ENCODER GRAD: {name} = {norm:.6f}")
         return None
     def _print_decoder_grad(self, name, grad):
         if grad is not None and self.count == 10:
             norm = grad.median().item()
-            print(f"🎚️ DECODER GRAD: {name} = {norm:.6f}")
         return None
     def reset_counter(self):
-        """Reset the internal counter for debugging purposes."""
         self._counter = 0
         print("Counter reset to 0.")
-metric = evaluate.load(path="wer")
-def align_f0(f0, ctx):
-    ctx = torch.tensor(ctx)
-    bat, length = f0.shape
-    if length == ctx:
-        return f0
-    frames = length / ctx
-    idx = torch.arange(ctx, device=f0.device)
-    idx = (idx * frames).long()
-    batch_idx = torch.arange(bat, device=f0.device).unsqueeze(1)
-    return f0[batch_idx, idx.unsqueeze(0).expand(bat, -1)]
-@dataclass
-class DataCollator:
-    tokenizer: Any
-    def __call__(self, features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
-        pad_token_id = tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else 0
-        bos_token_id = tokenizer.bos_token_id if hasattr(tokenizer, 'bos_token_id') else 1
-        batch = {}
-        if "spectrogram" in features[0] and features[0]["spectrogram"] is not None:
-            spectrogram_list = [f["spectrogram"] for f in features]
-            max_len_feat = max(f.shape[-1] for f in spectrogram_list)
-            pad_spectrogram = []
-            for feat in spectrogram_list:
-                current_len = feat.shape[-1]
-                padding = max_len_feat - current_len
-                if padding > 0:
-                    pad_feat = F.pad(feat, (0, padding), mode='constant', value=pad_token_id)
-                else:
-                    pad_feat = feat
-                pad_spectrogram.append(pad_feat)
-            batch["spectrogram"] = torch.stack(pad_spectrogram)
-        if "waveform" in features[0] and features[0]["waveform"] is not None:
-            waveform_list = [f["waveform"] for f in features]
-            max_len_wav = max(w.shape[-1] for w in waveform_list)
-            pad_waveforms = []
-            for wav in waveform_list:
-                current_len = wav.shape[-1]
-                padding = max_len_wav - current_len
-                if padding > 0:
-                    if wav.ndim == 1:
-                        wav = wav.unsqueeze(0)
-                    pad_wav = F.pad(wav, (0, padding), mode='constant', value=pad_token_id)
-                else:
-                    pad_wav = wav
-                pad_waveforms.append(pad_wav)
-            batch["waveform"] = torch.stack(pad_waveforms)
-        if "label" in features[0] and features[0]["label"] is not None:
-            labels_list = [f["label"] for f in features]
-            max_len = max(len(l) for l in labels_list)
-            all_ids = []
-            all_labels = []
-            for label in labels_list:
-                label_list = label.tolist() if isinstance(label, torch.Tensor) else label
-                decoder_input = [bos_token_id] + label_list
-                label_eos = label_list + [pad_token_id]
-                input_len = max_len + 1 - len(decoder_input)
-                label_len = max_len + 1 - len(label_eos)
-                padded_input = decoder_input + [pad_token_id] * input_len
-                padded_labels = label_eos + [pad_token_id] * label_len
-                all_ids.append(padded_input)
-                all_labels.append(padded_labels)
-            batch["input_ids"] = torch.tensor(all_ids, dtype=torch.long)
-            batch["labels"] = torch.tensor(all_labels, dtype=torch.long)
-        if "pitch" in features[0] and features[0]["pitch"] is not None:
-            pitch_list = [f["pitch"] for f in features]
-            max_len_pitch = max(e.shape[-1] for e in pitch_list)
-            pad_pitch = []
-            for pitch in pitch_list:
-                current_len = pitch.shape[-1]
-                padding = max_len_pitch - current_len
-                if padding > 0:
-                    pad_pitch_item = F.pad(pitch, (0, padding), mode='constant', value=pad_token_id)
-                else:
-                    pad_pitch_item = pitch
-                pad_pitch.append(pad_pitch_item)
-            batch["pitch"] = torch.stack(pad_pitch)
-        if "f0" in features[0] and features[0]["f0"] is not None:
-            all_f0 = torch.cat([f["f0"] for f in features])
-            batch["f0"] = all_f0.unsqueeze(0)
-        # if "f0" in features[0] and features[0]["f0"] is not None:
-        #     f0_labels = batch.get("labels", None)
-        #     aligned_features = []
-        #     for feature in features:
-        #         f0 = feature["f0"]
-        #         length = f0.shape
-        #         if length != f0_labels.shape[-1]:
-        #             ctx = f0_labels.shape[-1]
-        #             aligned_features.append(align_f0(f0.unsqueeze(0), ctx))
-        #         else:
-        #             aligned_features.append(f0)
-        #     all_aligned_f0 = torch.cat(aligned_features)
-        #     batch["f0d"] = all_aligned_f0
-        if "envelope" in features[0] and features[0]["envelope"] is not None:
-            env_list = [f["envelope"] for f in features]
-            max_len = max(f.shape[-1] for f in env_list)
-            pad_env = []
-            for feat in env_list:
-                current_len = feat.shape[-1]
-                padding = max_len_feat - current_len
-                if padding > 0:
-                    pad_feat = F.pad(feat, (0, padding), mode='constant', value=pad_token_id)
-                else:
-                    pad_feat = feat
-                pad_env.append(pad_feat)
-            batch["envelope"] = torch.stack(pad_env)
-        if "phase" in features[0] and features[0]["phase"] is not None:
-            ph_list = [f["phase"] for f in features]
-            max_len = max(f.shape[-1] for f in ph_list)
-            pad_ph = []
-            for feat in ph_list:
-                current_len = feat.shape[-1]
-                padding = max_len_feat - current_len
-                if padding > 0:
-                    pad_feat = F.pad(feat, (0, padding), mode='constant', value=pad_token_id)
-                else:
-                    pad_feat = feat
-                pad_ph.append(pad_feat)
-            batch["phase"] = torch.stack(pad_ph)
-        return batch
-def hilbert_transform(x):
-    N = x.shape[-1]
-    xf = torch.fft.rfft(x)
-    h = torch.zeros(N // 2 + 1, device=x.device, dtype=x.dtype)
-    if N % 2 == 0:
-        h[0] = h[N//2] = 1
-        h[1:N//2] = 2
-    else:
-        h[0] = 1
-        h[1:(N+1)//2] = 2
-    return torch.fft.irfft(xf * h, n=N)
-def analytic_signal(x):
-    return x + 1j * hilbert_transform(x)
-def hilbert_transform_2d(x, dim=-1):
-    N = x.shape[dim]
-    if dim == -1 or dim == len(x.shape) - 1:
-        xf = torch.fft.rfft(x)
-    else:
-        xf = torch.fft.rfft(x, dim=dim)
-    h_shape = [1] * len(x.shape)
-    h_shape[dim] = N // 2 + 1
-    h = torch.zeros(h_shape, device=x.device, dtype=x.dtype)
-    if dim == -1 or dim == len(x.shape) - 1:
-        if N % 2 == 0:
-            h[..., 0] = h[..., -1] = 1
-            h[..., 1:-1] = 2
-        else:
-            h[..., 0] = 1
-            h[..., 1:] = 2
-    else:
-        pass
-    return torch.fft.irfft(xf * h, n=N, dim=dim)
-def hilbert_transform_true_2d(x):
-    xf = torch.fft.rfft2(x)
-    h1, h2 = torch.meshgrid(
-        torch.fft.rfftfreq(x.shape[-2]) * 2 - 1,
-        torch.fft.rfftfreq(x.shape[-1]) * 2 - 1,
-        indexing='ij')
-    h = -1j / (math.pi * (h1 + 1j*h2))
-    h[0, 0] = 0
-    return torch.fft.irfft2(xf * h.to(x.device))
-def process_spectrogram_with_hilbert(spec):
-    analytic = spec + 1j * hilbert_transform(spec)
-    envelope = torch.abs(analytic)
-    phase = torch.angle(analytic)
-    return envelope, phase
-def load_wave(wave_data, sample_rate):
-    if isinstance(wave_data, str):
-        waveform, sr = torchaudio.load(uri=wave_data, normalize=False)
-    elif isinstance(wave_data, dict):
-        waveform = torch.tensor(data=wave_data["array"]).float()
-        sr = wave_data["sampling_rate"]
-    else:
-        raise TypeError("Invalid wave_data format.")
-    if waveform.dim() == 1:
-        waveform = waveform.unsqueeze(0)
-    if sr != sample_rate:
-        original_length = waveform.shape[1]
-        target_length = int(original_length * (sample_rate / sr))
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
-        waveform = resampler(waveform)
-    return waveform.flatten()
-def extract_features(batch, tokenizer, spectrogram, waveforms, pitch, frequency=False,
-                     hop_length=128, fmin=0, fmax=8000, n_mels=128, n_fft=1024, sampling_rate=16000,
-                     pad_mode="constant", center=True, power=2.0, window_fn=torch.hann_window, mel_scale="htk",
-                     norm=None, normalized=False, downsamples=False, period=False, hilbert=False):
-    dtype = torch.float32
-    device = torch.device("cuda:0")
-    audio = batch["audio"]
-    sampling_rate = audio["sampling_rate"]
-    sr = audio["sampling_rate"]
-    wav = load_wave(wave_data=audio, sample_rate=sr)
-    if spectrogram:
-        transform = torchaudio.transforms.MelSpectrogram(
-            f_max=fmax,
-            f_min=fmin,
-            n_mels=n_mels,
-            sample_rate=sr,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            norm=norm,
-            normalized=normalized,
-            power=power,
-            center=center,
-            mel_scale=mel_scale,
-            window_fn=window_fn,
-            pad_mode=pad_mode)
-        mel_spectrogram = transform(wav)
-        log_mel = torch.clamp(mel_spectrogram, min=1e-10).log10()
-        log_mel = torch.maximum(log_mel, log_mel.max() - 8.0)
-        spec = (log_mel + 4.0) / 4.0
-        spec = torch.tensor(spec)
-        batch["spectrogram"] = spec
-    if hilbert:
-        envelope_list = []
-        phase_list = []
-        for ch_idx in range(spec.shape[0]):
-            envelope, phase = process_spectrogram_with_hilbert(spec[ch_idx])
-            envelope_list.append(envelope)
-            phase_list.append(phase)
-        batch["envelope"] = torch.stack(envelope_list)
-        batch["phase"] = torch.stack(phase_list)
-    wav_1d = wav.unsqueeze(0)
-    if waveforms:
-        batch["waveform"] = wav_1d
-    if pitch:
-        wav_np = wav.numpy().astype(np.float64)
-        f0, t = pw.dio(wav_np, sampling_rate,
-                    frame_period=hop_length/sampling_rate*1000)
-        f0 = pw.stonemask(wav_np, f0, t, sampling_rate)
-        f0 = torch.from_numpy(f0).float()
-        batch["pitch"] = f0.unsqueeze(0)
-    if frequency:
-        wav_np = wav.numpy().astype(np.float64)
-        f0, t = pw.dio(wav_np, sampling_rate,
-                    frame_period=hop_length/sampling_rate*1000)
-        f0 = pw.stonemask(wav_np, f0, t, sampling_rate)
-        f0 = f0
-        batch["f0"] = torch.from_numpy(f0).float()
-    if spectrogram and waveforms and pitch:
-        spec_mean = batch["spectrogram"].mean()
-        spec_std = batch["spectrogram"].std() + 1e-6
-        batch["spectrogram"] = (batch["spectrogram"] - spec_mean) / spec_std
-        wav_mean = batch["waveform"].mean()
-        wav_std = batch["waveform"].std() + 1e-6
-        batch["waveform"] = (batch["waveform"] - wav_mean) / wav_std
-        if batch["pitch"].max() > 1.0:
-            pitch_min = 50.0
-            pitch_max = 600.0
-            batch["pitch"] = (batch["pitch"] - pitch_min) / (pitch_max - pitch_min)
-    batch["label"] = tokenizer.encode(batch["transcription"], add_special_tokens=False)
-    return batch
-def compute_metrics(eval_pred, compute_result: bool = True,
-                    print_pred: bool = False, num_samples: int = 0, tokenizer=None, pitch=None, model=None):
-    pred_logits = eval_pred.predictions
-    label_ids = eval_pred.label_ids
-    if hasattr(pred_logits, "cpu"):
-        pred_logits = pred_logits.cpu()
-    if hasattr(label_ids, "cpu"):
-        label_ids = label_ids.cpu()
-    if isinstance(pred_logits, tuple):
-        pred_ids = pred_logits[0]
-    else:
-        pred_ids = pred_logits
-    if hasattr(pred_ids, "ndim") and pred_ids.ndim == 3:
-        if not isinstance(pred_ids, torch.Tensor):
-            pred_ids = torch.tensor(pred_ids)
-        pred_ids = pred_ids.argmax(dim=-1)
-        pred_ids = pred_ids.tolist()
-    if hasattr(label_ids, "tolist"):
-        label_ids = label_ids.tolist()
-    label_ids = [[0 if token == -100 else token for token in seq] for seq in label_ids]
-    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=False)
-    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
-    if print_pred:
-        for i in range(min(num_samples, len(pred_str))):
-            print(f"Preds: {pred_str[i]}")
-            print(f"Label: {label_str[i]}")
-            print(f"preds: {pred_ids[i]}")
-            print(f"label: {label_ids[i]}")
-            print("--------------------------------")
-    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
-    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
-    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
-    if model is None:
-        global global_model
-        if 'global_model' in globals():
-            model = global_model
-    if model is not None:
-        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1_000_000
-        if trainable_params > 0:
-            efficiency_score = (100 - wer) / trainable_params
-        else:
-            print("Warning: Zero trainable parameters detected")
-            efficiency_score = 0.0
-    else:
-        print("Warning: Model not available for parameter counting")
-        trainable_params = 0.0
-        efficiency_score = 0.0
-    if hasattr(wer, "item"):
-        wer = wer.item()
-    metrics = {
-        "wer": float(wer),
-        "trainable_params_M": float(trainable_params),
-        "efficiency_score": float(efficiency_score),
-    }
-    return metrics
-logger = logging.getLogger(__name__)
-def create_model(param: Dimensions) -> Echo:
-    model = Echo(param).to('cuda')
-    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    total_params = sum(p.numel() for p in model.parameters())
-    logger.info(f"Trainable parameters: {trainable_params:,}")
-    logger.info(f"Total parameters: {total_params:,}")
-    print(f"Trainable parameters: {trainable_params:,}")
-    print(f"Total parameters: {total_params:,}")
-    return model
-def setup_tokenizer(token: str, local_tokenizer_path: str = "D:/newmodel/model/tokenn/"):
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file(f"{local_tokenizer_path}/tokenizer.json")
-    orig_encode = tokenizer.encode
-    def enc(text, add_special_tokens=True):
-        ids = orig_encode(text).ids
-        if not add_special_tokens:
-            sp_ids = [tokenizer.token_to_id(t) for t in ["<PAD>", "<BOS>", "<EOS>"]]
-            ids = [id for id in ids if id not in sp_ids]
-        return ids
-    def bdec(ids_list, skip_special_tokens=True):
-        results = []
-        for ids in ids_list:
-            if skip_special_tokens:
-                ids = [id for id in ids if id not in [0, 1, 2]]
-            results.append(tokenizer.decode(ids))
-        return results
-    def save_pretrained(save_dir):
-        os.makedirs(save_dir, exist_ok=True)
-        tokenizer.save(f"{save_dir}/tokenizer.json")
-    tokenizer.encode = enc
-    tokenizer.batch_decode = bdec
-    tokenizer.save_pretrained = save_pretrained
-    tokenizer.pad_token_id = 0
-    tokenizer.bos_token_id = 1
-    tokenizer.eos_token_id = 2
-    return tokenizer
-def prepare_datasets(tokenizer, token: str, sanity_check: bool = False, dataset_config: Optional[Dict] = None) -> Tuple[any, any]:
-    if dataset_config is None:
-        dataset_config = {
-            "spectrogram": True,
-            "waveforms": True,
-            "pitch": True,
-            "frequency": True,
-            "downsamples": True,
-            "hop_length": 128,
-            "fmin": 50,
-            "fmax": 2000,
-            "n_mels": 128,
-            "n_fft": 1024,
-            "sampling_rate": 16000,
-        }
-    dataset = load_dataset(
-        "google/fleurs",
-        "en_us",
-        token=token,
-        trust_remote_code=True,
-        streaming=False)
-    dataset = dataset.cast_column(column="audio", feature=Audio(sampling_rate=16000)).select_columns(["audio", "transcription"])
-    if sanity_check:
-        dataset = dataset["test"].take(10)
-        dataset = dataset.select_columns(["audio", "transcription"])
-        logger.info(f"Sanity dataset size: {dataset.num_rows}")
-        print(f"Sanity dataset size: {dataset.num_rows}")
-        prepare_fn = partial(extract_features, tokenizer=tokenizer, **dataset_config)
-        dataset = dataset.map(
-            function=prepare_fn,
-            remove_columns=["audio", "transcription"]
-        ).with_format(type="torch")
-        train_dataset = dataset
-        test_dataset = dataset
-    else:
-        def filter_func(x):
-            return (0 < len(x["transcription"]) < 512 and
-                   len(x["audio"]["array"]) > 0 and
-                   len(x["audio"]["array"]) < 1500 * 160)
-        dataset = dataset.filter(filter_func).shuffle(seed=4)
-        logger.info(f"Dataset size: {dataset['train'].num_rows}, {dataset['test'].num_rows}")
-        print(f"Dataset size: {dataset['train'].num_rows}, {dataset['test'].num_rows}")
-        prepare_fn = partial(extract_features, tokenizer=tokenizer, **dataset_config)
-        columns_to_remove = list(next(iter(dataset.values())).features)
-        train_dataset = dataset["train"]
-        test_dataset = dataset["test"].take(50)
-        logger.info(f"Train dataset size: {train_dataset.num_rows}, Test dataset size: {test_dataset.num_rows}")
-        train_dataset = train_dataset.map(
-            function=prepare_fn,
-            remove_columns=columns_to_remove
-        ).with_format(type="torch")
-        test_dataset = test_dataset.map(
-            function=prepare_fn,
-            remove_columns=columns_to_remove
-        ).with_format(type="torch")
-    return train_dataset, test_dataset
-def get_training_args(
-    log_dir: str,
-    batch_eval_metrics: bool = False,
-    max_steps: int = 10,
-    save_steps: int = 1000,
-    eval_steps: int = 1,
-    warmup_steps: int = 0,
-    num_train_epochs: int = 1,
-    logging_steps: int = 1,
-    eval_on_start: bool = False,
-    learning_rate: float = 1e-4,
-    weight_decay: float = 0.01,
-    max_grad_norm: float = 1.0,
-) -> Seq2SeqTrainingArguments:
-    return Seq2SeqTrainingArguments(
-        output_dir=log_dir,
-        per_device_train_batch_size=1,
-        per_device_eval_batch_size=1,
-        gradient_accumulation_steps=1,
-        eval_accumulation_steps=1,
-        tf32=True,
-        bf16=True,
-        eval_strategy="steps",
-        save_strategy="steps",
-        max_steps=max_steps,
-        save_steps=save_steps,
-        eval_steps=eval_steps,
-        warmup_steps=warmup_steps,
-        num_train_epochs=num_train_epochs,
-        logging_steps=logging_steps,
-        logging_dir=log_dir,
-        logging_strategy="steps",
-        report_to=["tensorboard"],
-        push_to_hub=False,
-        disable_tqdm=False,
-        save_total_limit=1,
-        label_names=["labels"],
-        optim="adamw_torch",
-        lr_scheduler_type="cosine",
-        learning_rate=learning_rate,
-        weight_decay=weight_decay,
-        save_safetensors=False,
-        eval_on_start=eval_on_start,
-        batch_eval_metrics=batch_eval_metrics,
-        max_grad_norm=max_grad_norm,
-    )
-def main():
-    token = ""
-    log_dir = os.path.join('./output/logs', datetime.now().strftime(format='%m-%d_%H'))
-    os.makedirs(name=log_dir, exist_ok=True)
-    tokenizer = setup_tokenizer(token)
-    def sanity(sanity: bool):
-        if sanity:
-            training_args = get_training_args(
-            log_dir,
-            batch_eval_metrics = False,
-            max_steps = 10,
-            save_steps = 0,
-            eval_steps = 1,
-            warmup_steps = 0,
-            logging_steps = 1,
-            eval_on_start = False,
-            learning_rate = 5e-6,
-            weight_decay = 0.01,
-            )
-        else:
-            training_args = get_training_args(
-            log_dir,
-            batch_eval_metrics = False,
-            max_steps = 1000,
-            save_steps = 1000,
-            eval_steps = 100,
-            warmup_steps = 100,
-            logging_steps = 10,
-            eval_on_start = False,
-            learning_rate = 2.5e-4,
-            weight_decay = 0.01,
-            )
-        return training_args
-    param = Dimensions(
-        mels=128,
-        aud_ctx=1500,
-        aud_head=4,
-        aud_dims=512,
-        aud_idx=4,
-        vocab=40000,
-        text_ctx=512,
-        text_head=4,
-        text_dims=512,
-        text_idx=4,
-        act="swish",
-        debug={},#{"encoder", "decoder", "residual", "rotary"},
-        cross_attn=True,
-        f0_rotary=False,
-        features = ["spectrogram"]#, "waveform", "pitch", "f0", "envelope", "phase"],
-        )
-    sanity_check = False
-    training_args = sanity(sanity_check)
-    dataset_config = {
-        "spectrogram": True,
-        "waveforms": False,
-        "pitch": False,
-        "downsamples": False,
-        "frequency": False,
-        "hilbert": False,
-        "hop_length": 128,
-        "fmin": 150,
-        "fmax": 2000,
-        "n_mels": 128,
-        "n_fft": 1024,
-        "sampling_rate": 16000,
-        "pad_mode": "constant",
-        "center": True,
-        "power": 2.0,
-        "window_fn": torch.hann_window,
-        "mel_scale": "htk",
-        "norm": None,
-        "normalized": False}
-    model = create_model(param)
-    global global_model
-    global_model = model
-    metrics_fn = partial(compute_metrics, print_pred=False, num_samples=5,
-                    tokenizer=tokenizer, model=model)
-    print(f"{'Sanity check' if sanity_check else 'Training'} mode")
-    train_dataset, test_dataset = prepare_datasets(
-        tokenizer=tokenizer,
-        token=token,
-        sanity_check=sanity_check,
-        dataset_config=dataset_config)
-    trainer = Seq2SeqTrainer(
-        args=training_args,
-        model=model,
-        train_dataset=train_dataset,
-        eval_dataset=test_dataset,
-        data_collator=DataCollator(tokenizer=tokenizer),
-        compute_metrics=metrics_fn,
-        )
-    model.init_weights()
-    trainer.train()
-if __name__ == "__main__":
-    main()

 import pyworld as pw
 import os
 import math, random
 import logging
 import gzip
 import base64
 import torch
 import torchaudio
 import torch.nn.functional as F
 import torch.nn.init as init
 from torch import nn, Tensor
 import transformers
 import evaluate
 from dataclasses import dataclass
+import matplotlib.pyplot as plt
 device = torch.device(device="cuda:0")
 dtype = torch.float32
 extractor = None
 tokenizer = None
 optimizer = None
     features: List[str]
     f0_rotary: bool
 def plot_waveform(x=None, w=None, p=None, per=None, sample_idx=0, sr=16000, hop_length=160,
                                  title="", markers=None, marker_labels=None,
                                  show_voiced_regions=True, show_energy=False):
         axs[current_ax].set_ylabel("Mel Bin")
         axs[current_ax].set_xlim([0, max_time])
         axs[current_ax].grid(True, axis='x', linestyle='--', alpha=0.3)
         current_ax += 1
     if p is not None:
             param.requires_grad = (x == self.current_idx)
             print(f"Parameter {x}: requires_grad={param.requires_grad}")
         self.current_idx = (self.current_idx + 1) % len(self.parameters)
+def extract_f0(waveform, sampling_rate=16000, hop_length=128, device="cuda:0"):
+    """Extract F0 from waveform - handle various input types"""
+    if waveform is None:
+        return None
+    if isinstance(waveform, list):
+        if len(waveform) == 0:
+            return None
+        waveform = waveform[0]
+        print(f"DEBUG: Converted list to tensor, new type: {type(waveform)}")
+    if not isinstance(waveform, torch.Tensor):
+        waveform = torch.tensor(waveform)
+    if isinstance(waveform, torch.Tensor):
+        if waveform.dim() == 3:
+            waveform = waveform.squeeze(1)
+        if waveform.dim() == 2:
+            waveform = waveform[0]
+        wav_np = waveform.detach().cpu().numpy().astype(np.float64)
+    else:
+        wav_np = np.array(waveform).astype(np.float64)
+    f0, t = pw.dio(wav_np, sampling_rate,
+                  frame_period=hop_length/sampling_rate*1000)
+    f0 = pw.stonemask(wav_np, f0, t, sampling_rate)
+    f0_tensor = torch.from_numpy(f0).float().to(device)
+    return f0_tensor.unsqueeze(0).unsqueeze(0)
 class rotary(nn.Module):
     _seen = set()
                  learned_radius=False, learned_theta=False, learned_pitch=False, debug: List[str] = [], use_pbias = False):
         super().__init__()
+        self.dims = dims
         self.use_pbias = use_pbias
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.dtype = torch.float32
         self.debug = debug
         self._counter = 0
         self.max_ctx = max_ctx
         self.radii = radii
         f0_factor = 0.5
+        self.adaptation: bool = False
         pitch_scale = 1.0
         radius = 1
+        if self.adaptation:
             self.f0_scale = nn.Parameter(torch.tensor(f0_factor, device=self.device, dtype=self.dtype), requires_grad=True)
         else:
             self.register_buffer('f0_scale', torch.tensor(f0_factor))
         self.theta = nn.Parameter(torch.tensor(theta, device=self.device, dtype=self.dtype), requires_grad=True)
         self.pitch_scale = nn.Parameter(torch.tensor(pitch_scale, device=self.device, dtype=self.dtype), requires_grad=True)
+        freqs = 1. / (theta ** (torch.arange(0, dims, 2, device=self.device, dtype=self.dtype)[:(dims // 2)].float() / dims))
         self.freqs = nn.Parameter(torch.tensor(freqs, device=self.device, dtype=self.dtype), requires_grad=True)
         self.radius = nn.Parameter(torch.ones(radius, device=self.device, dtype=self.dtype), requires_grad=True)
+    def forward(self, x=None, feat=None, layer=None) -> Tensor:
+        f0 = feat.get("f0") if feat else None
         if isinstance(x, int):
             ctx = x
         else:
             batch, ctx, dims = x.shape
         t = torch.arange(ctx, device=self.device).float()
         if f0 is not None:
             f0_mean=f0.mean()+1e-8
+            theta=f0_mean*self.pitch_scale
+            freqs = 1. / (theta ** (torch.arange(0, self.dims, 2, device=self.device, dtype=self.dtype)[:(self.dims // 2)].float() /self.dims))
         else:
             freqs = self.freqs
         freqs = torch.einsum('i,j->ij', t, freqs)
         freqs = freqs.float()
+        if self.radii:
+            radius = feat.get("f0d") if feat else self.radius
             radius = radius.float()
         else:
+            radius = self.radius
+            freqs = torch.polar(radius.unsqueeze(-1), freqs)  # freqs = torch.polar(torch.ones_like(freqs), freqs.unsqueeze(0))
         if "rotary" in self.debug:
             if f0 is not None:
                 key = f"{self._counter}_{theta:.2f}"
                 if key not in rotary._seen:
                     if not hasattr(self, '_prev_f0_theta'):
                         self._prev_f0_theta = theta
                     elif abs(self._prev_f0_theta - theta) > 100.0:
+                        print(f"{layer} : {f0_mean} : Theta: {theta:.2f} : {theta:.2f} : {ctx} ")
+                        if self.radii:
+                            print(f"radius: {radius} Hz, enc: {layer} Hz, ctx: {ctx}")
                         self._prev_f0_theta = theta
                     rotary._seen.add(key)
             self._counter += 1
                 x1 = torch.view_as_real(x1).flatten(-2)
                 return torch.cat([x1.type_as(x), x2], dim=-1)
 class MultiheadA(nn.Module):
     _seen = set()
     rbf = False
         rbf_scores = torch.exp(-dist_sq / (2 * rbf_sigma**2))
         return (1 - rbf_ratio) * dot_scores + rbf_ratio * rbf_scores
+    def forward(self, x: Tensor, xa: Tensor = None, mask: Tensor = None, feat=None, layer = None) -> tuple:
         scale = (self.dims // self.head) ** -0.25
         z = xa if xa is not None else x
         q = self.q(x).to(x.dtype)
         k = self.k(z).to(x.dtype)
         v = self.v(z).to(x.dtype)
+        batch, ctx, dims = q.shape
         if self.rotary_emb:
+            qf = self.rope(q.size(1), layer=layer, feat=feat)
+            kf = self.rope(k.size(1), layer=layer, feat=feat)
             q = q.view(*q.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             k = k.view(*k.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             v = v.view(*v.shape[:2], self.head, -1).permute(0, 2, 1, 3)
             batch, head, ctx, head_dim = q.shape
         if self.rbf:
             qk = self.enhanced_attention_scores(q * scale, k * scale, rbf_sigma=1.0, rbf_ratio=0.3)
         qk = (q * scale) @ (k * scale).transpose(-1, -2)
+        if self.rope.use_pbias:
+            pbias = self.rope.pbias(feat.get("f0"))
             if pbias is not None:
                 qk = qk + pbias[:,:,:q.shape[2],:q.shape[2]]
         token_ids = k[:, :, :, 0]
             mask = mask[:q.shape[2], :q.shape[2]]
             qk = qk + mask.unsqueeze(0).unsqueeze(0) * zscale.unsqueeze(-2).expand(qk.shape)
         qk = qk * zscale.unsqueeze(-2)
         w = F.softmax(qk, dim=-1).to(q.dtype)
         wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
         if not any([t_gate, m_gate, c_gate]):
             self.mlp_gate = nn.Sequential(Linear(dims, 1), nn.Sigmoid())
+    def forward(self, x: Tensor, xa: Tensor = None, mask: Tensor = None, feat=None, layer = None):
         bln = self.blend
+        x = x + self.attna(self.lna(x), xa=None, mask=mask, layer=layer, feat=feat)[0]
         if self.attnb and xa is not None:
+            c = self.attnb(self.lnb(x), xa, mask=None, layer=layer, feat=feat)[0]
             b = torch.sigmoid(bln)
             x = b * x + (1 - b) * c
             gate = self.m_gate(normx)
             x = x + gate * mlp_out
+        elif self.c_gate is not None:
             gate_output = self.c_gate(normx, self.features)
             x = x + gate_output
             Conv1d(dims//4, dims//2, kernel_size=5, stride=4, padding=2), act_fn,
             Conv1d(dims//2, dims, kernel_size=5, stride=5, padding=2),act_fn)
+    def forward(self, x, feat=None, layer=None):
         x = self.encoder(x).permute(0, 2, 1)
         x = x + self.positional(x.shape[1]).to(x.device, x.dtype)
         x = nn.functional.dropout(x, p=self.dropout, training=self.training)
         self.positional = lambda length: sinusoids(length, dims)
         self.norm = RMSNorm(dims)
+    def forward(self, x, feat=None, layer=None):
         x = self.downsample(x)
         x = self.encoder(x)
         x = x.permute(0, 2, 1)
         self.norm = RMSNorm(dims)
         self._norm = RMSNorm(dims)
+    def forward(self, x, feat=None, layer=None):
         x = self.encoder(x).permute(0, 2, 1)
         x = x + self.positional(x.shape[1]).to(x.device, x.dtype)
         x = nn.functional.dropout(x, p=self.dropout, training=self.training)
         x = self._norm(x)
         return x
+class F0Encoder(nn.Module):
+    def __init__(self, input_dims, dims, head, layer, kernel_size, act, stride=1):
+        super().__init__()
+        self.head_dim = dims // head
+        self.dropout = 0.01
+        act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(),
+                   "tanh": nn.Tanh(), "swish": nn.SiLU(), "tanhshrink": nn.Tanhshrink(),
+                   "softplus": nn.Softplus(), "softshrink": nn.Softshrink(),
+                   "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
+        act_fn = act_map.get(act, nn.GELU())
+        self.encoder = nn.Sequential(
+            Conv1d(input_dims, dims, kernel_size=kernel_size, stride=stride, padding=kernel_size//2), act_fn,
+            Conv1d(dims, dims, kernel_size=5, padding=2), act_fn,
+            Conv1d(dims, dims, kernel_size=3, padding=1, groups=dims), act_fn)
+        self.positional = lambda length: sinusoids(length, dims)
+        self.norm = RMSNorm(dims)
+        self._norm = RMSNorm(dims)
+    def forward(self, x, feat=None, layer=None):
+        if x.dim() == 3 and x.shape[0] == 1 and x.shape[1] == 1:
+            pass
+        elif x.dim() == 2:
+            x = x.unsqueeze(1)
+        elif x.dim() == 1:
+            x = x.unsqueeze(0).unsqueeze(0)
+        x = self.encoder(x)
+        x = x.permute(0, 2, 1)
+        x = x + self.positional(x.shape[1]).to(x.device, x.dtype)
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        x = self._norm(x)
+        return x
 class AudioEncoder(nn.Module):
     _seen = set()
     def __init__(self, mels: int, ctx: int, dims: int, head: int, layer: int, debug: List[str], features: List[str],
         self.f0_rotary = f0_rotary
         self.rope = rotary(
+            dims=self.head_dim)
         act_map = {"gelu": nn.GELU(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "tanh": nn.Tanh(), "swish": nn.SiLU(),
         "tanhshrink": nn.Tanhshrink(), "softplus": nn.Softplus(), "softshrink": nn.Softshrink(), "leaky_relu": nn.LeakyReLU(), "elu": nn.ELU()}
             FEncoder(input_dims=1, dims=dims, head=head, layer=layer, kernel_size=9, act=act, stride=2)
             for _ in range(layer)])
+    def forward(self, feat, layer="encoder"):
         if self._counter < 1:
+            s = feat.get("spectrogram")
+            w = feat.get("waveform")
+            p = default(feat.get("f0"), feat.get("pitch"))
             plot_waveform(x=s, w=w, p=p, hop_length=128)
         enc = {}
+        enc.update(feat)
+        for f in self.features:
+            if f in feat and f in self.blocks:
+                x = feat[f]
+                for block in self.blocks[f]:
+                    x = block(x, feat=feat, layer=layer)
+                enc[f] = x
         if "encoder" in self.debug and self._counter % 100 == 0:
+            names = list(feat.keys())
+            shapes = {k: v.shape for k, v in feat.items()}
             print(f"Step {self._counter}: mode: {names}")
             print(f"shapes: {shapes}")
             for name, param in self.named_parameters():
                 if param.requires_grad:
+                    print(f"ENCODER LAYER {name}: grad_norm={param.median():.4f}")
         self._counter += 1
         return enc
         mask = torch.tril(torch.ones(ctx, ctx), diagonal=0)
         self.register_buffer("mask", mask, persistent=False)
+        rotary_emb = False
+        if rotary_emb:
+            self.rope = rotary(
+                dims=self.head_dim,
+                debug = debug,
+                radii=False,
+                learned_pitch=False,
+                learned_freq=False,
+                learned_theta=False,
+                learned_radius=False,
+                )
+        else:
+            self.rope = None
+    def forward(self, x, feat, order=None, layer='decoder') -> Tensor:
         bln = self.blend
         x = x.to(device)
         if order is None:
         mask = self.mask[:x.shape[1], :x.shape[1]]
         x = self.token(x) + self.positional[:x.shape[1]]
         x = F.dropout(x, p=self.dropout, training=self.training)
         for block in self.block:
+            x = block(x, xa=None, mask=mask, feat=feat, layer=layer)
         for f in order:
+            if f in feat:
+                xa = feat[f]
                 for block in self.blocks[f]:
+                    out = block(x=x, xa=xa, mask=None, feat=feat, layer=layer)
                 a = torch.sigmoid(bln[f])
                 x = a * out + (1 - a) * x
         x = self.ln_dec(x)
         if "decoder" in self.debug and self._counter % 100 == 0:
             for name, param in self.named_parameters():
                 if param.requires_grad:
+                    print(f"DECODER LAYER {name}: grad_norm={param.median():.4f}")
         self._counter += 1
         return x @ torch.transpose(self.token.weight.to(dtype), 0, 1).float()
 class Echo(nn.Module):
         spectrogram: torch.Tensor=None,
         pitch: Optional[torch.Tensor]=None,
         f0: Optional[torch.Tensor]=None,
+        f0d: Optional[torch.Tensor]=None,
         envelope: Optional[torch.Tensor]=None,
         phase: Optional[torch.Tensor]=None,
         ) -> Dict[str, torch.Tensor]:
             encoder_inputs["envelope"] = envelope
         if phase is not None:
             encoder_inputs["phase"] = phase
+        if f0 is not None:
+            encoder_inputs["f0"] = f0
+        if f0d is not None:
+            encoder_inputs["f0d"] = f0d
+        encoder_outputs = self.encoder(encoder_inputs)
         logits = self.decoder(input_ids, encoder_outputs)
         loss = None
                 print(f"{module_type}: {count}")
     def register_gradient_hooks(self):
         for name, param in self.named_parameters():
             if param.requires_grad:
                 if "encoder" in name:
                 elif "decoder" in name:
                     param.register_hook(lambda grad, n=name: self._print_decoder_grad(n, grad))
+        print("Gradient debugging hooks registered")
         return self
     def _print_encoder_grad(self, name, grad):
         if grad is not None and self.count == 10:
             norm = grad.median().item()
+            print(f"ENCODER GRAD: {name} = {norm:.6f}")
         return None
     def _print_decoder_grad(self, name, grad):
         if grad is not None and self.count == 10:
             norm = grad.median().item()
+            print(f"DECODER GRAD: {name} = {norm:.6f}")
         return None
     def reset_counter(self):
         self._counter = 0
         print("Counter reset to 0.")