Update model_simple.py
model_simple.py  +308 -193  CHANGED
@@ -7,15 +7,29 @@ from torch import nn, Tensor, einsum
 import numpy as np
 from dataclasses import dataclass
 from einops import rearrange
-
-from echoutils import *
-from transformers.trainer_seq2seq import Seq2SeqTrainer
-from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
+
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
 logging.basicConfig(level=logging.ERROR)

+def scaled_relu(x, sequence_length):
+    relu_output = torch.relu(x)
+    return relu_output / sequence_length
+
+def taylor_softmax(x, order=2):
+    tapprox = 1.0
+    for i in range(1, order + 1):
+        factorial_i = torch.exp(torch.lgamma(torch.tensor(i + 1, dtype=torch.float32)))
+        tapprox += x**i / factorial_i
+    return tapprox / torch.sum(tapprox, dim=-1, keepdim=True)
+
+def there_is_a(a):
+    return a is not None
+
+def AorB(a, b):
+    return a if there_is_a(a) else b
+
 def sinusoids(ctx, dims, max_tscale=10000):
     assert dims % 2 == 0
     pos = torch.log(torch.tensor(float(max_tscale))) / (dims // 2 - 1)
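Note on the helpers added above: `taylor_softmax` replaces the exponential with a truncated Taylor series (at order 2, 1 + x + x²/2, which is strictly positive) and renormalizes over the last dimension, while `scaled_relu` divides a ReLU by the sequence length. A minimal sanity-check sketch, not part of the commit; the test values and tolerance are assumptions:

import torch

def taylor_softmax(x, order=2):
    # mirrors the helper in this diff: truncated series for exp(x), then normalize
    tapprox = 1.0
    for i in range(1, order + 1):
        factorial_i = torch.exp(torch.lgamma(torch.tensor(i + 1, dtype=torch.float32)))
        tapprox += x**i / factorial_i
    return tapprox / torch.sum(tapprox, dim=-1, keepdim=True)

x = 0.1 * torch.randn(2, 8)
print(torch.allclose(taylor_softmax(x), torch.softmax(x, dim=-1), atol=1e-2))  # True for small logits

For logits far from zero the truncated series drifts away from the true softmax, so it is only a drop-in where the inputs stay small.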
@@ -40,52 +54,159 @@ def get_activation(act: str) -> nn.Module:
     }
     return act_map.get(act, nn.GELU())

-def there_is_a(val):
-    return val is not None
-
 @dataclass
 class Dimensions:
-
+    tokens: int
     mels: int
     ctx: int
     dims: int
     head: int
+    head_dim: int
     layer: int
     act: str

+def vectorized_taylor_sine(x, order=5):
+    original_shape = x.shape
+    x = x.flatten(0, -2)
+    exponents = torch.arange(1, order + 1, 2, device=x.device, dtype=torch.float32)
+    x_powers = x.unsqueeze(-1) ** exponents
+    factorials = torch.exp(torch.lgamma(exponents + 1))
+    signs = (-1)**(torch.arange(0, len(exponents), device=x.device, dtype=torch.float32))
+    terms = signs * x_powers / factorials
+    result = terms.sum(dim=-1)
+    return result.view(original_shape)
+
+def vectorized_taylor_cosine(x, order=5):
+    original_shape = x.shape
+    x = x.flatten(0, -2)
+    exponents = torch.arange(0, order + 1, 2, device=x.device, dtype=torch.float32)
+    x_powers = x.unsqueeze(-1) ** exponents
+    factorials = torch.exp(torch.lgamma(exponents + 1))
+    signs = (-1)**(torch.arange(0, len(exponents), device=x.device, dtype=torch.float32))
+    terms = signs * x_powers / factorials
+    result = terms.sum(dim=-1)
+    return result.view(original_shape)
+
 class rotary(nn.Module):
     def __init__(self, dims, head):
         super(rotary, self).__init__()
         self.dims = dims
         self.head = head
         self.head_dim = dims // head
+        self.taylor_order = 10

-        self.theta = nn.Parameter((torch.tensor(
+        self.theta = nn.Parameter((torch.tensor(360000, device=device, dtype=dtype)), requires_grad=False)
         self.register_buffer('freqs_base', self._compute_freqs_base(), persistent=False)

     def _compute_freqs_base(self):
         mel_scale = torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1
         return 200 * mel_scale / 1000

-    def forward(self, x) -> Tensor:
-
-
-        freqs =
-
-
-
-
-
-
-
-
-
-
+    def forward(self, x) -> torch.Tensor:
+        positions = (torch.arange(0, x.shape[2], device=x.device))
+        freqs = (self.theta / 220.0) * self.freqs_base
+        freqs = positions[:, None] * freqs
+        freqs_rescaled = (freqs + torch.pi) % (2 * torch.pi) - torch.pi
+
+        with torch.autocast(device_type="cuda", enabled=False):
+            cos = vectorized_taylor_cosine(freqs_rescaled, order=self.taylor_order)
+            sin = vectorized_taylor_sine(freqs_rescaled, order=self.taylor_order)
+        rotary_dim = cos.shape[-1]
+        x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
+        x_embed = (x_rot * cos) + (rotate_half(x_rot) * sin)
+        x_embed = torch.cat([x_embed, x_pass], dim=-1)
+        return x_embed.type_as(x)
+
+def taylor_sine(x, order=5):
+    result = torch.zeros_like(x)
+    for i in range(order + 1):
+        if i % 2 == 1:
+            term = x**i / torch.exp(torch.lgamma(torch.tensor(i + 1, dtype=torch.float32)))
+            if (i // 2) % 2 == 1:
+                result -= term
+            else:
+                result += term
+    return result
+
+def taylor_cosine(x, order=5):
+    result = torch.zeros_like(x)
+    for i in range(order + 1):
+        if i % 2 == 0:
+            term = x**i / torch.exp(torch.lgamma(torch.tensor(i + 1, dtype=torch.float32)))
+            if (i // 2) % 2 == 1:
+                result -= term
+            else:
+                result += term
+    return result
+
+class rotarya(nn.Module):
+    def __init__(self, dims, head):
+        super(rotary, self).__init__()
+        self.dims = dims
+        self.head = head
+        self.head_dim = dims // head
+        self.taylor_order = 5
+
+        self.theta = nn.Parameter((torch.tensor(1600, device=device, dtype=dtype)), requires_grad=False)
+        self.register_buffer('freqs_base', self._compute_freqs_base(), persistent=False)
+
+    def _compute_freqs_base(self):
+        mel_scale = torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1
+        return 200 * mel_scale / 1000
+
+    def forward(self, x) -> torch.Tensor:
+
+        positions = (torch.arange(0, x.shape[2], device=x.device))
+        freqs = (self.theta / 220.0) * self.freqs_base
+        freqs = positions[:, None] * freqs
+        freqs = (freqs + torch.pi) % (2 * torch.pi) - torch.pi
+        with torch.autocast(device_type="cuda", enabled=False):
+            cos = taylor_cosine(freqs, order=self.taylor_order)
+            sin = taylor_sine(freqs, order=self.taylor_order)
+        rotary_dim = cos.shape[-1]
+        x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
+        x_embed = (x_rot * cos) + (rotate_half(x_rot) * sin)
+        x_embed = torch.cat([x_embed, x_pass], dim=-1)
+        return x_embed.type_as(x)
+
+def rotate_half(x):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+# class rotary(nn.Module):
+#     def __init__(self, dims, head):
+#         super(rotary, self).__init__()
+#         self.dims = dims
+#         self.head = head
+#         self.head_dim = dims // head
+
+#         self.theta = nn.Parameter((torch.tensor(1600, device=device, dtype=dtype)), requires_grad=False)
+#         # self.register_buffer('freqs_base', self._compute_freqs_base(), persistent=False)
+
+#     def _compute_freqs_base(self):
+#         mel_scale = torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 4000/200)), self.head_dim // 2, device=device, dtype=dtype) / 2595) - 1
+#         return 200 * mel_scale / 1000
+
+#     def forward(self, x) -> Tensor:
+#         positions = (torch.arange(0, x.shape[2], device=x.device))
+#         freqs = (self.theta / 220.0) * self._compute_freqs_base()
+#         freqs = positions[:, None] * freqs
+
+#         with torch.autocast(device_type="cuda", enabled=False):
+#             freqs = torch.polar(torch.ones_like(freqs), freqs)
+#             x1 = x[..., :freqs.shape[-1]*2]
+#             x2 = x[..., freqs.shape[-1]*2:]
+#             orig_shape = x1.shape
+#             x1 = x1.float().reshape(*x1.shape[:-1], -1, 2).contiguous()
+#             x1 = torch.view_as_complex(x1) * freqs
+#             x1 = torch.view_as_real(x1).flatten(-2)
+#             x1 = x1.view(orig_shape)
+#             return torch.cat([x1.type_as(x), x2], dim=-1)

 class attentiona(nn.Module):
     def __init__(self, dims: int, head: int):
         super().__init__()
-
         self.head = head
         self.dims = dims
         self.head_dim = dims // head
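The rewritten `rotary.forward` above wraps its phases into [-π, π] (`freqs_rescaled`) before calling the polynomial sine/cosine, because a Taylor expansion around zero is only accurate near the origin. A small self-contained illustration of that range reduction, not part of the commit; the series degree and tolerance here are assumptions:

import torch

def taylor_sin(x, order=9):
    k = torch.arange(1, order + 1, 2, dtype=torch.float32)         # odd powers 1, 3, ..., 9
    signs = (-1.0) ** torch.arange(len(k), dtype=torch.float32)    # alternating series signs
    return (signs * x.unsqueeze(-1) ** k / torch.exp(torch.lgamma(k + 1))).sum(-1)

phase = torch.linspace(-20.0, 20.0, 101)
wrapped = (phase + torch.pi) % (2 * torch.pi) - torch.pi           # same rescaling as in the diff
print(torch.allclose(taylor_sin(wrapped), torch.sin(phase), atol=1e-2))  # True: accurate once wrapped
print(torch.allclose(taylor_sin(phase), torch.sin(phase), atol=1e-2))    # False: raw phases diverge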
@@ -95,25 +216,23 @@ class attentiona(nn.Module):
         self.zmax = 1e-5
         self.zero = nn.Parameter(torch.tensor(1e-4, device=device, dtype=dtype), requires_grad=False)

-        self.q = nn.Linear(dims, dims
+        self.q = nn.Linear(dims, dims)
         self.kv = nn.Linear(dims, dims * 2, bias=False)
-        self.out = nn.Linear(dims, dims
+        self.out = nn.Linear(dims, dims)

         self.lna = nn.LayerNorm(dims)
+        self.lnb = nn.LayerNorm(dims // head)
         self.rope = rotary(dims, head)

-    def forward(self, x, xa = None, mask = None):
+    def forward(self, x, xa = None, mask = None, positions = None):
         zero = self.zero

-        q = self.q(
+        q = self.q(x)
         k, v = self.kv(self.lna(x if xa is None else xa)).chunk(2, dim=-1)
         q, k, v = map(lambda t: rearrange(t, 'b c (h d) -> b h c d', h = self.head), (q, k, v))
         scale = q.shape[-1] ** -0.5

-
-        k = self.rope(k)
-
-        qk = einsum('b h k d, b h q d -> b h k q', q, k) * scale
+        qk = einsum('b h k d, b h q d -> b h k q', self.lnb(q), self.lnb(k)) * scale

         scale = torch.ones_like(k[:, :, :, 0])
         zero = torch.clamp(F.softplus(zero), 1e-6, 1e-5)
@@ -134,7 +253,7 @@ class attentiona(nn.Module):
         return out

 class tgate(nn.Module):
-    def __init__(self, dims, num_types=
+    def __init__(self, dims, num_types=1):
         super().__init__()
         self.gates = nn.ModuleList([nn.Sequential(nn.Linear(dims, dims), nn.Sigmoid()) for _ in range(num_types)])
         self.classifier = nn.Sequential(nn.Linear(dims, num_types), nn.Softmax(dim=-1))
@@ -145,78 +264,189 @@ class tgate(nn.Module):
         return cgate

 class residual(nn.Module):
-    def __init__(self, dims: int, head: int, act
+    def __init__(self, dims: int, head: int, layer = 2, act = "silu"):
         super().__init__()

-        self.lna = nn.LayerNorm(dims, bias=False)
+        self.lna = nn.LayerNorm(dims, bias=False)
         self.atta = attentiona(dims, head)
+        self.dsl = skip_layer(dims, head, layer=2)

         self.tgate = tgate(dims, num_types=1)
         self.mlp = nn.Sequential(nn.Linear(dims, dims*4), get_activation(act), nn.Linear(dims*4, dims))

-    def forward(self, x: Tensor, xa = None, mask = None):
-
-
-
-            x = x + out
-        else:
-            x = out
-        if xa is not None:
-            x = x + self.atta(x, xa, mask=None)
-
+    def forward(self, x: Tensor, xa = None, mask = None, positions=None):
+        # log = {}
+        x = x + self.atta(self.lna(x), xa=xa, mask=mask)
+        x, _ = self.dsl(self.lna(x), xa=xa, mask=mask)  # _ outputs logs for jumps
         x = x + self.tgate(x)
         x = x + self.mlp(self.lna(x))
+        # print(results['jumps'])
+        # log['jumps'] = l
         return x
+
+class skip_layer(nn.Module):
+    def __init__(self, dims, head, layer, threshold=0.1):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        self.layer = layer
+
+        self.threshold = threshold
+        self.dims = dims
+        self.head = head
+        self.head_dim = dims // head
+
+        self.attention_module = attentiona(dims, head)
+        self.node_predictors = nn.ModuleList([
+            nn.Sequential(
+                nn.LayerNorm(dims),
+                nn.Linear(dims, 1),
+                nn.Sigmoid()
+            ) for _ in range(layer)
+        ])
+
+        for i in range(layer):
+            self.layers.append(nn.ModuleDict({
+                'ln': nn.LayerNorm(dims),
+                'gate': nn.Sequential(nn.Linear(dims, 1), nn.Sigmoid()),
+                'adapter': nn.Linear(dims, dims) if i % 2 == 0 else None
+            }))
+
+        self.policy_net = nn.Sequential(
+            nn.Linear(dims, 128),
+            nn.ReLU(),
+            nn.Linear(128, 3))
+
+        self.jump_weights = nn.Parameter(torch.tensor([0.1, 0.05, 0.01]))
+
+        n_mlp = dims * 4
+        self.mlp_gate = nn.Sequential(nn.Linear(dims, 1), nn.Sigmoid())
+        self.mlp = nn.Sequential(nn.Linear(dims, n_mlp), nn.GELU(), nn.Linear(n_mlp, dims))
+        self.mlp_ln = nn.LayerNorm(dims)
+        self.working_memory = nn.Parameter(torch.zeros(1, 1, dims))
+        self.memory_gate = nn.Sequential(nn.Linear(dims, 1), nn.Sigmoid())
+
+    def _calculate_shared_attention(self, x, mask=None):
+        return self.attention_module(x, xa=x, mask=None)
+
+    def predict_node_importance(self, x, layer_idx):
+        importance = self.node_predictors[layer_idx](x)
+        return (importance > self.threshold).float()
+
+    def forward(self, x, xa=None, mask=None):
+        batch, ctx = x.shape[:2]
+
+        working_memory = self.working_memory.expand(batch, -1, -1)
+        original_x = x
+        pooled_representation = x.mean(dim=1)
+        policy_logits = self.policy_net(pooled_representation)
+        policy = F.softmax(policy_logits, dim=-1)
+
+        jump_history = []
+        i = 0
+        while i < self.layer:
+            layer = self.layers[i]
+            node_importance = self.predict_node_importance(x, i)
+            if node_importance.mean() < 0.2 and i > 0:
+                i += 1
+                jump_history.append(i)
+                continue
+
+            norm_x = layer['ln'](x)
+            importance_mask_base = node_importance.unsqueeze(1).contiguous()
+            combined_custom_mask = None
+            if mask is None:
+                combined_custom_mask = importance_mask_base
+            else:
+                combined_custom_mask = mask.contiguous() * importance_mask_base
+
+            if node_importance.mean() > 0.3:
+                attn_output = self._calculate_shared_attention(norm_x, mask=combined_custom_mask.contiguous())
+                if layer['adapter'] is not None:
+                    attn_output = layer['adapter'](attn_output)
+
+                gate_value = layer['gate'](norm_x)
+                x = x + gate_value * attn_output
+                memory_gate = self.memory_gate(x)
+                working_memory = memory_gate * working_memory + (1 - memory_gate) * x.mean(dim=1, keepdim=True)
+
+            jump_prob = policy[:, 1] if i < self.layer - 1 else torch.zeros_like(policy[:, 1])
+            should_jump = (torch.rand_like(jump_prob) < jump_prob).any()
+
+            if should_jump:
+                jump_length = torch.multinomial(policy, 1)[:, 0].max().item() + 1
+                i_next = min(i + jump_length, self.layer - 1)
+                skip_weight = self.jump_weights[min(jump_length-1, 2)]
+                x = x + skip_weight * original_x + (1-skip_weight) * working_memory
+                i = i_next
+                jump_history.append(i)
+            else:
+                i += 1

+        mlp_importance = self.mlp_gate(x)
+        mlp_output = self.mlp(self.mlp_ln(x))
+        x = x + mlp_importance * mlp_output
+        return x, {'jumps': jump_history}

 class processor(nn.Module):
-    def __init__(self,
+    def __init__(self, tokens, mels, ctx, dims, head, head_dim, layer, act):
         super(processor, self).__init__()

+        act_fn = get_activation(act)
         self.ln = nn.LayerNorm(dims)
-        self.token = nn.Embedding(
+        self.token = nn.Embedding(tokens, dims)
         self.audio = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)

         self.positions = nn.Parameter(torch.empty(ctx, dims), requires_grad=True)
         self.blend = nn.Parameter(torch.tensor(0.5, device=device, dtype=dtype), requires_grad=True)
-
-        act_fn = get_activation(act)
+
         self.encoder = nn.Sequential(
-            nn.Conv1d(
+            nn.Conv1d(mels, dims, kernel_size=3, stride=1, padding=1), act_fn,
             nn.Conv1d(dims, dims, kernel_size=3, stride=1, padding=1), act_fn,
             nn.Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)

-
+        modal = False
+        self.block = nn.ModuleList([residual(dims, head, layer, act_fn) for _ in range(layer)]) if modal else None
+
+        self.res = residual(dims, head, layer, act_fn)
         mask = torch.empty(ctx, ctx).fill_(-np.inf).triu_(1)
         self.register_buffer("mask", mask, persistent=False)

-    def
-
+    def init_memory(self, batch):
+        return torch.zeros(batch, 1, self.dims).to(next(self.parameters()).device)
+
+    def update_memory(self, x, working_memory):
+        return (x + working_memory) / 2

+    def forward(self, x, xa, enc=None, sequential=False, modal=False, blend=False, kv_cache=None) -> Tensor:
+
+        mask = self.mask[:x.shape[1], :x.shape[1]]
         offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
         x = (self.token(x.long()) + self.positions[offset : offset + x.shape[-1]])

         xa = self.encoder(xa).permute(0, 2, 1)
         xa = xa + self.audio(xa.shape[1], xa.shape[-1], 36000.0).to(device, dtype)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if
-
-
-
-
+        xa = self.res(xa, None, None)
+        x = self.res(x, None, mask)
+        x = self.res(x, xa, None)
+
+        if blend:
+            if sequential:
+                y = x
+            else:
+                a = torch.sigmoid(self.blend)
+                x = a * x + (1 - a) * y
+
+        if modal:
+            for block in chain(self.block or []):
+                xm = block(torch.cat([x, xa], dim=1), mask=mask) if modal else None
+                x = block(xm[:, :x.shape[1]], xm[:, x.shape[1]:], mask=None) if modal else x
+                if blend:
+                    if sequential:
+                        y = x
+                    else:
+                        a = torch.sigmoid(self.blend)
+                        x = a * x + (1 - a) * y

         x = nn.functional.dropout(x, p=0.001, training=self.training)
         x = self.ln(x)
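`skip_layer`, added above, decides per layer whether to run the shared attention (via the node-importance predictor) and whether to jump ahead, sampling a jump from a 3-way policy head over mean-pooled features; it returns `(x, {'jumps': [...]})`, which `residual.forward` unpacks as `x, _ = self.dsl(...)`. A self-contained sketch of just the jump sampling, with an assumed `dims` of 64 and a made-up batch size:

import torch

policy_net = torch.nn.Sequential(torch.nn.Linear(64, 128), torch.nn.ReLU(), torch.nn.Linear(128, 3))
pooled = torch.randn(2, 64)                          # x.mean(dim=1): (batch, dims)
policy = torch.softmax(policy_net(pooled), dim=-1)   # (batch, 3)

jump_prob = policy[:, 1]                             # probability of jumping at this layer
should_jump = (torch.rand_like(jump_prob) < jump_prob).any()
if should_jump:
    jump_length = torch.multinomial(policy, 1)[:, 0].max().item() + 1   # jump 1-3 layers ahead
    print("jump", jump_length)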
@@ -228,18 +458,19 @@ class Model(nn.Module):
         super().__init__()
         self.param = param
         self.processor = processor(
-
+            tokens=param.tokens,
             mels=param.mels,
             ctx=param.ctx,
             dims=param.dims,
             head=param.head,
+            head_dim=param.head_dim,
             layer=param.layer,
             act=param.act)

     def forward(self, labels=None, input_ids=None, pitch=None, pitch_tokens=None, spectrogram=None, waveform=None):

         x = input_ids
-        xa = pitch
+        xa = AorB(pitch, spectrogram)

         enc = {}
         if spectrogram is not None:
@@ -248,6 +479,8 @@ class Model(nn.Module):
             enc["waveform"] = waveform
         if pitch is not None:
             enc["pitch"] = pitch
+        if pitch_tokens is not None:
+            enc["ptokens"] = pitch_tokens

         logits = self.processor(x, xa, enc)
         loss = None
@@ -259,7 +492,7 @@ class Model(nn.Module):
     def _init_weights(self, module):
         self.init_counts = {
             "Linear": 0, "Conv1d": 0, "LayerNorm": 0, "RMSNorm": 0,
-            "Conv2d": 0, "processor": 0, "
+            "Conv2d": 0, "processor": 0, "attentiona": 0, "Residual": 0}
         for name, module in self.named_modules():
             if isinstance(module, nn.RMSNorm):
                 nn.init.ones_(module.weight)
@@ -295,121 +528,3 @@ class Model(nn.Module):
         for module_type, count in self.init_counts.items():
             if count > 0:
                 print(f"{module_type}: {count}")
-
-def prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, streaming=True, load_saved=False, save_dataset=True, cache_dir='E:/hf', extract_args=None, max_ctx=2048):
-
-    if load_saved:
-        if cache_dir is None:
-            cache_dir = cache_dir
-        else:
-            cache_dir = cache_dir
-
-        os.makedirs(cache_dir, exist_ok=True)
-        cache_file_train = os.path.join(cache_dir, "train.arrow")
-        cache_file_test = os.path.join(cache_dir, "test.arrow")
-
-        if os.path.exists(cache_file_train) and os.path.exists(cache_file_test):
-            from datasets import Dataset
-            train_dataset = Dataset.load_from_disk(cache_file_train)
-            test_dataset = Dataset.load_from_disk(cache_file_test)
-            return train_dataset, test_dataset
-
-    def filter_func(x):
-        return (0 < len(x["transcription"]) < max_ctx and
-            len(x["audio"]["array"]) > 0 and
-            len(x["audio"]["array"]) < max_ctx * 160)
-
-    raw_train = load_dataset(
-        "google/fleurs", "en_us", token=token, split="train", streaming=streaming).take(1000)
-    raw_test = load_dataset(
-        "google/fleurs", "en_us", token=token, split="test", streaming=streaming).take(100)
-
-    raw_train = raw_train.filter(filter_func).cast_column("audio", Audio(sampling_rate=sample_rate))
-    raw_test = raw_test.filter(filter_func).cast_column("audio", Audio(sampling_rate=sample_rate))
-    train_dataset = raw_train.map(lambda x: extract_features(x, tokenizer, **extract_args)).remove_columns(["audio", "transcription"])
-    test_dataset = raw_test.map(lambda x: extract_features(x, tokenizer, **extract_args)).remove_columns(["audio", "transcription"])
-    train_dataset.save_to_disk(cache_file_train) if save_dataset is True else None
-    test_dataset.save_to_disk(cache_file_test) if save_dataset is True else None
-    return train_dataset, test_dataset
-
-def main():
-    token = ""
-    log_dir = os.path.join('D:/newmodel/output/logs/', datetime.now().strftime('%m-%d_%H_%M_%S'))
-    os.makedirs(log_dir, exist_ok=True)
-    tokenizer = setup_tokenizer("D:/newmodel/mod5/tokenizer.json")
-
-    extract_args = {
-        "waveform": False,
-        "spec": False,
-        "pitch_tokens": False,
-        "pitch": True,
-        "harmonics": False,
-        "aperiodics": False,
-        "phase_mod": False,
-        "crepe": False,
-        "sample_rate": 16000,
-        "hop_length": 256,
-        "mode": "mean",
-        "debug": False,
-    }
-
-    param = Dimensions(vocab=40000, mels=128, ctx=2048, dims=512, head=4, layer=4, act="swish")
-
-    train_dataset, test_dataset = prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, streaming=False,
-        load_saved=False, save_dataset=False, cache_dir=None, extract_args=extract_args, max_ctx=param.ctx)
-
-    model = Model(param).to('cuda')
-    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
-    print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
-
-    from functools import partial
-    metrics_fn = partial(compute_metrics, print_pred=True, num_samples=1, tokenizer=tokenizer, model=model)
-
-    training_args = Seq2SeqTrainingArguments(
-        output_dir=log_dir,
-        per_device_train_batch_size=1,
-        per_device_eval_batch_size=1,
-        max_steps=1000,
-        eval_steps=100,
-        save_steps=100,
-        warmup_steps=10,
-        logging_steps=10,
-        logging_dir=log_dir,
-        logging_strategy="steps",
-        eval_strategy="steps",
-        save_strategy="no",
-        report_to=["tensorboard"],
-        push_to_hub=False,
-        save_total_limit=1,
-        label_names=["labels"],
-        save_safetensors=False,
-        eval_on_start=False,
-        batch_eval_metrics=False,
-        disable_tqdm=False,
-        include_tokens_per_second=True,
-        include_num_input_tokens_seen=True,
-        learning_rate=0.00025,
-        weight_decay=0.025,
-    )
-
-    optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate, eps=1e-10, weight_decay=training_args.weight_decay, betas=(0.9, 0.999), amsgrad=False, foreach=False, fused=False, capturable=False, differentiable=False, maximize=False)
-
-    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=training_args.max_steps, eta_min=1e-9, last_epoch=-1)
-
-    trainer = Seq2SeqTrainer(
-        args=training_args,
-        model=model,
-        train_dataset=train_dataset,
-        eval_dataset=test_dataset,
-        data_collator=DataCollator(tokenizer=tokenizer),
-        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
-        compute_metrics=metrics_fn,
-        optimizers=(optimizer, scheduler)
-    )
-
-    model.init_weights()
-    trainer.train()
-
-if __name__ == "__main__":
-    main()
-
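With `prepare_datasets` and `main` removed and the old `vocab` field gone, model construction now goes through the updated `Dimensions` signature. A hedged construction sketch that reuses the sizes from the deleted `main()` and assumes `tokens` takes over the old `vocab` count and that `head_dim = dims // head`:

# Assumed usage, mirroring the removed main(); not part of this commit.
param = Dimensions(tokens=40000, mels=128, ctx=2048, dims=512,
                   head=4, head_dim=512 // 4, layer=4, act="swish")
model = Model(param).to(device)
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")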