Sin2pi
/

asr-model

@@ -1,5 +1,6 @@
 import warnings
 import logging
 from itertools import chain
 import torch
@@ -7,38 +8,18 @@ from torch import nn, Tensor, einsum
 from typing import Optional
 import numpy as np
 from dataclasses import dataclass
-from torch.nn.functional import scaled_dot_product_attention
 from einops import rearrange
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
 logging.basicConfig(level=logging.ERROR)
-def sinusoids(ctx, dims, max_tscale=10000):
-    assert dims % 2 == 0
-    pos = torch.log(torch.tensor(float(max_tscale))) / (dims // 2 - 1)
-    tscales = torch.exp(-pos * torch.arange(dims // 2, device=device, dtype=torch.float32))
-    scaled = torch.arange(ctx, device=device, dtype=torch.float32).unsqueeze(1) * tscales.unsqueeze(0)
-    position = torch.cat([torch.sin(scaled), torch.cos(scaled)], dim=1)
-    positional_embedding = nn.Parameter(position, requires_grad=True)
-    return positional_embedding
-def get_activation(act: str) -> nn.Module:
-    act_map = {
-        "gelu": nn.GELU(),
-        "relu": nn.ReLU(),
-        "sigmoid": nn.Sigmoid(),
-        "tanh": nn.Tanh(),
-        "swish": nn.SiLU(),
-        "tanhshrink": nn.Tanhshrink(),
-        "softplus": nn.Softplus(),
-        "softshrink": nn.Softshrink(),
-        "leaky_relu": nn.LeakyReLU(),
-        "elu": nn.ELU()
-    }
-    return act_map.get(act, nn.GELU())
 def there_is_a(val):
     return val is not None
@@ -105,12 +86,22 @@ class rotary(nn.Module):
         x1 = x1.view(orig_shape)
         return torch.cat([x1.type_as(x), x2], dim=-1)
-def calculate_attention(q, k, v, mask=None, temp=1.0):
     scaled_q = q
     if temp != 1.0 and temp > 0:
         scaled_q = q * (1.0 / temp)**.5
-        print(temp)
-    out = scaled_dot_product_attention(scaled_q, k, v, is_causal=mask is not None and q.shape[1] > 1)
     return out
 class LocalOut(nn.Module):
@@ -128,27 +119,24 @@ class LocalOut(nn.Module):
         return attn_output.transpose(1, 2).contiguous().view(batch, ctx, self.dims)
 class attentionb(nn.Module):
-    def __init__(self, dims: int, head: int, max_iter: int = 3,
-    threshold: float = 0.01, factor: float = 0.1, dropout: float = 0.1, temp = 1.0):
         super(attentionb, self).__init__()
         self.head = head
         self.dims = dims
         self.head_dim = dims // head
-        self.win = 0
         self.que = nn.Linear(dims, dims, bias=False)
         self.kv = nn.Linear(dims, dims * 2, bias=False)
         self.out = nn.Linear(dims, dims, bias=False)
         self.lna = nn.LayerNorm(dims)
-        self.lnb = nn.LayerNorm(self.head_dim)
         self.rope = rotary(dims, head)
         self.max_iter = max_iter
         self.threshold = nn.Parameter(torch.tensor(threshold), requires_grad=True)
         self.temp = nn.Parameter(torch.tensor(temp), requires_grad=True)
-        self.factor = nn.Parameter(torch.tensor(factor), requires_grad=True)
         self.local = LocalOut(dims, head)
     def update_win(self, win_size=None):
@@ -163,7 +151,7 @@ class attentionb(nn.Module):
     def _focus(self, x, xa = None, mask = None, win_size=None):
         q = self.que(self.lna(x))
-        k, v = self.kv(self.lna(x if not xa else xa)).chunk(2, dim=-1)
         q, k, v = map(lambda t: rearrange(t, 'b c (h d) -> b h c d', h = self.head), (q, k, v))
         self.scale = q.shape[-1] ** -0.35
@@ -171,17 +159,14 @@ class attentionb(nn.Module):
         k = self.rope(k)
         iteration = 0
-        temp = self.temp
         prev_out = torch.zeros_like(q)
         attn_out = torch.zeros_like(q)
         threshold = self.threshold
-        factor = self.factor
         curq = q #if curq is None else curq
         while iteration < self.max_iter:
-            eff_span = min(curq.shape[1], k.shape[1])
-            if xa is not None:
-                eff_span = min(eff_span, xa.shape[1])
             if eff_span == 0:
                 break
@@ -206,24 +191,18 @@ class attentionb(nn.Module):
             iter_out = torch.zeros_like(curq)
             iter_out[:, :, :eff_span, :] = attn_iter
             diff = torch.abs(iter_out - prev_out).mean()
-            dthresh = threshold + factor * diff
-            if diff < dthresh and iteration > 0:
                 attn_out = iter_out
                 break
             prev_out = iter_out.clone()
             curq = curq + iter_out
             attn_out = iter_out
-            # if win_size is not None:
-            #     if win_size > self.win:
-            #         temp += 0.005
-            #     else:
-            #         temp -= 0.005
-            #     self.win = win_size
             iteration += 1
-        out = attn_out.permute(0, 2, 1, 3).flatten(start_dim=2)
-        return out
     def _slide_win_local(self, x, mask = None) -> Tensor:
@@ -260,26 +239,45 @@ class attentionb(nn.Module):
     def forward(self, x, xa = None, mask = None):
             x = self._slide_win_local(x, mask=None)
             xa = self._slide_win_local(xa, mask=None)
-            output = self._focus(x, xa, mask=None)
-            return self.out(output)
 class attentiona(nn.Module):
-    def __init__(self, dims: int, head: int, dropout_rate: float = 0.1, cross_talk=False):
         super().__init__()
         self.head = head
         self.dims = dims
-        self.cross_talk = cross_talk
         self.que = nn.Linear(dims, dims, bias=False)
         self.kv = nn.Linear(dims, dims * 2, bias=False)
         self.out = nn.Linear(dims, dims, bias=False)
         self.ln = nn.LayerNorm(dims)
         self.rope = rotary(dims, head)
-        self.x = nn.Conv2d(head, head, 1, bias = False) if cross_talk else None
-        self.xa = nn.Conv2d(head, head, 1, bias = False) if cross_talk else None
     def forward(self, x, xa = None, mask = None):
         q = self.que(self.ln(x))
@@ -292,16 +290,13 @@ class attentiona(nn.Module):
         k = self.rope(k)
         qk = einsum('b h k d, b h q d -> b h k q', q, k) * scale
-        if there_is_a(mask):
-            i, j = qk.shape[-2:]
-            mask = torch.ones(i, j, device = q.device, dtype = torch.bool).triu(j - i + 1)
-            qk = qk.masked_fill(mask,  -torch.finfo(qk.dtype).max)
-        qk = torch.nn.functional.softmax(qk, dim=-1)
         wv = einsum('b h k q, b h q d -> b h k d', qk, v)
         wv = rearrange(wv, 'b h c d -> b c (h d)')
         out = self.out(wv)
 class attentiond(nn.Module):
     def __init__(self, dims: int, head: int):
@@ -324,11 +319,11 @@ class attentiond(nn.Module):
         qk, v = self.kv(self.ln(x)).chunk(2, dim=-1)
         qka, va = self.kv(self.ln(x if xa is None else xa)).chunk(2, dim=-1)
         qk, qka, v, va = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.head), (qk, qka, v, va))
-        qk = einsum('b h i d, b h j d -> b h i j', qk, qka)
         if there_is_a(mask):
-            i, j = qk.shape[-2:]
-            mask = torch.ones(i, j, device=device, dtype=torch.bool).triu(j - i + 1)
-            qk = qk.masked_fill(mask, -torch.finfo(qk.dtype).max)
         x = qk.softmax(dim = -1)
         xa = qk.softmax(dim = -2)
         x = self.x(x)
@@ -337,15 +332,13 @@ class attentiond(nn.Module):
         xa = einsum('b h j i, b h j d -> b h i d', xa, v)
         x, xa = map(lambda t: rearrange(t, 'b h n d -> b n (h d)'), (x, xa))
         out = self.out(x)
-        outxa = self.out(xa)
-        return out, outxa, qk
 class tgate(nn.Module):
     def __init__(self, dims, num_types=4):
         super().__init__()
-        self.gates = nn.ModuleList([nn.Sequential(nn.Linear(dims, 1), nn.Sigmoid()) for _ in range(num_types)])
-        self.classifier = nn.Sequential(nn.Linear(dims, num_types), nn.Softmax(dim=-1))
     def forward(self, x):
         types = self.classifier(x)
         gates = torch.stack([gate(x) for gate in self.gates], dim=-1)
@@ -359,32 +352,34 @@ class residual(nn.Module):
         self.lna = nn.LayerNorm(dims, bias=False)
         self.atta = attentiona(dims, head)
         self.attb = attentionb(dims, head, max_iter=1)
-        # self.attc = attentiona(dims, head, cross_talk=True)
-        self.tgate = tgate(dims, num_types=4)
         self.mlp = nn.Sequential(nn.Linear(dims, dims*4), get_activation(act), nn.Linear(dims*4, dims))
-    def forward(
-        self,
-        x: Tensor,
-        xa: Optional[Tensor] = None,
-        mask: Optional[Tensor] = None,
-    ):
-        x = x + self.atta(x, mask=mask)[0]
         if xa is not None:
-            x = x + self.attb(x, xa, mask=None)
         x = x + self.tgate(x)
-        x = x + self.mlp(self.lna(x))
         return x
 class processor(nn.Module):
     def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int, act: str = "gelu"):
         super(processor, self).__init__()
         self.ln = nn.LayerNorm(dims)
         self.token = nn.Embedding(vocab, dims)
         self.audio = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
         self.positions = nn.Parameter(torch.empty(ctx, dims), requires_grad=True)
         act_fn = get_activation(act)
         self.encoder = nn.Sequential(
@@ -393,12 +388,13 @@ class processor(nn.Module):
             nn.Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)
         self.blocka = nn.ModuleList([residual(dims, head, act_fn) for _ in range(layer)])
-        self.blockm = nn.ModuleList([residual(dims, head, act_fn) for _ in range(layer // 2)])
         mask = torch.empty(ctx, ctx).fill_(-np.inf).triu_(1)
         self.register_buffer("mask", mask, persistent=False)
-    def forward(self, x, xa, sequential=False, modal=False, kv_cache=None) -> Tensor:
         if xa.dim() == 2:
             xa = xa.unsqueeze(0)
@@ -413,10 +409,22 @@ class processor(nn.Module):
             xa = block(xa, mask=None)
             x  = block(x, mask=self.mask)
             x  = block(x, xa, mask=None)
         for block in chain(self.blockm or []):
             xm = block(torch.cat([x, xa], dim=1), torch.cat([x, xa], dim=1), mask=None) if modal else None
             x  = block(xm[:, :x.shape[1]], xm[:, x.shape[1]:], mask=None) if modal else x
         x = nn.functional.dropout(x, p=0.001, training=self.training)
         x = self.ln(x)
@@ -447,7 +455,7 @@ class Model(nn.Module):
     def adjust_window(self, loss, ctx):
         self.win_size = ((ctx // self.param.head))
         if loss < self.best_loss:
-            win_size = (self.win_size * self.factor) #.clamp(0, ctx - 1)
         else:
             win_size = (self.win_size // self.factor).clamp(0, self.win_size - 1)
         self.win_size = win_size
@@ -455,15 +463,25 @@ class Model(nn.Module):
         self.update(win_size)
         return win_size
-    def forward(self, labels=None, input_ids=None, pitch=None, pitch_tokens=None, spectrogram=None):
         x = input_ids
-        xa = pitch if pitch is not None else spectrogram         # xb = pitch_tokens
-        logits = self.processor(x, xa)
         loss = None
         if labels is not None:
             loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.shape[-1]), labels.view(-1), ignore_index=0)
-        self.adjust_window(loss=loss.item(), ctx=xa.shape[2])
         return {"logits": logits, "loss": loss}
     def _init_weights(self, module):
@@ -522,3 +540,116 @@ class Model(nn.Module):
                 hooks.append(layer.v.register_forward_hook(save_to_cache))
         self.processor.apply(install_hooks)
         return cache, hooks

 import warnings
+import os
 import logging
 from itertools import chain
 import torch
 from typing import Optional
 import numpy as np
 from dataclasses import dataclass
 from einops import rearrange
+from datasets import load_dataset, Audio
+from echoutils import extract_features, setup_tokenizer, compute_metrics, DataCollator, preprocess_logits_for_metrics, sinusoids, get_activation
+from datetime import datetime
+from transformers.trainer_seq2seq import Seq2SeqTrainer
+from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 dtype = torch.float32
 warnings.filterwarnings("ignore")
 logging.basicConfig(level=logging.ERROR)
 def there_is_a(val):
+    """Return True when ``val`` is not None."""
     return val is not None
         x1 = x1.view(orig_shape)
         return torch.cat([x1.type_as(x), x2], dim=-1)
+def calculate_attention(q, k, v, mask=None, temp=1.0, pytorch=True):
+    """Compute attention either with PyTorch's fused kernel or by hand.
+
+    Args:
+        q, k, v: query/key/value tensors. The manual branch permutes four
+            axes, so it assumes a (batch, head, seq, head_dim) layout.
+        mask: when not None, the fused branch runs causally (the mask values
+            themselves are not passed on); the manual branch masks every
+            position where ``mask.bool()`` is True.
+        temp: temperature; when positive and != 1 the queries are multiplied
+            by ``(1 / temp) ** 0.5`` before scoring, in both branches.
+        pytorch: True selects ``scaled_dot_product_attention``.
+
+    Returns:
+        The attention output.
+        NOTE(review): the fused branch returns the same layout as ``q``,
+        while the manual branch returns heads merged as
+        (batch, seq, head * head_dim) -- confirm callers expect this.
+    """
     scaled_q = q
     if temp != 1.0 and temp > 0:
         scaled_q = q * (1.0 / temp)**.5
+    if pytorch:
+        # The sequence axis of SDPA inputs is -2; a single query must not be
+        # run causally or it would only see the first key.
+        out = torch.nn.functional.scaled_dot_product_attention(scaled_q, k, v, is_causal=mask is not None and q.shape[-2] > 1)
+    else:
+        scale = q.shape[-1] ** -0.35
+        # Use the temperature-scaled queries so ``temp`` is honoured here too.
+        qk = (scaled_q * scale) @ (k * scale).transpose(-1, -2)
+        if mask is not None:
+            # Crop to (query_len, key_len) so rectangular scores are supported.
+            mask = mask[:qk.shape[-2], :qk.shape[-1]]
+            qk = qk.masked_fill(mask.bool(), -torch.inf)
+        # Softmax in float32 for stability, then return to the input dtype.
+        w = torch.nn.functional.softmax(qk.float(), dim=-1).to(q.dtype)
+        out = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
     return out
 class LocalOut(nn.Module):
         return attn_output.transpose(1, 2).contiguous().view(batch, ctx, self.dims)
 class attentionb(nn.Module):
+    def __init__(self, dims: int, head: int, max_iter: int = 3, threshold: float = 0.5, temp = 1.0):
+        """Set up the projections and learnable scalars of the iterative attention layer.
+
+        Args:
+            dims: model width; ``head_dim`` is ``dims // head``.
+            head: number of attention heads.
+            max_iter: upper bound on the refinement loop run by ``_focus``.
+            threshold: initial value of the learnable convergence threshold.
+            temp: initial value of the learnable temperature.
+        """
         super(attentionb, self).__init__()
         self.head = head
         self.dims = dims
         self.head_dim = dims // head
         self.que = nn.Linear(dims, dims, bias=False)
+        # One projection of width 2 * dims; ``_focus`` chunks it into keys and values.
         self.kv = nn.Linear(dims, dims * 2, bias=False)
         self.out = nn.Linear(dims, dims, bias=False)
         self.lna = nn.LayerNorm(dims)
+        # LayerNorm over a single head's width (dims // head).
+        self.lnb = nn.LayerNorm(dims // head)
         self.rope = rotary(dims, head)
         self.max_iter = max_iter
+        # Both scalars are registered as trainable parameters.
         self.threshold = nn.Parameter(torch.tensor(threshold), requires_grad=True)
         self.temp = nn.Parameter(torch.tensor(temp), requires_grad=True)
         self.local = LocalOut(dims, head)
     def update_win(self, win_size=None):
     def _focus(self, x, xa = None, mask = None, win_size=None):
         q = self.que(self.lna(x))
+        k, v = self.kv(self.lna(x if xa is None else xa)).chunk(2, dim=-1)
         q, k, v = map(lambda t: rearrange(t, 'b c (h d) -> b h c d', h = self.head), (q, k, v))
         self.scale = q.shape[-1] ** -0.35
         k = self.rope(k)
         iteration = 0
+        temp = self.temp.item()
         prev_out = torch.zeros_like(q)
         attn_out = torch.zeros_like(q)
         threshold = self.threshold
         curq = q #if curq is None else curq
         while iteration < self.max_iter:
+            eff_span = curq.shape[2]
             if eff_span == 0:
                 break
             iter_out = torch.zeros_like(curq)
             iter_out[:, :, :eff_span, :] = attn_iter
             diff = torch.abs(iter_out - prev_out).mean()
+            if diff < threshold and iteration > 0:
                 attn_out = iter_out
                 break
             prev_out = iter_out.clone()
             curq = curq + iter_out
             attn_out = iter_out
             iteration += 1
+            temp -= 0.005
+        return rearrange(attn_out, 'b h c d -> b c (h d)')
     def _slide_win_local(self, x, mask = None) -> Tensor:
     def forward(self, x, xa = None, mask = None):
+            """Window ``x`` and ``xa`` locally, run ``_focus``, then apply the output projection.
+
+            NOTE(review): ``mask`` is accepted but never forwarded -- every
+            call below passes ``mask=None``.
+            NOTE(review): ``xa`` is windowed even when it is None;
+            ``_slide_win_local`` is not shown here -- confirm it tolerates that.
+            """
             x = self._slide_win_local(x, mask=None)
             xa = self._slide_win_local(xa, mask=None)
+            out = self._focus(x, xa, mask=None)
+            return self.out(out)
+def scaled_relu(x, sequence_length):
+    """Rectify ``x`` with ReLU and divide the result by ``sequence_length``."""
+    rectified = x.relu()
+    return rectified.div(sequence_length)
+def taylor_softmax(x, order=2):
+    """Softmax variant that replaces ``exp`` with its Taylor polynomial.
+
+    Each element is mapped to ``sum(x**i / i!, i = 0..order)`` and the result
+    is normalised to sum to one over the last dimension.
+
+    Args:
+        x: input scores; normalisation runs over ``dim=-1``.
+        order: degree of the polynomial. ``order=0`` yields a uniform
+            distribution. Odd orders can produce negative values for
+            sufficiently negative inputs, so even orders are the safe choice.
+
+    Returns:
+        Tensor of the same shape as ``x``.
+    """
+    approx = torch.ones_like(x)
+    term = torch.ones_like(x)
+    for i in range(1, order + 1):
+        # Build x**i / i! incrementally: exact factorials, no lgamma/exp detour.
+        term = term * x / i
+        approx = approx + term
+    return approx / approx.sum(dim=-1, keepdim=True)
+def taylor_softmax_2nd_order(x):
+    """Normalise ``1 + x + x**2 / 2`` over the last dimension."""
+    numerator = 1 + x + x.pow(2) / 2
+    denominator = numerator.sum(dim=-1, keepdim=True)
+    return numerator / denominator
+def cos_sim(q: Tensor, k: Tensor, v: Tensor, mask) -> Tensor:
+    """Attention whose scores are cosine similarities between queries and keys.
+
+    Args:
+        q, k, v: query/key/value tensors; similarity is taken over the last
+            dimension and keys are indexed by the second-to-last.
+        mask: additive mask broadcastable to the score matrix, or None to
+            skip masking.
+
+    Returns:
+        ``softmax(cos(q, k) + mask) @ v``.
+    """
+    q_norm = torch.nn.functional.normalize(q, dim=-1, eps=1e-12)
+    k_norm = torch.nn.functional.normalize(k, dim=-1, eps=1e-12)
+    qk_cosine = torch.matmul(q_norm, k_norm.transpose(-1, -2))
+    if mask is not None:
+        qk_cosine = qk_cosine + mask
+    # Spelled out in full: the module never binds the alias ``F``.
+    weights = torch.nn.functional.softmax(qk_cosine, dim=-1)
+    return torch.matmul(weights, v)
 class attentiona(nn.Module):
+    def __init__(self, dims: int, head: int, dropout_rate: float = 0.1):
+        """Create the query, key/value and output projections for ``head`` heads.
+
+        NOTE(review): ``dropout_rate`` is accepted but not used anywhere in
+        this constructor.
+        """
         super().__init__()
         self.head = head
         self.dims = dims
         self.que = nn.Linear(dims, dims, bias=False)
+        # Projection of width 2 * dims (named ``kv``).
         self.kv = nn.Linear(dims, dims * 2, bias=False)
         self.out = nn.Linear(dims, dims, bias=False)
         self.ln = nn.LayerNorm(dims)
         self.rope = rotary(dims, head)
     def forward(self, x, xa = None, mask = None):
         q = self.que(self.ln(x))
         k = self.rope(k)
         qk = einsum('b h k d, b h q d -> b h k q', q, k) * scale
+        # qk = torch.nn.functional.softmax(qk, dim=-1)
+        qk = taylor_softmax(qk, order=2)
         wv = einsum('b h k q, b h q d -> b h k d', qk, v)
         wv = rearrange(wv, 'b h c d -> b c (h d)')
         out = self.out(wv)
+        return out
 class attentiond(nn.Module):
     def __init__(self, dims: int, head: int):
         qk, v = self.kv(self.ln(x)).chunk(2, dim=-1)
         qka, va = self.kv(self.ln(x if xa is None else xa)).chunk(2, dim=-1)
         qk, qka, v, va = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.head), (qk, qka, v, va))
+        qk = einsum('b h q d, b h k d -> b h q k', qk, qka)
         if there_is_a(mask):
+            mask = mask[:qk.shape[2], :qk.shape[2]]
+            qk = qk.masked_fill(mask.bool(), -torch.inf)
         x = qk.softmax(dim = -1)
         xa = qk.softmax(dim = -2)
         x = self.x(x)
         xa = einsum('b h j i, b h j d -> b h i d', xa, v)
         x, xa = map(lambda t: rearrange(t, 'b h n d -> b n (h d)'), (x, xa))
         out = self.out(x)
+        return out
 class tgate(nn.Module):
     def __init__(self, dims, num_types=4):
+        """Build ``num_types`` sigmoid gates plus a softmax classifier over the types.
+
+        Args:
+            dims: feature width of the input; each gate maps dims -> dims.
+            num_types: number of gates and of classifier outputs.
+        """
         super().__init__()
+        self.gates = nn.ModuleList([nn.Sequential(nn.Linear(dims, dims), nn.Sigmoid()) for _ in range(num_types)])
+        # nn.Softmax is the module form; torch.nn.functional has no ``Softmax`` attribute.
+        self.classifier = nn.Sequential(nn.Linear(dims, num_types), nn.Softmax(dim=-1))
     def forward(self, x):
         types = self.classifier(x)
         gates = torch.stack([gate(x) for gate in self.gates], dim=-1)
         self.lna = nn.LayerNorm(dims, bias=False)
         self.atta = attentiona(dims, head)
         self.attb = attentionb(dims, head, max_iter=1)
+        self.attc = attentiond(dims, head)
+        self.tgate = tgate(dims, num_types=1)
         self.mlp = nn.Sequential(nn.Linear(dims, dims*4), get_activation(act), nn.Linear(dims*4, dims))
+    def forward(self, x: Tensor, xa = None, mask = None):
+        """Run self-attention, optional attention over ``xa``, the type gate and the MLP.
+
+        Args:
+            x: input sequence.
+            xa: optional second sequence to attend over.
+            mask: passed to the self-attention call only.
+        """
+        out = self.atta(x, mask=mask)
+        # Add residually only when the attention output keeps x's shape;
+        # otherwise the attention output replaces x.
+        if  x.shape == out.shape:
+            x = x + out
+        else:
+            x = out
         if xa is not None:
+            # NOTE(review): the cross step also goes through ``atta``; the
+            # ``attb``/``attc`` modules built in __init__ are not called here.
+            x = x + self.atta(x, xa, mask=None)
         x = x + self.tgate(x)
+        x = x + self.mlp(self.lna(x))
         return x
 class processor(nn.Module):
     def __init__(self, vocab: int, mels: int, ctx: int, dims: int, head: int, layer: int, act: str = "gelu"):
         super(processor, self).__init__()
         self.ln = nn.LayerNorm(dims)
         self.token = nn.Embedding(vocab, dims)
         self.audio = lambda length, dims, max_tscale: sinusoids(length, dims, max_tscale)
         self.positions = nn.Parameter(torch.empty(ctx, dims), requires_grad=True)
+        self.blend = nn.Parameter(torch.tensor(0.5, device=device, dtype=dtype), requires_grad=True)
         act_fn = get_activation(act)
         self.encoder = nn.Sequential(
             nn.Conv1d(dims, dims, kernel_size=3, stride=1, padding=1, groups=dims), act_fn)
         self.blocka = nn.ModuleList([residual(dims, head, act_fn) for _ in range(layer)])
+        self.blockm = nn.ModuleList([residual(dims, head, act_fn) for _ in range(2)])
+        mask = torch.triu(torch.ones(ctx, ctx), diagonal=1)
         mask = torch.empty(ctx, ctx).fill_(-np.inf).triu_(1)
         self.register_buffer("mask", mask, persistent=False)
+    def forward(self, x, xa, xb, sequential=False, modal=False, kv_cache=None, blend=False) -> Tensor:
         if xa.dim() == 2:
             xa = xa.unsqueeze(0)
             xa = block(xa, mask=None)
             x  = block(x, mask=self.mask)
             x  = block(x, xa, mask=None)
+            if blend:
+                if sequential:
+                    y = x
+                else:
+                    a = torch.sigmoid(self.blend)
+                    x = a * x + (1 - a) * y
         for block in chain(self.blockm or []):
             xm = block(torch.cat([x, xa], dim=1), torch.cat([x, xa], dim=1), mask=None) if modal else None
             x  = block(xm[:, :x.shape[1]], xm[:, x.shape[1]:], mask=None) if modal else x
+            if blend:
+                if sequential:
+                    y = x
+                else:
+                    a = torch.sigmoid(self.blend)
+                    x = a * x + (1 - a) * y
         x = nn.functional.dropout(x, p=0.001, training=self.training)
         x = self.ln(x)
     def adjust_window(self, loss, ctx):
         self.win_size = ((ctx // self.param.head))
         if loss < self.best_loss:
+            win_size = (self.win_size * self.factor)
         else:
             win_size = (self.win_size // self.factor).clamp(0, self.win_size - 1)
         self.win_size = win_size
         self.update(win_size)
         return win_size
+    def forward(self, labels=None, input_ids=None, pitch=None, pitch_tokens=None, spectrogram=None, waveform=None):
+        """Run the processor and, when labels are given, compute the loss.
+
+        Args:
+            labels: target token ids; label 0 is excluded from the loss.
+            input_ids: token ids passed to the processor as ``x``.
+            pitch: passed to the processor as ``xa``.
+            spectrogram: passed to the processor as ``xb``.
+            pitch_tokens, waveform: accepted but not used by this method.
+
+        Returns:
+            Dict with ``"logits"`` and ``"loss"`` (None when no labels).
+        """
         x = input_ids
+        xa = pitch
+        xb = spectrogram
+        logits = self.processor(x, xa, xb)
         loss = None
         if labels is not None:
             loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.shape[-1]), labels.view(-1), ignore_index=0)
+            # The window update needs both a loss value and the pitch length;
+            # skip it for label-free calls or when no pitch was supplied.
+            if xa is not None:
+                self.adjust_window(loss=loss.item(), ctx=xa.shape[1])
         return {"logits": logits, "loss": loss}
     def _init_weights(self, module):
                 hooks.append(layer.v.register_forward_hook(save_to_cache))
         self.processor.apply(install_hooks)
         return cache, hooks
+### "pipeline"
+def prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, streaming=True, load_saved=False, save_dataset=True, cache_dir='E:/hf', extract_args=None, max_ctx=2048):
+    """Build the Common Voice 17 (English) train/test feature datasets.
+
+    Args:
+        tokenizer: passed to ``extract_features`` for every example.
+        token: Hugging Face access token used by ``load_dataset``.
+        sanity_check: currently unused.
+        sample_rate: rate the ``audio`` column is cast to.
+        streaming: currently unused -- both splits are always streamed.
+        load_saved: return the cached datasets when both cache files exist.
+        save_dataset: write the processed datasets under ``cache_dir``.
+        cache_dir: cache directory; None disables loading and saving.
+        extract_args: keyword arguments for ``extract_features`` (None = none).
+        max_ctx: transcripts must be shorter than this, and audio shorter
+            than ``max_ctx * 160`` samples.
+
+    Returns:
+        ``(train_dataset, test_dataset)``.
+    """
+    extract_args = {} if extract_args is None else extract_args
+    # Resolve cache paths up front so loading and saving share them.
+    cache_file_train = cache_file_test = None
+    if cache_dir is not None:
+        cache_file_train = os.path.join(cache_dir, "train.arrow")
+        cache_file_test = os.path.join(cache_dir, "test.arrow")
+    if load_saved and cache_dir is not None:
+        os.makedirs(cache_dir, exist_ok=True)
+        if os.path.exists(cache_file_train) and os.path.exists(cache_file_test):
+            from datasets import Dataset
+            train_dataset = Dataset.load_from_disk(cache_file_train)
+            test_dataset = Dataset.load_from_disk(cache_file_test)
+            return train_dataset, test_dataset
+    def filter_func(x):
+        return (0 < len(x["transcription"]) < max_ctx and
+                len(x["audio"]["array"]) > 0 and
+                len(x["audio"]["array"]) < max_ctx * 160)
+    raw_train  = load_dataset("mozilla-foundation/common_voice_17_0", "en", token=token, split="train", trust_remote_code=True, streaming=True).rename_column("sentence", "transcription")
+    raw_test = load_dataset("mozilla-foundation/common_voice_17_0", "en", token=token, split="test", trust_remote_code=True, streaming=True).rename_column("sentence", "transcription").take(1000)
+    raw_train = raw_train.filter(filter_func).cast_column("audio", Audio(sampling_rate=sample_rate))
+    raw_test = raw_test.filter(filter_func).cast_column("audio", Audio(sampling_rate=sample_rate))
+    train_dataset = raw_train.map(lambda x: extract_features(x, tokenizer, **extract_args)).remove_columns(["audio", "transcription"])
+    test_dataset = raw_test.map(lambda x: extract_features(x, tokenizer, **extract_args)).remove_columns(["audio", "transcription"])
+    if save_dataset and cache_dir is not None:
+        os.makedirs(cache_dir, exist_ok=True)
+        for dataset, path in ((train_dataset, cache_file_train), (test_dataset, cache_file_test)):
+            # Streamed datasets may not offer save_to_disk; skip rather than crash.
+            if hasattr(dataset, "save_to_disk"):
+                dataset.save_to_disk(path)
+            else:
+                logging.warning("Dataset for %s cannot be saved to disk; skipping.", path)
+    return train_dataset, test_dataset
+def main():
+    """Train the model on Common Voice with the Hugging Face Seq2SeqTrainer.
+
+    Creates a timestamped log directory, loads the tokenizer, builds the
+    datasets and the model, then trains with AdamW and a cosine schedule.
+    """
+    token = ""
+    log_dir = os.path.join('D:/newmodel/output/logs/', datetime.now().strftime('%m-%d_%H_%M_%S'))
+    os.makedirs(log_dir, exist_ok=True)
+    tokenizer = setup_tokenizer("D:/newmodel/mod5/tokenizer.json")
+    # Feature switches forwarded to extract_features for every example.
+    extract_args = {
+        "waveform": True,
+        "spec": True,
+        "pitch_tokens": True,
+        "pitch": True,
+        "harmonics": False,
+        "aperiodics": False,
+        "phase_mod": False,
+        "crepe": False,
+        "sample_rate": 16000,
+        "hop_length": 256,
+        "mode": "mean",
+        "debug": False,
+    }
+    param = Dimensions(vocab=40000, mels=128, ctx=2048, dims=512, head=4, layer=4, act="swish")
+    train_dataset, test_dataset = prepare_datasets(tokenizer, token, sanity_check=False, sample_rate=16000, streaming=False,
+        load_saved=False, save_dataset=False, cache_dir=None, extract_args=extract_args, max_ctx=param.ctx)
+    # Use the module-level device so the script still starts without CUDA.
+    model = Model(param).to(device)
+    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
+    print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
+    from functools import partial
+    metrics_fn = partial(compute_metrics, print_pred=True, num_samples=1, tokenizer=tokenizer, model=model)
+    # NOTE(review): with save_strategy="no", save_steps and save_total_limit
+    # presumably have no effect -- confirm this is intended.
+    training_args = Seq2SeqTrainingArguments(
+        output_dir=log_dir,
+        per_device_train_batch_size=1,
+        per_device_eval_batch_size=1,
+        max_steps=100000,
+        eval_steps=1000,
+        save_steps=1000,
+        warmup_steps=1000,
+        logging_steps=100,
+        logging_dir=log_dir,
+        logging_strategy="steps",
+        eval_strategy="steps",
+        save_strategy="no",
+        report_to=["tensorboard"],
+        push_to_hub=False,
+        save_total_limit=1,
+        label_names=["labels"],
+        save_safetensors=False,
+        eval_on_start=False,
+        batch_eval_metrics=False,
+        disable_tqdm=False,
+        include_tokens_per_second=True,
+        include_num_input_tokens_seen=True,
+        learning_rate=0.00025,
+        weight_decay=0.025,
+    )
+    optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate, eps=1e-8, weight_decay=training_args.weight_decay, betas=(0.9, 0.999), amsgrad=False, foreach=False, fused=False, capturable=False, differentiable=False, maximize=False)
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=training_args.max_steps, eta_min=1e-9, last_epoch=-1)
+    trainer = Seq2SeqTrainer(
+        args=training_args,
+        model=model,
+        train_dataset=train_dataset,
+        eval_dataset=test_dataset,
+        data_collator=DataCollator(tokenizer=tokenizer),
+        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        compute_metrics=metrics_fn,
+        optimizers=(optimizer, scheduler)
+    )
+    model.init_weights()
+    trainer.train()
+if __name__ == "__main__":
+    main()