Sin2pi
/

asr-model

@@ -25,8 +25,10 @@ tags:
 ASR model + pitch-aware relative positional embeddings.
 <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
 <img width="233" height="77" alt="legend" src="https://github.com/user-attachments/assets/fad84550-a199-43b3-8471-d011a9fd6f94" />
 Questions:
@@ -66,30 +68,54 @@ Here are the abbreviated steps for replacing theta and radius in the rotary forw
 ```python
-if f0 is not None:
-    if f0.dim == 2:
-        f0 = f0.squeeze0
-    theta = f0 + self.theta
-else:
-    theta = self.theta
-freqs = theta.unsqueeze-1  220.0 * 700 *
-    torch.pow10, torch.linspace0, 2595 * torch.log10torch.tensor1 + 8000700,
-            self.dim  2, device=theta.device, dtype=theta.dtype  2595 - 1  1000
-t = torch.arangectx, device=device, dtype=dtype
-freqs = t[:, None] * freqs  # dont repeat or use some other method here
-if self.radii and f0 is not None:
-    radius = f0.todevice, dtype
-    freqs = torch.polarradius.unsqueeze-1, freqs
-else:
-    radius = torch.ones_likefreqs
-    freqs = torch.polarradius, freqs
 ```
@@ -229,5 +255,3 @@ The Complex Frequency Result:

 ASR model + pitch-aware relative positional embeddings.
 <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
 <img width="233" height="77" alt="legend" src="https://github.com/user-attachments/assets/fad84550-a199-43b3-8471-d011a9fd6f94" />
+https://huggingface.co/Sin2pi/asr-model/tensorboard
 Questions:
 ```python
+     self.theta = nn.Parameter((torch.tensor(220, device=device, dtype=dtype)), requires_grad=True)
+    def theta_freqs(self, theta):
+        if theta.dim() == 0:
+            theta = theta.unsqueeze(0)
+        freq = (theta.unsqueeze(-1)) * 700 * (   # for static theta=220.0
+            torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000/700)),
+                    self.head_dim // 2, device=theta.device, dtype=theta.dtype) / 2595) - 1) / 1000
+        return freq
+    def _apply_radii(self, freqs, f0, ctx):
+        if self.radii and f0 is not None:
+            radius = f0.to(device, dtype)
+            return torch.polar(radius.unsqueeze(-1), freqs), radius
+        else:
+            return torch.polar(torch.ones_like(freqs), freqs), None
+ def compute_pitch_tokens(wav, sample_rate, labels, mode="mean"):
+     import pyworld as pw
+     wavnp = wav.numpy().astype(np.float64)
+     f0_np, t = pw.dio(wavnp, sample_rate, frame_period=hop_length / sample_rate * 1000)
+     f0_np = pw.stonemask(wavnp, f0_np, t, sample_rate)
+     t = torch.from_numpy(t)
+     audio_duration = len(wav) / sample_rate
+     T = len(labels)
+     tok_dur_sec = audio_duration / T
+     token_starts = torch.arange(T) * tok_dur_sec
+     token_ends = token_starts + tok_dur_sec
+     start_idx = torch.searchsorted(t, token_starts, side="left")
+     end_idx = torch.searchsorted(t, token_ends, side="right")
+     pitch_tok = torch.zeros(T, dtype=torch.float32)
+     for i in range(T):
+         lo, hi = start_idx[i], max(start_idx[i]+1, end_idx[i]) # type: ignore
+         segment = f0_np[lo:hi]
+         if mode == "mean":
+             pitch_tok[i] = segment.mean()
+         elif mode == "median":
+             pitch_tok[i] = torch.median(segment)
+         else:
+             pitch_tok[i] = segment[-1]
+     pitch_tok[pitch_tok < 100.0] = 0.0
+     bos_pitch = pitch_tok[0] if len(pitch_tok) > 0 else 0.0
+     f0t_tensor = torch.cat([torch.tensor([bos_pitch]), pitch_tok])
+     f0t_tensor = torch.where(f0t_tensor == 0.0, torch.zeros_like(f0t_tensor), (f0t_tensor - 71.0) / (500.0 - 71.0))
+     return pitch_tokens
 ```