Update README.md
Browse files
README.md
CHANGED
@@ -25,8 +25,10 @@ tags:
|
|
25 |
ASR model + pitch-aware relative positional embeddings.
|
26 |
|
27 |
<img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
|
|
|
28 |
<img width="233" height="77" alt="legend" src="https://github.com/user-attachments/assets/fad84550-a199-43b3-8471-d011a9fd6f94" />
|
29 |
|
|
|
30 |
|
31 |
Questions:
|
32 |
|
@@ -66,30 +68,54 @@ Here are the abbreviated steps for replacing theta and radius in the rotary forw
|
|
66 |
|
67 |
```python
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
|
83 |
-
t = torch.arangectx, device=device, dtype=dtype
|
84 |
-
freqs = t[:, None] * freqs # dont repeat or use some other method here
|
85 |
-
|
86 |
-
if self.radii and f0 is not None:
|
87 |
-
radius = f0.todevice, dtype
|
88 |
-
freqs = torch.polarradius.unsqueeze-1, freqs
|
89 |
-
else:
|
90 |
-
radius = torch.ones_likefreqs
|
91 |
-
freqs = torch.polarradius, freqs
|
92 |
-
|
93 |
|
94 |
```python
|
95 |
|
@@ -229,5 +255,3 @@ The Complex Frequency Result:
|
|
229 |
|
230 |
|
231 |
|
232 |
-
|
233 |
-
|
|
|
25 |
ASR model + pitch-aware relative positional embeddings.
|
26 |
|
27 |
<img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
|
28 |
+
|
29 |
<img width="233" height="77" alt="legend" src="https://github.com/user-attachments/assets/fad84550-a199-43b3-8471-d011a9fd6f94" />
|
30 |
|
31 |
+
https://huggingface.co/Sin2pi/asr-model/tensorboard
|
32 |
|
33 |
Questions:
|
34 |
|
|
|
68 |
|
69 |
```python
|
70 |
|
71 |
+
# Trainable scalar initialised to 220; theta_freqs multiplies its frequency ladder by this value.
self.theta = nn.Parameter(
    torch.tensor(220, device=device, dtype=dtype),
    requires_grad=True,
)
|
72 |
+
|
73 |
+
def theta_freqs(self, theta):
    """Scale a mel-spaced frequency ladder by ``theta``.

    Builds ``self.head_dim // 2`` points evenly spaced on the mel scale
    between 0 and the mel value of 8000, maps them back with the inverse
    mel formula ``700 * (10 ** (mel / 2595) - 1)``, divides by 1000 and
    multiplies the result by ``theta``.

    Args:
        theta: Tensor scale factor. A 0-dim tensor is promoted to shape
            ``(1,)`` before use.

    Returns:
        Tensor whose shape is the (promoted) shape of ``theta`` with one
        trailing axis of length ``self.head_dim // 2`` appended. The ladder
        is created on ``theta``'s device with ``theta``'s dtype.
    """
    import math

    if theta.dim() == 0:
        theta = theta.unsqueeze(0)

    # The upper mel bound is a constant: compute it as a plain float rather
    # than allocating a tensor on every call just to take its log10.
    mel_max = 2595 * math.log10(1 + 8000 / 700)
    mel = torch.linspace(
        0, mel_max, self.head_dim // 2, device=theta.device, dtype=theta.dtype
    )
    ladder = 700 * (torch.pow(10, mel / 2595) - 1)
    return theta.unsqueeze(-1) * ladder / 1000  # for static theta=220.0
|
80 |
+
|
81 |
+
def _apply_radii(self, freqs, f0, ctx):
|
82 |
+
if self.radii and f0 is not None:
|
83 |
+
radius = f0.to(device, dtype)
|
84 |
+
return torch.polar(radius.unsqueeze(-1), freqs), radius
|
85 |
+
else:
|
86 |
+
return torch.polar(torch.ones_like(freqs), freqs), None
|
87 |
+
|
88 |
+
|
89 |
+
def compute_pitch_tokens(wav, sample_rate, labels, mode="mean"):
    """Pool a frame-level F0 track into one pitch value per label token.

    F0 is estimated with ``pyworld.dio`` and refined with
    ``pyworld.stonemask``. The audio duration is split into ``len(labels)``
    equal-length windows and the F0 frames falling in each window are pooled.
    Pooled values below 100.0 are zeroed; the remaining non-zero values are
    rescaled with ``(f0 - 71) / (500 - 71)``.

    Args:
        wav: Tensor of audio samples; ``len(wav)`` is taken as the sample
            count, so a 1-D tensor is assumed (TODO confirm with callers).
        sample_rate: Sampling rate of ``wav``.
        labels: Token sequence; only its length is used.
        mode: ``"mean"`` or ``"median"`` pooling; any other value takes the
            last frame of each window.

    Returns:
        float32 tensor of length ``len(labels) + 1``. The first element
        duplicates the first token's pitch. Empty ``labels`` yields a
        single zero.
    """
    T = len(labels)
    if T == 0:
        # Nothing to pool; only the leading entry remains and it is zero.
        return torch.zeros(1, dtype=torch.float32)

    import pyworld as pw

    wavnp = wav.numpy().astype(np.float64)
    # NOTE(review): hop_length is not defined in this snippet; it is presumably
    # a module-level constant holding the hop size in samples. Confirm.
    f0_np, t = pw.dio(wavnp, sample_rate, frame_period=hop_length / sample_rate * 1000)
    f0_np = pw.stonemask(wavnp, f0_np, t, sample_rate)
    # Keep F0 as a tensor so mean, torch.median and indexing all work on it.
    f0 = torch.from_numpy(f0_np)
    t = torch.from_numpy(t)

    audio_duration = len(wav) / sample_rate
    tok_dur_sec = audio_duration / T
    # Build the token boundaries in the dtype of the frame times so that
    # searchsorted compares like with like.
    token_starts = torch.arange(T, dtype=t.dtype) * tok_dur_sec
    token_ends = token_starts + tok_dur_sec
    start_idx = torch.searchsorted(t, token_starts, side="left").tolist()
    end_idx = torch.searchsorted(t, token_ends, side="right").tolist()

    pitch_tok = torch.zeros(T, dtype=torch.float32)
    for i, (lo, hi) in enumerate(zip(start_idx, end_idx)):
        # Always try to include at least one frame per token.
        segment = f0[lo:max(lo + 1, hi)]
        if segment.numel() == 0:
            # Window starts past the last analysis frame: leave the pitch at 0.
            continue
        if mode == "mean":
            pitch_tok[i] = segment.mean()
        elif mode == "median":
            pitch_tok[i] = torch.median(segment)
        else:
            pitch_tok[i] = segment[-1]

    pitch_tok[pitch_tok < 100.0] = 0.0
    # Prepend a copy of the first token's pitch as the leading entry.
    f0t_tensor = torch.cat([pitch_tok[:1], pitch_tok])
    f0t_tensor = torch.where(
        f0t_tensor == 0.0,
        torch.zeros_like(f0t_tensor),
        (f0t_tensor - 71.0) / (500.0 - 71.0),
    )
    return f0t_tensor
|
117 |
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
```python
|
121 |
|
|
|
255 |
|
256 |
|
257 |
|
|
|
|