Sin2pi committed on
Commit
67c6d40
·
verified ·
1 Parent(s): 93b313e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +48 -24
README.md CHANGED
@@ -25,8 +25,10 @@ tags:
25
  ASR model + pitch aware relative positional embeddings.
26
 
27
  <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
 
28
  <img width="233" height="77" alt="legend" src="https://github.com/user-attachments/assets/fad84550-a199-43b3-8471-d011a9fd6f94" />
29
 
 
30
 
31
  Questions:
32
 
@@ -66,30 +68,54 @@ Here are the abbreviated steps for replacing theta and radius in the rotary forw
66
 
67
  ```python
68
 
69
-
70
# Removed-side snippet, reconstructed: the extraction had stripped the call
# parentheses (`f0.dim`, `f0.squeeze0`).
# When a pitch track is supplied it offsets the learned base theta;
# otherwise the learned theta is used unchanged.
if f0 is not None:
    if f0.dim() == 2:
        f0 = f0.squeeze(0)
    theta = f0 + self.theta
else:
    theta = self.theta
76
-
77
-
78
# Removed-side snippet, reconstructed: parentheses and the `/` operators were
# lost in extraction. The shape of the expression mirrors the `theta_freqs`
# helper that replaced it, except that theta is divided by 220.0 here.
freqs = (theta.unsqueeze(-1) / 220.0) * 700 * (
    torch.pow(10, torch.linspace(0, 2595 * torch.log10(torch.tensor(1 + 8000 / 700)),
    self.dim // 2, device=theta.device, dtype=theta.dtype) / 2595) - 1) / 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
 
83
# Removed-side snippet, reconstructed: call parentheses were lost in extraction.
# Multiply each position index by the per-channel frequency to get angles.
t = torch.arange(ctx, device=device, dtype=dtype)
freqs = t[:, None] * freqs  # don't repeat or use some other method here

# Turn the angles into complex numbers; the magnitude is the pitch track when
# radii are enabled and f0 is given, and 1 otherwise.
if self.radii and f0 is not None:
    radius = f0.to(device, dtype)
    freqs = torch.polar(radius.unsqueeze(-1), freqs)
else:
    radius = torch.ones_like(freqs)
    freqs = torch.polar(radius, freqs)
92
-
93
 
94
  ```python
95
 
@@ -229,5 +255,3 @@ The Complex Frequency Result:
229
 
230
 
231
 
232
-
233
-
 
25
  ASR model + pitch aware relative positional embeddings.
26
 
27
  <img width="1363" height="732" alt="pitch_spectrogram" src="https://github.com/user-attachments/assets/ceb65e94-7df4-41b7-aa3d-c4aa4c6c0717" />
28
+
29
  <img width="233" height="77" alt="legend" src="https://github.com/user-attachments/assets/fad84550-a199-43b3-8471-d011a9fd6f94" />
30
 
31
+ https://huggingface.co/Sin2pi/asr-model/tensorboard
32
 
33
  Questions:
34
 
 
68
 
69
  ```python
70
 
71
# Learnable base theta, initialised to 220. A float literal keeps the tensor
# floating point even if `dtype` is None; an integer tensor cannot require grad.
self.theta = nn.Parameter(torch.tensor(220.0, device=device, dtype=dtype), requires_grad=True)
72
+
73
def theta_freqs(self, theta):
    """Scale a mel-spaced frequency ladder by ``theta``.

    The ladder has ``self.head_dim // 2`` points spaced evenly on the mel
    scale between 0 and mel(8000), converted back with
    ``700 * (10 ** (mel / 2595) - 1)`` and divided by 1000.

    Args:
        theta: 0-dim or 1-D tensor of base values. A 0-dim tensor is
            promoted to shape ``(1,)``.

    Returns:
        Tensor of shape ``(*theta.shape, self.head_dim // 2)`` on
        ``theta``'s device and dtype.
    """
    import math

    if theta.dim() == 0:
        theta = theta.unsqueeze(0)

    # The upper mel bound is a constant: compute it as a Python float rather
    # than building a tensor on every call and passing it as the linspace end.
    mel_max = 2595 * math.log10(1 + 8000 / 700)
    mel_points = torch.linspace(
        0, mel_max, self.head_dim // 2, device=theta.device, dtype=theta.dtype
    )
    # for static theta=220.0
    return theta.unsqueeze(-1) * 700 * (torch.pow(10, mel_points / 2595) - 1) / 1000
80
+
81
+ def _apply_radii(self, freqs, f0, ctx):
82
+ if self.radii and f0 is not None:
83
+ radius = f0.to(device, dtype)
84
+ return torch.polar(radius.unsqueeze(-1), freqs), radius
85
+ else:
86
+ return torch.polar(torch.ones_like(freqs), freqs), None
87
+
88
+
89
def compute_pitch_tokens(wav, sample_rate, labels, mode="mean"):
    """Pool a frame-level pitch track into one value per label token.

    The waveform is analysed with pyworld (``dio`` then ``stonemask``). The
    audio is cut into ``len(labels)`` equal-duration slots and the f0 frames
    in each slot are reduced to a single value.

    Args:
        wav: 1-D waveform tensor.
        sample_rate: sampling rate of ``wav`` in Hz.
        labels: token sequence; only its length is used.
        mode: ``"mean"`` or ``"median"`` of the frames in each slot; any
            other value takes the last frame of the slot.

    Returns:
        float32 tensor of length ``len(labels) + 1``. The first entry
        repeats the first token's pitch. Values below 100 are set to 0 and
        stay 0; the rest are mapped through ``(f0 - 71) / (500 - 71)``.

    NOTE(review): ``hop_length`` is not defined in this function; it is
    assumed to be a module-level constant in the surrounding file. The 71
    and 500 bounds look like an f0 floor and ceiling in Hz; confirm.
    """
    import pyworld as pw

    n_tokens = len(labels)
    if n_tokens == 0:
        # No tokens: only the leading slot remains, and it is unvoiced.
        return torch.zeros(1, dtype=torch.float32)

    # pyworld wants a contiguous float64 array on the host.
    wav_np = wav.detach().cpu().to(torch.float64).contiguous().numpy()
    f0_np, t_np = pw.dio(wav_np, sample_rate, frame_period=hop_length / sample_rate * 1000)
    f0_np = pw.stonemask(wav_np, f0_np, t_np, sample_rate)

    # Stay in torch from here so mean/median operate on tensors.
    f0 = torch.from_numpy(f0_np)
    frame_times = torch.from_numpy(t_np)
    if f0.numel() == 0:
        return torch.zeros(n_tokens + 1, dtype=torch.float32)

    audio_duration = len(wav) / sample_rate
    tok_dur_sec = audio_duration / n_tokens
    # Same dtype as the frame times, so searchsorted compares like with like.
    token_starts = torch.arange(n_tokens, dtype=frame_times.dtype) * tok_dur_sec
    token_ends = token_starts + tok_dur_sec
    start_idx = torch.searchsorted(frame_times, token_starts, side="left").tolist()
    end_idx = torch.searchsorted(frame_times, token_ends, side="right").tolist()

    last_frame = f0.numel() - 1
    pitch_tok = torch.zeros(n_tokens, dtype=torch.float32)
    for i, (lo, hi) in enumerate(zip(start_idx, end_idx)):
        # Every slot gets at least one frame, even when its start falls
        # past the final analysis frame.
        lo = min(lo, last_frame)
        segment = f0[lo:max(lo + 1, hi)]
        if mode == "mean":
            pitch_tok[i] = segment.mean()
        elif mode == "median":
            pitch_tok[i] = torch.median(segment)
        else:
            pitch_tok[i] = segment[-1]

    pitch_tok[pitch_tok < 100.0] = 0.0
    # Prepend a leading slot that copies the first token's pitch.
    f0t_tensor = torch.cat([pitch_tok[:1], pitch_tok])
    f0t_tensor = torch.where(
        f0t_tensor == 0.0,
        torch.zeros_like(f0t_tensor),
        (f0t_tensor - 71.0) / (500.0 - 71.0),
    )
    return f0t_tensor
117
 
118
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  ```python
121
 
 
255
 
256
 
257