Update src/infer_pack/predictor/RMVPE.py
src/infer_pack/predictor/RMVPE.py
CHANGED
@@ -334,28 +334,23 @@ class RMVPE:
         ckpt = torch.load(model_path, map_location="cpu")
         model.load_state_dict(ckpt)
         model.eval()
-        if is_half == True:
+        if is_half:
             model = model.half()
         self.model = model
-        self.resample_kernel = {}
         self.is_half = is_half
-        if device is None:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.device = device
-        self.mel_extractor = MelSpectrogram(
-            is_half, 128, 16000, 1024, 160, None, 30, 8000
-        ).to(device)
-        self.model = self.model.to(device)
+        self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
+        self.mel_extractor = MelSpectrogram(is_half, 128, 16000, 1024, 160, None, 30, 8000).to(self.device)
+        self.model = self.model.to(self.device)
         cents_mapping = 20 * np.arange(360) + 1997.3794084376191
-        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368
+        self.cents_mapping = np.pad(cents_mapping, (4, 4))
 
     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = mel.float()
-            mel = F.pad(
-                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
-            )
+            mel = mel.float()
+            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect")
+            if self.is_half:
+                mel = mel.half()
             hidden = self.model(mel)
             return hidden[:, :n_frames]
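
A note on the rewritten mel2hidden: the pad amount, 32 * ((n_frames - 1) // 32 + 1) - n_frames, rounds the frame count up to the next multiple of 32 before the mel reaches the model, and the reflect padding is done in float32 with the cast back to half deferred until after it. A minimal standalone sketch of the rounding arithmetic (illustrative only, not code from this repo):

    # Pad amount used in mel2hidden: distance to the next multiple of 32.
    def pad_amount(n_frames: int) -> int:
        return 32 * ((n_frames - 1) // 32 + 1) - n_frames

    for n in (1, 31, 32, 33, 100):
        print(n, "->", n + pad_amount(n))  # totals: 32, 32, 32, 64, 128
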
@@ -370,7 +365,7 @@ class RMVPE:
         mel = self.mel_extractor(audio, center=True)
         hidden = self.mel2hidden(mel)
         hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
+        if self.is_half:
             hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         return f0
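
For orientation, a hypothetical usage sketch of infer_from_audio as it stands after this change (the checkpoint path and the silent input are stand-ins, not part of the diff; the MelSpectrogram arguments above imply 16 kHz mono input):

    import numpy as np

    # "rmvpe.pt" is a placeholder path; device=None now falls back to cuda/cpu.
    rmvpe = RMVPE("rmvpe.pt", is_half=False, device=None)
    audio = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence
    f0 = rmvpe.infer_from_audio(audio, thred=0.03)  # per-frame f0 in Hz, 0 = unvoiced
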
@@ -384,23 +379,23 @@ class RMVPE:
         starts = center - 4
         ends = center + 5
         for idx in range(salience.shape[0]):
-            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
-            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
+            todo_salience.append(salience[:, starts[idx]:ends[idx]][idx])
+            todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
         todo_salience = np.array(todo_salience)
         todo_cents_mapping = np.array(todo_cents_mapping)
         product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
         weight_sum = np.sum(todo_salience, 1)
-        devided = product_sum / weight_sum
+        divided = product_sum / weight_sum
         maxx = np.max(salience, axis=1)
-        devided[maxx <= thred] = 0
-        return devided
+        divided[maxx <= thred] = 0
+        return divided
 
     def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100):
         audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
         mel = self.mel_extractor(audio, center=True)
         hidden = self.mel2hidden(mel)
         hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
+        if self.is_half:
             hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         f0[(f0 < f0_min) | (f0 > f0_max)] = 0
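
The to_local_average_cents hunk is easier to read one frame at a time: the method takes a salience-weighted average of the cents values in a 9-bin window around the argmax, and decode (not shown in this diff) then maps cents to Hz. A single-frame sketch with made-up salience values; the cents-to-Hz formula is the one upstream RVC uses in decode, an assumption here since decode is outside this hunk:

    import numpy as np

    cents_mapping = np.pad(20 * np.arange(360) + 1997.3794084376191, (4, 4))

    salience = np.random.rand(360)            # one frame of model output (made up)
    center = int(np.argmax(salience)) + 4     # +4 compensates for the padding below
    padded = np.pad(salience, (4, 4))
    window = padded[center - 4:center + 5]    # 9-bin window around the peak
    weights = cents_mapping[center - 4:center + 5]
    cents = np.sum(window * weights) / np.sum(window)
    f0 = 10 * 2 ** (cents / 1200)             # cents -> Hz
    print(round(f0, 2))
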
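
Finally, the clamp at the end of infer_from_audio_with_pitch zeroes any estimate outside [f0_min, f0_max], i.e. treats it as unvoiced. A tiny self-contained illustration of the mask:

    import numpy as np

    f0 = np.array([0.0, 40.0, 220.0, 1500.0])
    f0_min, f0_max = 50, 1100
    f0[(f0 < f0_min) | (f0 > f0_max)] = 0
    print(f0)  # [  0.   0. 220.   0.]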