Bredvige committed on
Commit
090c24f
·
verified ·
1 Parent(s): 1981948

Update libs/rmvpe.py

Browse files
Files changed (1) hide show
  1. libs/rmvpe.py +670 -670
libs/rmvpe.py CHANGED
@@ -1,670 +1,670 @@
1
- from io import BytesIO
2
- import os
3
- from typing import List, Optional, Tuple
4
- import numpy as np
5
- import torch
6
-
7
- from infer.lib import jit
8
-
9
- try:
10
- # Fix "Torch not compiled with CUDA enabled"
11
- import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
12
-
13
- if torch.xpu.is_available():
14
- from infer.modules.ipex import ipex_init
15
-
16
- ipex_init()
17
- except Exception: # pylint: disable=broad-exception-caught
18
- pass
19
- import torch.nn as nn
20
- import torch.nn.functional as F
21
- from librosa.util import normalize, pad_center, tiny
22
- from scipy.signal import get_window
23
-
24
- import logging
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
-
29
class STFT(torch.nn.Module):
    """STFT / inverse STFT expressed as matrix products with a windowed DFT basis.

    The forward and inverse bases are computed once in ``__init__`` and stored
    as buffers; the transforms themselves only use ``pad``, ``unfold``,
    ``matmul`` and ``Fold``.
    """

    def __init__(
        self, filter_length=1024, hop_length=512, win_length=None, window="hann"
    ):
        """Precompute the windowed forward and inverse Fourier bases.

        The original author notes that not every overlap-add setup is expected
        to work; hop lengths of half the filter length (50% overlap) are the
        configuration the code was written for.

        Keyword Arguments:
            filter_length {int} -- Length of filters used (default: {1024})
            hop_length {int} -- Hop length of STFT (default: {512})
            win_length {int} -- Length of the window applied to each frame; a
                falsy value means "use filter_length" (default: {None})
            window {str} -- Window name passed to ``scipy.signal.get_window``
                (default: {'hann'})
        """
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length if win_length else filter_length
        self.window = window
        self.forward_transform = None
        self.pad_amount = int(self.filter_length / 2)

        # Keep the first filter_length / 2 + 1 DFT rows and stack their real
        # parts on top of their imaginary parts.
        n_bins = int((self.filter_length / 2 + 1))
        dft_rows = np.fft.fft(np.eye(self.filter_length))[:n_bins, :]
        stacked_basis = np.vstack([np.real(dft_rows), np.imag(dft_rows)])
        analysis_basis = torch.FloatTensor(stacked_basis)
        synthesis_basis = torch.FloatTensor(np.linalg.pinv(stacked_basis))

        assert filter_length >= self.win_length
        # Build the window, then zero-pad it (centered) up to filter_length.
        padded_window = pad_center(
            get_window(window, self.win_length, fftbins=True), size=filter_length
        )
        padded_window = torch.from_numpy(padded_window).float()

        # Apply the window to both bases.
        analysis_basis = analysis_basis * padded_window
        synthesis_basis = (synthesis_basis.T * padded_window).T

        self.register_buffer("forward_basis", analysis_basis.float())
        self.register_buffer("inverse_basis", synthesis_basis.float())
        self.register_buffer("fft_window", padded_window.float())

    def transform(self, input_data, return_phase=False):
        """Take input data (audio) to the STFT domain.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Keyword Arguments:
            return_phase {bool} -- Also return the phase (default: {False})

        Returns:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Only when ``return_phase`` is true; same shape as
                the magnitude
        """
        padded = F.pad(
            input_data, (self.pad_amount, self.pad_amount), mode="reflect"
        )
        # Slice the padded signal into overlapping frames, one per column.
        frames = padded.unfold(1, self.filter_length, self.hop_length)
        frames = frames.permute(0, 2, 1)
        spectrum = torch.matmul(self.forward_basis, frames)

        # The basis stacks real rows first, imaginary rows second.
        n_bins = int((self.filter_length / 2) + 1)
        real_part, imag_part = spectrum[:, :n_bins, :], spectrum[:, n_bins:, :]
        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        if not return_phase:
            return magnitude
        return magnitude, torch.atan2(imag_part.data, real_part.data)

    def inverse(self, magnitude, phase):
        """Run the inverse STFT on magnitude/phase tensors produced by ``transform``.

        Arguments:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Phase of STFT with shape (num_batch,
                num_frequencies, num_frames)

        Returns:
            inverse_transform {tensor} -- Reconstructed audio of shape
                (num_batch, num_samples)
        """
        real_imag = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )
        n_frames = real_imag.size(-1)
        overlap_add = torch.nn.Fold(
            output_size=(1, (n_frames - 1) * self.hop_length + self.filter_length),
            kernel_size=(1, self.filter_length),
            stride=(1, self.hop_length),
        )
        # Drop the reflect padding that transform() added on both sides.
        trim = slice(self.pad_amount, -self.pad_amount)

        signal = torch.matmul(self.inverse_basis, real_imag)
        signal = overlap_add(signal)[:, 0, 0, trim]

        # Overlap-add the squared window the same way to get the per-sample
        # normalisation factor.
        window_energy = self.fft_window.pow(2).repeat(n_frames, 1).T.unsqueeze(0)
        window_energy = overlap_add(window_energy)[:, 0, 0, trim]
        signal /= window_energy
        return signal

    def forward(self, input_data):
        """Take input data (audio) to the STFT domain and back to audio.

        The intermediate magnitude and phase are kept on ``self.magnitude`` and
        ``self.phase``.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Returns:
            reconstruction {tensor} -- Reconstructed audio of shape
                (num_batch, num_samples)
        """
        self.magnitude, self.phase = self.transform(input_data, return_phase=True)
        return self.inverse(self.magnitude, self.phase)
157
-
158
-
159
- from time import time as ttime
160
-
161
-
162
class BiGRU(nn.Module):
    """Batch-first bidirectional GRU that returns only the output sequence."""

    def __init__(self, input_features, hidden_features, num_layers):
        """Create the wrapped ``nn.GRU``.

        Arguments:
            input_features {int} -- Size of each input step
            hidden_features {int} -- Hidden size per direction
            num_layers {int} -- Number of stacked GRU layers
        """
        super().__init__()
        gru_options = dict(
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.gru = nn.GRU(input_features, hidden_features, **gru_options)

    def forward(self, x):
        """Return the GRU output sequence, discarding the final hidden state."""
        sequence_output, _ = self.gru(x)
        return sequence_output
175
-
176
-
177
class ConvBlockRes(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages with a residual connection.

    When the channel count changes, a 1x1 convolution (``self.shortcut``)
    projects the input before the addition; otherwise the input is added
    unchanged and no ``shortcut`` attribute exists.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        """Build the convolution stack and, if needed, the 1x1 shortcut."""
        super().__init__()

        def conv3x3(n_in):
            # Padding 1 with stride 1 keeps the spatial size unchanged.
            return nn.Conv2d(
                in_channels=n_in,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=(1, 1),
                bias=False,
            )

        stages = [
            conv3x3(in_channels),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
            conv3x3(out_channels),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*stages)
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x: torch.Tensor):
        """Return ``conv(x)`` plus the (possibly projected) input."""
        if hasattr(self, "shortcut"):
            return self.conv(x) + self.shortcut(x)
        return self.conv(x) + x
211
-
212
-
213
class Encoder(nn.Module):
    """Stack of ``ResEncoderBlock`` stages that doubles channels at each stage.

    ``latent_channels`` records ``[out_channels, in_size]`` for every stage;
    ``out_size`` and ``out_channel`` hold the values left after the last
    stage's halving / doubling step.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        """Create the input batch-norm and ``n_encoders`` encoder stages."""
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in, stage_out, size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            stage = ResEncoderBlock(
                stage_in, stage_out, kernel_size, n_blocks, momentum=momentum
            )
            self.layers.append(stage)
            self.latent_channels.append([stage_out, size])
            # Next stage: previous output feeds in, width doubles, size halves.
            stage_in, stage_out, size = stage_out, stage_out * 2, size // 2

        self.out_size = size
        self.out_channel = stage_out

    def forward(self, x: torch.Tensor):
        """Return the final pooled tensor and the list of pre-pool tensors."""
        skips: List[torch.Tensor] = []
        x = self.bn(x)
        for stage in self.layers:
            # Each stage yields (pre-pool features, pooled features).
            skip, x = stage(x)
            skips.append(skip)
        return x, skips
249
-
250
-
251
class ResEncoderBlock(nn.Module):
    """``n_blocks`` residual conv blocks, optionally followed by average pooling.

    With ``kernel_size=None`` no pooling layer is created and ``forward``
    returns a single tensor; otherwise it returns ``(features, pooled)``.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        """Build the residual blocks and the optional ``AvgPool2d``."""
        super().__init__()
        self.n_blocks = n_blocks
        # Only the first block changes the channel count.
        blocks = [ConvBlockRes(in_channels, out_channels, momentum)]
        blocks += [
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        ]
        self.conv = nn.ModuleList(blocks)
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        """Apply the blocks; also return the pooled result when pooling is enabled."""
        for block in self.conv:
            x = block(x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
272
-
273
-
274
class Intermediate(nn.Module):
    """Chain of non-pooling ``ResEncoderBlock`` layers."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        """Create one channel-changing layer followed by ``n_inters - 1`` more."""
        super().__init__()
        self.n_inters = n_inters
        # The first layer maps in_channels -> out_channels; the rest keep the width.
        input_widths = [in_channels] + [out_channels] * (self.n_inters - 1)
        self.layers = nn.ModuleList(
            [
                ResEncoderBlock(width, out_channels, None, n_blocks, momentum)
                for width in input_widths
            ]
        )

    def forward(self, x):
        """Feed ``x`` through every layer in order."""
        for layer in self.layers:
            x = layer(x)
        return x
291
-
292
-
293
class ResDecoderBlock(nn.Module):
    """Transposed-conv upsampling followed by residual blocks over a skip concat."""

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        """Build the upsampling stage (``conv1``) and the residual stack (``conv2``)."""
        super().__init__()
        self.n_blocks = n_blocks
        if stride == (1, 2):
            output_padding = (0, 1)
        else:
            output_padding = (1, 1)
        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=output_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        # The first residual block sees the upsampled tensor concatenated with
        # the skip tensor, hence twice the channels.
        fuse_blocks = [ConvBlockRes(out_channels * 2, out_channels, momentum)]
        fuse_blocks += [
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        ]
        self.conv2 = nn.ModuleList(fuse_blocks)

    def forward(self, x, concat_tensor):
        """Upsample ``x``, concatenate ``concat_tensor`` on dim 1, run the blocks."""
        merged = torch.cat((self.conv1(x), concat_tensor), dim=1)
        for block in self.conv2:
            merged = block(merged)
        return merged
322
-
323
-
324
class Decoder(nn.Module):
    """Stack of ``ResDecoderBlock`` layers that halves the channels at each layer."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        """Create ``n_decoders`` decoder layers."""
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        width = in_channels
        for _ in range(self.n_decoders):
            halved = width // 2
            self.layers.append(
                ResDecoderBlock(width, halved, stride, n_blocks, momentum)
            )
            width = halved

    def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
        """Run the layers, pairing layer ``i`` with the ``i``-th tensor from the end."""
        for depth, layer in enumerate(self.layers):
            # Skip tensors are consumed last-to-first.
            x = layer(x, concat_tensors[-(depth + 1)])
        return x
340
-
341
-
342
class DeepUnet(nn.Module):
    """Encoder -> intermediate -> decoder network with skip connections."""

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        """Wire up the three stages; the encoder is built with an input size of 128."""
        super().__init__()
        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        # Encoder.out_channel is already doubled past the last stage, so the
        # tensor reaching the intermediate stage has half that many channels.
        bottleneck = self.encoder.out_channel
        self.intermediate = Intermediate(
            bottleneck // 2, bottleneck, inter_layers, n_blocks
        )
        self.decoder = Decoder(bottleneck, en_de_layers, kernel_size, n_blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode, transform, then decode using the encoder's skip tensors."""
        encoded, skips = self.encoder(x)
        return self.decoder(self.intermediate(encoded), skips)
371
-
372
-
373
class E2E(nn.Module):
    """U-Net feature extractor followed by a per-frame classification head.

    The head is either ``BiGRU -> Linear -> Dropout -> Sigmoid`` (when
    ``n_gru`` is truthy) or ``Linear -> Dropout -> Sigmoid``.
    """

    # Sizes used by the classification head. They match the literals of the
    # GRU branch (3 * 128 inputs, 360 outputs). The non-GRU branch previously
    # read ``nn.N_MELS`` / ``nn.N_CLASS``, which torch.nn does not define, so
    # constructing the model with ``n_gru=0`` raised AttributeError.
    N_MELS = 128
    N_CLASS = 360

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        """Build the U-Net, the 3-channel output convolution and the head.

        Arguments:
            n_blocks {int} -- Residual blocks per U-Net stage
            n_gru {int} -- Number of GRU layers; a falsy value selects the
                GRU-free head
            kernel_size -- Pooling kernel / decoder stride passed to DeepUnet

        Keyword Arguments:
            en_de_layers {int} -- Encoder/decoder depth (default: {5})
            inter_layers {int} -- Intermediate layers (default: {4})
            in_channels {int} -- Input channels (default: {1})
            en_out_channels {int} -- Channels after the first encoder stage
                (default: {16})
        """
        super().__init__()
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * 128, 256, n_gru),
                nn.Linear(512, 360),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * self.N_MELS, self.N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )

    def forward(self, mel):
        """Map a mel tensor to per-frame sigmoid activations.

        The last two axes of ``mel`` are swapped and a channel axis is inserted
        before the U-Net; the 3 output channels are then flattened together
        with the last axis to form the head's input features.
        """
        mel = mel.transpose(-1, -2).unsqueeze(1)
        features = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        return self.fc(features)
413
-
414
-
415
- from librosa.filters import mel
416
-
417
-
418
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor with optional key shift and speed change.

    ``keyshift`` (in semitones) rescales the FFT and window lengths by
    ``2 ** (keyshift / 12)``; ``speed`` rescales the hop length.
    """

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        """Build the mel filterbank buffer and store the analysis settings.

        Arguments:
            is_half {bool} -- Cast the mel output to half precision
            n_mel_channels {int} -- Number of mel bands
            sampling_rate {int} -- Audio sampling rate
            win_length {int} -- Window length at keyshift 0
            hop_length {int} -- Hop length at speed 1

        Keyword Arguments:
            n_fft {int} -- FFT size; ``None`` means ``win_length`` (default: {None})
            mel_fmin {float} -- Lowest filterbank frequency (default: {0})
            mel_fmax {float} -- Highest filterbank frequency (default: {None})
            clamp {float} -- Lower bound applied before the log (default: {1e-5})
        """
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Hann windows cached per "<keyshift>_<device>" key.
        self.hann_window = {}
        mel_basis = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        self.register_buffer("mel_basis", torch.from_numpy(mel_basis).float())
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Return the log-mel spectrogram of ``audio``.

        Arguments:
            audio {tensor} -- Waveform batch handed to the STFT

        Keyword Arguments:
            keyshift {float} -- Semitone shift applied via the FFT size (default: {0})
            speed {float} -- Hop-length multiplier (default: {1})
            center {bool} -- Forwarded to ``torch.stft``; the matrix-based STFT
                used on "privateuseone" devices does not take it (default: {True})

        Returns:
            log_mel_spec {tensor} -- ``log(clamp(mel_basis @ magnitude))``
        """
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))
        device_name = str(audio.device)

        if "privateuseone" in device_name:
            # The matrix-based STFT bakes its sizes into precomputed buffers,
            # so a cached instance is only reusable when the sizes still match.
            # Previously the first instance was reused for every later call,
            # which ignored any change of keyshift or speed.
            wanted = (n_fft_new, hop_length_new, win_length_new)
            stft = getattr(self, "stft", None)
            if stft is None or wanted != (
                stft.filter_length,
                stft.hop_length,
                stft.win_length,
            ):
                stft = STFT(
                    filter_length=n_fft_new,
                    hop_length=hop_length_new,
                    win_length=win_length_new,
                    window="hann",
                ).to(audio.device)
                self.stft = stft
            magnitude = stft.transform(audio)
        else:
            keyshift_key = str(keyshift) + "_" + device_name
            if keyshift_key not in self.hann_window:
                self.hann_window[keyshift_key] = torch.hann_window(
                    win_length_new
                ).to(audio.device)
            fft = torch.stft(
                audio,
                n_fft=n_fft_new,
                hop_length=hop_length_new,
                win_length=win_length_new,
                window=self.hann_window[keyshift_key],
                center=center,
                return_complex=True,
            )
            magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))

        if keyshift != 0:
            # Bring the spectrum back to the bin count the mel basis expects:
            # zero-pad when the shifted FFT is smaller, truncate when larger,
            # then rescale by the window-length ratio.
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new

        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half:
            mel_output = mel_output.half()
        return torch.log(torch.clamp(mel_output, min=self.clamp))
493
-
494
-
495
class RMVPE:
    """Pitch (f0) estimator that wraps an RMVPE model and its mel front end.

    The model is an ONNX Runtime session on "privateuseone" devices and a
    PyTorch (eager or TorchScript) module otherwise.
    """

    def __init__(self, model_path: str, is_half, device=None, use_jit=False):
        """Load the model and prepare the mel extractor and cents table.

        Arguments:
            model_path {str} -- Path to the PyTorch checkpoint
            is_half {bool} -- Run the PyTorch model in half precision

        Keyword Arguments:
            device -- Target device; ``None`` picks "cuda:0" when CUDA is
                available, else "cpu" (default: {None})
            use_jit {bool} -- Prefer the TorchScript model (default: {False})
        """
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        if "privateuseone" in str(device):
            import onnxruntime as ort

            self.model = ort.InferenceSession(
                "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
                providers=["DmlExecutionProvider"],
            )
        else:
            if str(self.device) == "cuda":
                self.device = torch.device("cuda:0")

            if use_jit and is_half and "cpu" in str(self.device):
                logger.warning(
                    "Use default rmvpe model. "
                    "Jit is not supported on the CPU for half floating point"
                )
                use_jit = False

            if use_jit:
                model = self._load_jit_model(model_path, device)
            else:
                model = self._load_default_model(model_path)
            self.model = model.to(device)

        # One cents value per model output bin (20-cent spacing), zero-padded
        # by 4 on each side to line up with the padded salience in
        # to_local_average_cents (368 entries).
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    @staticmethod
    def _jit_cache_path(model_path, is_half):
        """Return the TorchScript cache path that belongs to ``model_path``.

        Only a trailing ".pth" or ".pt" is removed. ``str.rstrip(".pth")``
        strips a *set of characters*, so it would turn "depth.pth" into "de".
        """
        stem = model_path
        for suffix in (".pth", ".pt"):
            if stem.endswith(suffix):
                stem = stem[: -len(suffix)]
                break
        return stem + (".half.jit" if is_half else ".jit")

    def _load_jit_model(self, model_path, device):
        """Load the cached TorchScript model, re-exporting it when needed.

        A re-export happens when no cache file exists or when the cache was
        exported for a device other than ``self.device``.
        """
        jit_model_path = self._jit_cache_path(model_path, self.is_half)
        needs_export = True
        if os.path.exists(jit_model_path):
            ckpt = jit.load(jit_model_path)
            needs_export = ckpt["device"] != str(self.device)

        if needs_export:
            ckpt = jit.rmvpe_jit_export(
                model_path=model_path,
                mode="script",
                inputs_path=None,
                save_path=jit_model_path,
                device=device,
                is_half=self.is_half,
            )
        return torch.jit.load(BytesIO(ckpt["model"]), map_location=device)

    def _load_default_model(self, model_path):
        """Build the eager ``E2E`` model and load its weights from ``model_path``."""
        model = E2E(4, 1, (2, 2))
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu")
        model.load_state_dict(ckpt)
        model.eval()
        return model.half() if self.is_half else model.float()

    def mel2hidden(self, mel):
        """Run the model on ``mel`` and return the output cut to the input frame count.

        The last axis of ``mel`` is zero-padded up to a multiple of 32 before
        the model call.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            if n_pad > 0:
                mel = F.pad(mel, (0, n_pad), mode="constant")
            if "privateuseone" in str(self.device):
                onnx_input_name = self.model.get_inputs()[0].name
                onnx_output_name = self.model.get_outputs()[0].name
                hidden = self.model.run(
                    [onnx_output_name],
                    input_feed={onnx_input_name: mel.cpu().numpy()},
                )[0]
            else:
                mel = mel.half() if self.is_half else mel.float()
                hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert a salience matrix to f0 values; frames below ``thred`` become 0."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # 0 cents marks a rejected frame and maps to exactly 10; report it as 0.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """Estimate f0 for ``audio`` (numpy array or tensor) and return a numpy array."""
        if not torch.is_tensor(audio):
            audio = torch.from_numpy(audio)
        mel = self.mel_extractor(
            audio.float().to(self.device).unsqueeze(0), center=True
        )
        hidden = self.mel2hidden(mel)
        if "privateuseone" not in str(self.device):
            hidden = hidden.squeeze(0).cpu().numpy()
        else:
            # The ONNX session already returned a numpy array.
            hidden = hidden[0]
        if self.is_half:
            hidden = hidden.astype("float32")
        return self.decode(hidden, thred=thred)

    def to_local_average_cents(self, salience, thred=0.05):
        """Return, per frame, the salience-weighted mean cents around the peak bin.

        Arguments:
            salience {ndarray} -- Array of shape (n_frames, 360)

        Keyword Arguments:
            thred {float} -- Frames whose maximum salience is <= this value
                yield 0 (default: {0.05})

        Returns:
            ndarray of shape (n_frames,)
        """
        peak = np.argmax(salience, axis=1)
        # Pad by 4 bins per side so a 9-bin window centred on any peak fits;
        # in padded coordinates the window starts at the unpadded peak index.
        padded = np.pad(salience, ((0, 0), (4, 4)))
        window = peak[:, None] + np.arange(9)
        local_salience = np.take_along_axis(padded, window, axis=1)
        local_cents = self.cents_mapping[window]
        # All-zero windows give 0/0; such frames are zeroed by the threshold
        # below, so only the warning is suppressed here.
        with np.errstate(divide="ignore", invalid="ignore"):
            cents = np.sum(local_salience * local_cents, 1) / np.sum(
                local_salience, 1
            )
        cents[np.max(padded, axis=1) <= thred] = 0
        return cents
647
-
648
-
649
if __name__ == "__main__":
    # Manual smoke test: estimate f0 for one audio file and log the result
    # shape and the elapsed time.
    # Usage: python rmvpe.py [audio_path [model_path]]
    import sys

    import librosa
    import soundfile as sf

    # The result is logged at INFO level; without this call no handler is
    # configured and the default WARNING threshold hides it.
    logging.basicConfig(level=logging.INFO)

    # The original hard-coded paths stay as defaults so a bare run behaves as before.
    default_audio_path = r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav"
    default_model_path = (
        r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
    )
    audio_path = sys.argv[1] if len(sys.argv) > 1 else default_audio_path
    model_path = sys.argv[2] if len(sys.argv) > 2 else default_model_path

    audio, sampling_rate = sf.read(audio_path)
    if len(audio.shape) > 1:
        # soundfile returns (samples, channels); librosa expects channels first.
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    thred = 0.03  # 0.01
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rmvpe = RMVPE(model_path, is_half=False, device=device)
    t0 = ttime()
    f0 = rmvpe.infer_from_audio(audio, thred=thred)
    t1 = ttime()
    logger.info("%s %.2f", f0.shape, t1 - t0)
 
1
+ from io import BytesIO
2
+ import os
3
+ from typing import List, Optional, Tuple
4
+ import numpy as np
5
+ import torch
6
+
7
+ from libs import jit
8
+
9
+ try:
10
+ # Fix "Torch not compiled with CUDA enabled"
11
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
12
+
13
+ if torch.xpu.is_available():
14
+ from infer.modules.ipex import ipex_init
15
+
16
+ ipex_init()
17
+ except Exception: # pylint: disable=broad-exception-caught
18
+ pass
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from librosa.util import normalize, pad_center, tiny
22
+ from scipy.signal import get_window
23
+
24
+ import logging
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class STFT(torch.nn.Module):
    """STFT / inverse STFT expressed as matrix products with a windowed DFT basis.

    The forward and inverse bases are computed once in ``__init__`` and stored
    as buffers; the transforms themselves only use ``pad``, ``unfold``,
    ``matmul`` and ``Fold``.
    """

    def __init__(
        self, filter_length=1024, hop_length=512, win_length=None, window="hann"
    ):
        """Precompute the windowed forward and inverse Fourier bases.

        The original author notes that not every overlap-add setup is expected
        to work; hop lengths of half the filter length (50% overlap) are the
        configuration the code was written for.

        Keyword Arguments:
            filter_length {int} -- Length of filters used (default: {1024})
            hop_length {int} -- Hop length of STFT (default: {512})
            win_length {int} -- Length of the window applied to each frame; a
                falsy value means "use filter_length" (default: {None})
            window {str} -- Window name passed to ``scipy.signal.get_window``
                (default: {'hann'})
        """
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length if win_length else filter_length
        self.window = window
        self.forward_transform = None
        self.pad_amount = int(self.filter_length / 2)

        # Keep the first filter_length / 2 + 1 DFT rows and stack their real
        # parts on top of their imaginary parts.
        n_bins = int((self.filter_length / 2 + 1))
        dft_rows = np.fft.fft(np.eye(self.filter_length))[:n_bins, :]
        stacked_basis = np.vstack([np.real(dft_rows), np.imag(dft_rows)])
        analysis_basis = torch.FloatTensor(stacked_basis)
        synthesis_basis = torch.FloatTensor(np.linalg.pinv(stacked_basis))

        assert filter_length >= self.win_length
        # Build the window, then zero-pad it (centered) up to filter_length.
        padded_window = pad_center(
            get_window(window, self.win_length, fftbins=True), size=filter_length
        )
        padded_window = torch.from_numpy(padded_window).float()

        # Apply the window to both bases.
        analysis_basis = analysis_basis * padded_window
        synthesis_basis = (synthesis_basis.T * padded_window).T

        self.register_buffer("forward_basis", analysis_basis.float())
        self.register_buffer("inverse_basis", synthesis_basis.float())
        self.register_buffer("fft_window", padded_window.float())

    def transform(self, input_data, return_phase=False):
        """Take input data (audio) to the STFT domain.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Keyword Arguments:
            return_phase {bool} -- Also return the phase (default: {False})

        Returns:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Only when ``return_phase`` is true; same shape as
                the magnitude
        """
        padded = F.pad(
            input_data, (self.pad_amount, self.pad_amount), mode="reflect"
        )
        # Slice the padded signal into overlapping frames, one per column.
        frames = padded.unfold(1, self.filter_length, self.hop_length)
        frames = frames.permute(0, 2, 1)
        spectrum = torch.matmul(self.forward_basis, frames)

        # The basis stacks real rows first, imaginary rows second.
        n_bins = int((self.filter_length / 2) + 1)
        real_part, imag_part = spectrum[:, :n_bins, :], spectrum[:, n_bins:, :]
        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        if not return_phase:
            return magnitude
        return magnitude, torch.atan2(imag_part.data, real_part.data)

    def inverse(self, magnitude, phase):
        """Run the inverse STFT on magnitude/phase tensors produced by ``transform``.

        Arguments:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Phase of STFT with shape (num_batch,
                num_frequencies, num_frames)

        Returns:
            inverse_transform {tensor} -- Reconstructed audio of shape
                (num_batch, num_samples)
        """
        real_imag = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )
        n_frames = real_imag.size(-1)
        overlap_add = torch.nn.Fold(
            output_size=(1, (n_frames - 1) * self.hop_length + self.filter_length),
            kernel_size=(1, self.filter_length),
            stride=(1, self.hop_length),
        )
        # Drop the reflect padding that transform() added on both sides.
        trim = slice(self.pad_amount, -self.pad_amount)

        signal = torch.matmul(self.inverse_basis, real_imag)
        signal = overlap_add(signal)[:, 0, 0, trim]

        # Overlap-add the squared window the same way to get the per-sample
        # normalisation factor.
        window_energy = self.fft_window.pow(2).repeat(n_frames, 1).T.unsqueeze(0)
        window_energy = overlap_add(window_energy)[:, 0, 0, trim]
        signal /= window_energy
        return signal

    def forward(self, input_data):
        """Take input data (audio) to the STFT domain and back to audio.

        The intermediate magnitude and phase are kept on ``self.magnitude`` and
        ``self.phase``.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Returns:
            reconstruction {tensor} -- Reconstructed audio of shape
                (num_batch, num_samples)
        """
        self.magnitude, self.phase = self.transform(input_data, return_phase=True)
        return self.inverse(self.magnitude, self.phase)
157
+
158
+
159
+ from time import time as ttime
160
+
161
+
162
class BiGRU(nn.Module):
    """Batch-first bidirectional GRU that returns only the output sequence."""

    def __init__(self, input_features, hidden_features, num_layers):
        """Create the wrapped ``nn.GRU``.

        Arguments:
            input_features {int} -- Size of each input step
            hidden_features {int} -- Hidden size per direction
            num_layers {int} -- Number of stacked GRU layers
        """
        super().__init__()
        gru_options = dict(
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.gru = nn.GRU(input_features, hidden_features, **gru_options)

    def forward(self, x):
        """Return the GRU output sequence, discarding the final hidden state."""
        sequence_output, _ = self.gru(x)
        return sequence_output
175
+
176
+
177
class ConvBlockRes(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages with a residual connection.

    When the channel count changes, a 1x1 convolution (``self.shortcut``)
    projects the input before the addition; otherwise the input is added
    unchanged and no ``shortcut`` attribute exists.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        """Build the convolution stack and, if needed, the 1x1 shortcut."""
        super().__init__()

        def conv3x3(n_in):
            # Padding 1 with stride 1 keeps the spatial size unchanged.
            return nn.Conv2d(
                in_channels=n_in,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=(1, 1),
                bias=False,
            )

        stages = [
            conv3x3(in_channels),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
            conv3x3(out_channels),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*stages)
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x: torch.Tensor):
        """Return ``conv(x)`` plus the (possibly projected) input."""
        if hasattr(self, "shortcut"):
            return self.conv(x) + self.shortcut(x)
        return self.conv(x) + x
211
+
212
+
213
class Encoder(nn.Module):
    """Stack of ``ResEncoderBlock`` stages that doubles channels at each stage.

    ``latent_channels`` records ``[out_channels, in_size]`` for every stage;
    ``out_size`` and ``out_channel`` hold the values left after the last
    stage's halving / doubling step.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        """Create the input batch-norm and ``n_encoders`` encoder stages."""
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in, stage_out, size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            stage = ResEncoderBlock(
                stage_in, stage_out, kernel_size, n_blocks, momentum=momentum
            )
            self.layers.append(stage)
            self.latent_channels.append([stage_out, size])
            # Next stage: previous output feeds in, width doubles, size halves.
            stage_in, stage_out, size = stage_out, stage_out * 2, size // 2

        self.out_size = size
        self.out_channel = stage_out

    def forward(self, x: torch.Tensor):
        """Return the final pooled tensor and the list of pre-pool tensors."""
        skips: List[torch.Tensor] = []
        x = self.bn(x)
        for stage in self.layers:
            # Each stage yields (pre-pool features, pooled features).
            skip, x = stage(x)
            skips.append(skip)
        return x, skips
249
+
250
+
251
class ResEncoderBlock(nn.Module):
    """``n_blocks`` residual conv blocks, optionally followed by average pooling.

    With ``kernel_size=None`` no pooling layer is created and ``forward``
    returns a single tensor; otherwise it returns ``(features, pooled)``.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        """Build the residual blocks and the optional ``AvgPool2d``."""
        super().__init__()
        self.n_blocks = n_blocks
        # Only the first block changes the channel count.
        blocks = [ConvBlockRes(in_channels, out_channels, momentum)]
        blocks += [
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        ]
        self.conv = nn.ModuleList(blocks)
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        """Apply the blocks; also return the pooled result when pooling is enabled."""
        for block in self.conv:
            x = block(x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
272
+
273
+
274
class Intermediate(nn.Module):
    """Chain of non-pooling ``ResEncoderBlock`` layers."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        """Create one channel-changing layer followed by ``n_inters - 1`` more."""
        super().__init__()
        self.n_inters = n_inters
        # The first layer maps in_channels -> out_channels; the rest keep the width.
        input_widths = [in_channels] + [out_channels] * (self.n_inters - 1)
        self.layers = nn.ModuleList(
            [
                ResEncoderBlock(width, out_channels, None, n_blocks, momentum)
                for width in input_widths
            ]
        )

    def forward(self, x):
        """Feed ``x`` through every layer in order."""
        for layer in self.layers:
            x = layer(x)
        return x
291
+
292
+
293
class ResDecoderBlock(nn.Module):
    """Transposed-conv upsampling followed by ``ConvBlockRes`` units.

    ``forward`` upsamples ``x`` with ``conv1``, concatenates the result with
    ``concat_tensor`` along dim 1 and runs the ``conv2`` units on it.
    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        # The output padding is chosen from the stride.
        if stride == (1, 2):
            out_padding = (0, 1)
        else:
            out_padding = (1, 1)
        self.n_blocks = n_blocks
        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=out_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        # The first unit consumes the concatenation (2 * out_channels).
        fuse_units = [ConvBlockRes(out_channels * 2, out_channels, momentum)]
        fuse_units.extend(
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        )
        self.conv2 = nn.ModuleList(fuse_units)

    def forward(self, x, concat_tensor):
        """Upsample ``x``, concatenate ``concat_tensor`` on dim 1, refine."""
        merged = torch.cat((self.conv1(x), concat_tensor), dim=1)
        for unit in self.conv2:
            merged = unit(merged)
        return merged
322
+
323
+
324
class Decoder(nn.Module):
    """Sequence of ``ResDecoderBlock`` layers, each halving the channel count.

    ``forward`` pairs the layers with ``concat_tensors`` in reverse order:
    the first layer receives the last tensor of the list.
    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        width = in_channels
        for _ in range(n_decoders):
            narrower = width // 2
            self.layers.append(
                ResDecoderBlock(width, narrower, stride, n_blocks, momentum)
            )
            width = narrower

    def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
        """Apply every layer, feeding it the matching tensor from the end."""
        for idx, block in enumerate(self.layers):
            skip = concat_tensors[-1 - idx]
            x = block(x, skip)
        return x
340
+
341
+
342
class DeepUnet(nn.Module):
    """Encoder -> intermediate -> decoder network.

    The encoder is created with an input size of 128. The intermediate
    stage maps ``encoder.out_channel // 2`` channels to
    ``encoder.out_channel``, which is also the decoder's input width.
    """

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        bottleneck_width = self.encoder.out_channel
        self.intermediate = Intermediate(
            bottleneck_width // 2,
            bottleneck_width,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(bottleneck_width, en_de_layers, kernel_size, n_blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x``, run the intermediate stage and decode with the skips."""
        encoded, concat_tensors = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, concat_tensors)
371
+
372
+
373
class E2E(nn.Module):
    """``DeepUnet`` followed by a 3-channel conv and a per-frame classifier.

    Args:
        n_blocks: Number of residual blocks per U-Net layer.
        n_gru: Number of GRU layers in the head; a falsy value selects a
            plain linear head instead.
        kernel_size: Pooling kernel / decoder stride passed to the U-Net.
        en_de_layers: Number of encoder and decoder layers.
        inter_layers: Number of intermediate layers.
        in_channels: Number of input channels of the U-Net.
        en_out_channels: Channel count of the first encoder layer, which is
            also the input width of ``self.cnn``.
    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        # Sizes of the head. The GRU branch hard-coded 128 and 360; the
        # linear branch referenced the non-existent ``nn.N_MELS`` and
        # ``nn.N_CLASS`` and raised AttributeError. Both now share these.
        n_mels = 128
        n_class = 360
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * n_mels, 256, n_gru),
                nn.Linear(512, n_class),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * n_mels, n_class), nn.Dropout(0.25), nn.Sigmoid()
            )

    def forward(self, mel):
        """Map a mel input to the output of the classifier head.

        The last two dims of ``mel`` are swapped and a channel dim is
        inserted at position 1 before the U-Net. After ``self.cnn`` dims 1
        and 2 are swapped and the last two dims are flattened, so the head
        sees one feature vector per position of dim 1.
        """
        mel = mel.transpose(-1, -2).unsqueeze(1)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
413
+
414
+
415
+ from librosa.filters import mel
416
+
417
+
418
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor with optional key shift and speed change.

    Args:
        is_half: When truthy the mel output is cast to half precision
            before the log is taken.
        n_mel_channels: Number of mel bands.
        sampling_rate: Sampling rate passed to the mel filterbank builder.
        win_length: Window length of the STFT.
        hop_length: Hop length of the STFT.
        n_fft: FFT size; defaults to ``win_length`` when ``None``.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency.
        clamp: Lower bound applied to the mel output before the log.
    """

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Cache of Hann windows keyed by "<keyshift>_<device>".
        self.hann_window = {}
        mel_basis = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Return the log-mel spectrogram of ``audio``.

        Args:
            audio: Waveform tensor accepted by ``torch.stft``.
            keyshift: Shift in semitones; FFT size and window length are
                scaled by ``2 ** (keyshift / 12)``.
            speed: Factor applied to the hop length.
            center: Forwarded to ``torch.stft``. It is not used on the
                ``privateuseone`` path.
        """
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))
        keyshift_key = str(keyshift) + "_" + str(audio.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
                audio.device
            )
        if "privateuseone" in str(audio.device):
            # Rebuild the conv-based STFT whenever its sizes or the device
            # change. Caching on attribute existence alone reused an STFT
            # built for an earlier keyshift/speed.
            stft_key = (
                n_fft_new,
                hop_length_new,
                win_length_new,
                str(audio.device),
            )
            if getattr(self, "_stft_key", None) != stft_key:
                self.stft = STFT(
                    filter_length=n_fft_new,
                    hop_length=hop_length_new,
                    win_length=win_length_new,
                    window="hann",
                ).to(audio.device)
                self._stft_key = stft_key
            magnitude = self.stft.transform(audio)
        else:
            fft = torch.stft(
                audio,
                n_fft=n_fft_new,
                hop_length=hop_length_new,
                win_length=win_length_new,
                window=self.hann_window[keyshift_key],
                center=center,
                return_complex=True,
            )
            magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
        if keyshift != 0:
            # Bring the frequency axis back to the bin count the mel basis
            # was built for, then rescale for the changed window length.
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half:
            mel_output = mel_output.half()
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
        return log_mel_spec
493
+
494
+
495
class RMVPE:
    """Pitch (f0) estimator wrapping an ``E2E`` model or an ONNX session.

    Args:
        model_path: Path of the PyTorch checkpoint. On ``privateuseone``
            devices it is not used; ``$rmvpe_root/rmvpe.onnx`` is loaded.
        is_half: Run the torch model and mel extractor in half precision.
        device: Target device; defaults to ``"cuda:0"`` when CUDA is
            available, otherwise ``"cpu"``.
        use_jit: Load (or export) a TorchScript version of the model.
    """

    def __init__(self, model_path: str, is_half, device=None, use_jit=False):
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        if "privateuseone" in str(device):
            import onnxruntime as ort

            ort_session = ort.InferenceSession(
                "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
                providers=["DmlExecutionProvider"],
            )
            self.model = ort_session
        else:
            if str(self.device) == "cuda":
                self.device = torch.device("cuda:0")

            def get_jit_model():
                jit_model_path = self._jit_path_for(model_path, is_half)
                reload = False
                if os.path.exists(jit_model_path):
                    ckpt = jit.load(jit_model_path)
                    model_device = ckpt["device"]
                    # Re-export when the cached model targets another device.
                    if model_device != str(self.device):
                        reload = True
                else:
                    reload = True

                if reload:
                    ckpt = jit.rmvpe_jit_export(
                        model_path=model_path,
                        mode="script",
                        inputs_path=None,
                        save_path=jit_model_path,
                        device=device,
                        is_half=is_half,
                    )
                model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
                return model

            def get_default_model():
                model = E2E(4, 1, (2, 2))
                # NOTE(review): torch.load unpickles the file; only load
                # checkpoints from a trusted source.
                ckpt = torch.load(model_path, map_location="cpu")
                model.load_state_dict(ckpt)
                model.eval()
                if is_half:
                    model = model.half()
                else:
                    model = model.float()
                return model

            if use_jit:
                if is_half and "cpu" in str(self.device):
                    logger.warning(
                        "Use default rmvpe model. \
                        Jit is not supported on the CPU for half floating point"
                    )
                    self.model = get_default_model()
                else:
                    self.model = get_jit_model()
            else:
                self.model = get_default_model()

            self.model = self.model.to(device)
        # Cent value of each of the 360 output bins, zero-padded by 4 on
        # both sides (368 entries) to match the padded salience.
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    @staticmethod
    def _jit_path_for(model_path, is_half):
        """Return the TorchScript cache path derived from ``model_path``.

        A trailing ``.pt``/``.pth`` extension is removed as a suffix.
        ``str.rstrip(".pth")`` strips a character set and would also eat
        trailing ``p``/``t``/``h``/``.`` characters of the file stem.
        """
        stem, ext = os.path.splitext(model_path)
        base = stem if ext in (".pt", ".pth") else model_path
        return base + (".half.jit" if is_half else ".jit")

    def mel2hidden(self, mel):
        """Run the model on ``mel`` and return the output cut to its frames.

        The last axis of ``mel`` is zero-padded up to a multiple of 32
        before the model is called; the padding is cut from the result.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            if n_pad > 0:
                mel = F.pad(mel, (0, n_pad), mode="constant")
            if "privateuseone" in str(self.device):
                onnx_input_name = self.model.get_inputs()[0].name
                onnx_outputs_names = self.model.get_outputs()[0].name
                hidden = self.model.run(
                    [onnx_outputs_names],
                    input_feed={onnx_input_name: mel.cpu().numpy()},
                )[0]
            else:
                mel = mel.half() if self.is_half else mel.float()
                hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert a salience matrix to f0 values, 0 where below ``thred``."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # A cent value of 0 marks a frame below the threshold; it maps to
        # exactly 10 above and is reset to 0 here.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """Estimate f0 for a waveform given as a numpy array or tensor."""
        if not torch.is_tensor(audio):
            audio = torch.from_numpy(audio)
        mel = self.mel_extractor(
            audio.float().to(self.device).unsqueeze(0), center=True
        )
        hidden = self.mel2hidden(mel)
        if "privateuseone" not in str(self.device):
            hidden = hidden.squeeze(0).cpu().numpy()
        else:
            # The ONNX session already returns a numpy array.
            hidden = hidden[0]
        if self.is_half:
            hidden = hidden.astype("float32")

        f0 = self.decode(hidden, thred=thred)
        return f0

    def to_local_average_cents(self, salience, thred=0.05):
        """Return the salience-weighted cent value around each frame's peak.

        For every row of ``salience`` (frames x 360) a 9-bin window centred
        on the argmax is averaged, weighted by salience. Frames whose peak
        is ``<= thred`` yield 0. A zero-frame input yields an empty array.
        """
        peak = np.argmax(salience, axis=1)  # (frames,)
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (frames, 368)
        # After padding by 4 the window [peak - 4, peak + 4] starts at
        # column ``peak`` of the padded array.
        cols = peak[:, None] + np.arange(9)  # (frames, 9)
        rows = np.arange(salience.shape[0])[:, None]
        todo_salience = salience[rows, cols]  # (frames, 9)
        todo_cents_mapping = self.cents_mapping[cols]  # (frames, 9)
        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)  # (frames,)
        devided = product_sum / weight_sum  # (frames,)
        maxx = np.max(salience, axis=1)  # (frames,)
        devided[maxx <= thred] = 0
        return devided
647
+
648
+
649
if __name__ == "__main__":
    # Manual smoke test: load a wav file, run the estimator once and log
    # the f0 shape and elapsed time.
    # ``ttime`` is imported here; the script calls it below.
    from time import time as ttime

    import librosa
    import soundfile as sf

    # Without a handler the INFO record below would not be shown.
    logging.basicConfig(level=logging.INFO)

    audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
    thred = 0.03  # 0.01
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rmvpe = RMVPE(model_path, is_half=False, device=device)
    t0 = ttime()
    f0 = rmvpe.infer_from_audio(audio, thred=thred)
    t1 = ttime()
    logger.info("%s %.2f", f0.shape, t1 - t0)