added torchcrepe
- app.py +21 -50
- requirements.txt +2 -4
app.py
CHANGED
@@ -30,8 +30,7 @@ import torchaudio
 from absl import app
 from torch.nn.functional import interpolate
 import logging
-import
-from hmmlearn import hmm
+import torchcrepe
 import soundfile as sf
 import pdb
 from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
@@ -54,54 +53,23 @@ def debug_profile(func):
         return pp.profile(sort_by='cumulative', out_lines=10)(func)
     return func

-def predict_voicing(confidence):
-    # https://github.com/marl/crepe/pull/26
-    """
-    Find the Viterbi path for voiced versus unvoiced frames.
-    Parameters
-    ----------
-    confidence : np.ndarray [shape=(N,)]
-        voicing confidence array, i.e. the confidence in the presence of
-        a pitch
-    Returns
-    -------
-    voicing_states : np.ndarray [shape=(N,)]
-        HMM predictions for each frames state, 0 if unvoiced, 1 if
-        voiced
-    """
-    # uniform prior on the voicing confidence
-    starting = np.array([0.5, 0.5])
-
-    # transition probabilities inducing continuous voicing state
-    transition = np.array([[0.99, 0.01], [0.01, 0.99]])
-
-    # mean and variance for unvoiced and voiced states
-    means = np.array([[0.0], [1.0]])
-    variances = np.array([[0.25], [0.25]])
-
-    # fix the model parameters because we are not optimizing the model
-    model = hmm.GaussianHMM(n_components=2)
-    model.startprob_, model.covars_, model.transmat_, model.means_, \
-        model.n_features = starting, variances, transition, means, 1
-
-    # find the Viterbi path
-    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])
-
-    return np.array(voicing_states)

 def extract_pitch(audio, unvoice=True, sr=16000, frame_shift_ms=10, log=True):
-
-
-
-
-
-    f0 =
-
-
-
-
+    if not isinstance(audio, torch.Tensor):
+        audio = torch.Tensor(audio).to(device)
+    if len(audio.shape) == 1:
+        audio = audio.unsqueeze(0)
+    hop_length = int(sr * frame_shift_ms / 1000)
+    f0 = torchcrepe.predict(audio,
+                            sr,
+                            hop_length=hop_length,
+                            model='tiny',
+                            device=device,
+                            fmin=80,
+                            fmax=800
+                            )

-    return
+    return f0.squeeze(0)

 def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, noise_std=0.4, t0=0.5):
     '''Generate pitch values for the melodic reinterpretation task'''
@@ -219,12 +187,15 @@ def container_generate(model_selection, task_selection, audio, singer_id, t0):
     # make sure the audio is at least 4 s long
     audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
     audio = audio.astype(np.float32)
-    audio /= np.max(np.abs(audio))
+    audio /= np.max(np.abs(audio) + np.finfo(float).eps) # normalize audio
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) # convert only last 4 s
     mic_audio = audio.copy()
     audio = audio[-12*16000:] # consider only last 12 s
-
-
+    f0 = extract_pitch(audio)
+    # move f0 to cpu
+    if f0.device != 'cpu': #TODO:
+        f0 = f0.cpu()
+    mic_f0 = f0.clone() # save the user input pitch values
     logging.log(logging.INFO, 'Pitch extracted')
     f0 = pitch_task_fn(**{
         'inputs': {
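For quick reference, a minimal standalone sketch of the torchcrepe call that the new extract_pitch wraps, using the same parameters as the diff above (model='tiny', fmin=80, fmax=800, 10 ms hop). The one-second silent input, the CPU device string, and the exact frame count in the final comment are illustrative assumptions, not part of the commit:

import torch
import torchcrepe

sr = 16000
frame_shift_ms = 10
hop_length = int(sr * frame_shift_ms / 1000)  # 160 samples, i.e. a 10 ms hop

# torchcrepe expects a (batch, samples) tensor; one second of silence as a stand-in
audio = torch.zeros(1, sr)

f0 = torchcrepe.predict(audio,
                        sr,
                        hop_length=hop_length,
                        model='tiny',   # smallest CREPE variant, as in the diff
                        device='cpu',
                        fmin=80,
                        fmax=800)

# With default padding, torchcrepe returns roughly one pitch frame per hop,
# so a one-second input at a 10 ms hop yields about 100 frames.
print(f0.shape)  # e.g. torch.Size([1, 101])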
requirements.txt
CHANGED
@@ -1,4 +1,2 @@
-
-
-tensorflow==2.17.0
-GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@055df71380e0feced7e409470ffc8603f1cfa926
+torchcrepe==0.0.23
+GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@8f781e6a580bf2db794bcc813913a2a5e9efde99
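One small change outside the pitch path: the normalization in container_generate now adds machine epsilon before dividing, so an all-silent buffer no longer produces NaNs. A minimal illustration of the guard, assuming a silent NumPy test buffer (not part of the commit):

import numpy as np

silence = np.zeros(16000, dtype=np.float32)

# Old form: max(|audio|) is 0 for silence, so 0 / 0 fills the buffer with NaN
unsafe = silence / np.max(np.abs(silence))

# New form from the diff: the added eps keeps the denominator non-zero
safe = silence / np.max(np.abs(silence) + np.finfo(float).eps)

print(np.isnan(unsafe).all(), np.isnan(safe).any())  # True False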