added torchcrepe
- app.py +21 -50
- requirements.txt +2 -4
app.py
CHANGED
@@ -30,8 +30,7 @@ import torchaudio
 from absl import app
 from torch.nn.functional import interpolate
 import logging
-import
-from hmmlearn import hmm
+import torchcrepe
 import soundfile as sf
 import pdb
 from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
@@ -54,54 +53,23 @@ def debug_profile(func):
         return pp.profile(sort_by='cumulative', out_lines=10)(func)
     return func

-def predict_voicing(confidence):
-    # https://github.com/marl/crepe/pull/26
-    """
-    Find the Viterbi path for voiced versus unvoiced frames.
-    Parameters
-    ----------
-    confidence : np.ndarray [shape=(N,)]
-        voicing confidence array, i.e. the confidence in the presence of
-        a pitch
-    Returns
-    -------
-    voicing_states : np.ndarray [shape=(N,)]
-        HMM predictions for each frames state, 0 if unvoiced, 1 if
-        voiced
-    """
-    # uniform prior on the voicing confidence
-    starting = np.array([0.5, 0.5])
-
-    # transition probabilities inducing continuous voicing state
-    transition = np.array([[0.99, 0.01], [0.01, 0.99]])
-
-    # mean and variance for unvoiced and voiced states
-    means = np.array([[0.0], [1.0]])
-    variances = np.array([[0.25], [0.25]])
-
-    # fix the model parameters because we are not optimizing the model
-    model = hmm.GaussianHMM(n_components=2)
-    model.startprob_, model.covars_, model.transmat_, model.means_, \
-        model.n_features = starting, variances, transition, means, 1
-
-    # find the Viterbi path
-    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])
-
-    return np.array(voicing_states)

 def extract_pitch(audio, unvoice=True, sr=16000, frame_shift_ms=10, log=True):
-
-
-
-
-
-    f0 =
-
-
-
-
+    if not isinstance(audio, torch.Tensor):
+        audio = torch.Tensor(audio).to(device)
+    if len(audio.shape) == 1:
+        audio = audio.unsqueeze(0)
+    hop_length = int(sr * frame_shift_ms / 1000)
+    f0 = torchcrepe.predict(audio,
+                            sr,
+                            hop_length=hop_length,
+                            model='tiny',
+                            device=device,
+                            fmin=80,
+                            fmax=800
+                            )

-    return
+    return f0.squeeze(0)

 def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, noise_std=0.4, t0=0.5):
     '''Generate pitch values for the melodic reinterpretation task'''
@@ -219,12 +187,15 @@ def container_generate(model_selection, task_selection, audio, singer_id, t0):
     # make sure the audio is at least 4 s long
     audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
     audio = audio.astype(np.float32)
-    audio /= np.max(np.abs(audio))
+    audio /= np.max(np.abs(audio) + np.finfo(float).eps) # normalize audio
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) # convert only last 4 s
     mic_audio = audio.copy()
     audio = audio[-12*16000:] # consider only last 12 s
-
-
+    f0 = extract_pitch(audio)
+    # move f0 to cpu
+    if f0.device != 'cpu': #TODO:
+        f0 = f0.cpu()
+    mic_f0 = f0.clone() # save the user input pitch values
     logging.log(logging.INFO, 'Pitch extracted')
     f0 = pitch_task_fn(**{
         'inputs': {
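For quick reference, a minimal standalone sketch of the torchcrepe call that the new extract_pitch wraps, using the same parameters as the diff above (model='tiny', fmin=80, fmax=800, 10 ms hop). The one-second silent input, the CPU device string, and the exact frame count in the final comment are illustrative assumptions, not part of the commit:

import torch
import torchcrepe

sr = 16000
frame_shift_ms = 10
hop_length = int(sr * frame_shift_ms / 1000)  # 160 samples, i.e. a 10 ms hop

# torchcrepe expects a (batch, samples) tensor; one second of silence as a stand-in
audio = torch.zeros(1, sr)

f0 = torchcrepe.predict(audio,
                        sr,
                        hop_length=hop_length,
                        model='tiny',   # smallest CREPE variant, as in the diff
                        device='cpu',
                        fmin=80,
                        fmax=800)

# With default padding, torchcrepe returns roughly one pitch frame per hop,
# so a one-second input at a 10 ms hop yields about 100 frames.
print(f0.shape)  # e.g. torch.Size([1, 101])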
requirements.txt
CHANGED
@@ -1,4 +1,2 @@
-
-
-tensorflow==2.17.0
-GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@055df71380e0feced7e409470ffc8603f1cfa926
+torchcrepe==0.0.23
+GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@8f781e6a580bf2db794bcc813913a2a5e9efde99
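One small change outside the pitch path: the normalization in container_generate now adds machine epsilon before dividing, so an all-silent buffer no longer produces NaNs. A minimal illustration of the guard, assuming a silent NumPy test buffer (not part of the commit):

import numpy as np

silence = np.zeros(16000, dtype=np.float32)

# Old form: max(|audio|) is 0 for silence, so 0 / 0 fills the buffer with NaN
unsafe = silence / np.max(np.abs(silence))

# New form from the diff: the added eps keeps the denominator non-zero
safe = silence / np.max(np.abs(silence) + np.finfo(float).eps)

print(np.isnan(unsafe).all(), np.isnan(safe).any())  # True False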