dodo12

Runtime error

App Files Files Community

pengdaqian committed on May 12, 2023

Commit

2e2adc3

1 Parent(s): 3f4fdab

fix

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +43 -16
requirements.txt +0 -1
whisper/inference.py +3 -2

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .idea/

app.py CHANGED Viewed

@@ -1,20 +1,24 @@
 from music.search import get_random_spit, get_albums
 from vits.models import SynthesizerInfer
 from omegaconf import OmegaConf
 import torchcrepe
 import torch
 import io
-import os
 import gradio as gr
 import librosa
 import numpy as np
 import soundfile
 import random
-from audio2numpy import open_audio
 from spleeter.separator import Separator
 from spleeter.audio.adapter import AudioAdapter
 from pydub import AudioSegment
 import scipy.io.wavfile
 import logging
@@ -84,11 +88,13 @@ model.eval()
 model.to(device)
 separator = Separator('spleeter:2stems')
 audio_loader = AudioAdapter.default()
 def svc_change(argswave, argsspk):
     argsppg = "svc_tmp.ppg.npy"
-    os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
     spk = np.load(argsspk)
     spk = torch.FloatTensor(spk)
@@ -120,16 +126,16 @@ def svc_change(argswave, argsspk):
         out_audio = []
         has_audio = False
-        while (out_index + out_chunk < all_frame):
             has_audio = True
-            if (out_index == 0):  # start frame
                 cut_s = out_index
                 cut_s_48k = 0
             else:
                 cut_s = out_index - hop_frame
                 cut_s_48k = hop_frame * hop_size
-            if (out_index + out_chunk + hop_frame > all_frame):  # end frame
                 cut_e = out_index + out_chunk
                 cut_e_48k = 0
             else:
@@ -148,8 +154,8 @@ def svc_change(argswave, argsspk):
             out_audio.extend(sub_out)
             out_index = out_index + out_chunk
-        if (out_index < all_frame):
-            if (has_audio):
                 cut_s = out_index - hop_frame
                 cut_s_48k = hop_frame * hop_size
             else:
@@ -177,23 +183,40 @@ def np_to_audio_segment(fp_arr):
     return sound
 def svc_main(sid, input_audio):
     if input_audio is None:
         return "You need to upload an audio", None
     sampling_rate, audio = input_audio
-    input_audio_tmp_file = 'origin.wav'
     #
     # prediction = separator.separate(audio)
     # vocals, accompaniment = prediction["vocals"], prediction["accompaniment"]
     soundfile.write(input_audio_tmp_file, audio, sampling_rate, format="wav")
-    separator.separate_to_file(input_audio_tmp_file, '')
-    vocals_filepath = os.path.join(os.path.splitext(input_audio_tmp_file)[0], 'vocals.wav')
-    accompaniment_filepath = os.path.join(os.path.splitext(input_audio_tmp_file)[0], 'accompaniment.wav')
     vocals, sampling_rate = soundfile.read(vocals_filepath)
-    vocals = (vocals / np.iinfo(vocals.dtype).max).astype(np.float32)
     if len(vocals.shape) > 1:
         vocals = librosa.to_mono(vocals.transpose(1, 0))
     if sampling_rate != 16000:
@@ -204,7 +227,7 @@ def svc_main(sid, input_audio):
     soundfile.write(wav_path, vocals, 16000, format="wav")
     out_vocals = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-    out_vocals_filepath = os.path.join(os.path.splitext(input_audio_tmp_file)[0], 'out_vocals.wav')
     soundfile.write(out_vocals_filepath, out_vocals, 48000, format="wav")
     sound1 = AudioSegment.from_file(out_vocals_filepath)
@@ -212,7 +235,11 @@ def svc_main(sid, input_audio):
     played_togther = sound1.overlay(sound2)
-    return "Success", (48000, played_togther)
 def auto_search(name):
@@ -221,7 +248,7 @@ def auto_search(name):
     album = random.choice(albums)
     save_path = get_random_spit(album)
     fp = save_path
-    signal, sampling_rate = open_audio(fp)
     return sampling_rate, signal

+import os
+os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
 from music.search import get_random_spit, get_albums
 from vits.models import SynthesizerInfer
+import whisper.inference
 from omegaconf import OmegaConf
 import torchcrepe
 import torch
 import io
 import gradio as gr
 import librosa
 import numpy as np
 import soundfile
 import random
 from spleeter.separator import Separator
 from spleeter.audio.adapter import AudioAdapter
 from pydub import AudioSegment
 import scipy.io.wavfile
+import uuid
 import logging
 model.to(device)
 separator = Separator('spleeter:2stems')
 audio_loader = AudioAdapter.default()
+whisper_model = whisper.inference.load_model(os.path.join("whisper_pretrain", "medium.pt"))
 def svc_change(argswave, argsspk):
     argsppg = "svc_tmp.ppg.npy"
+    whisper.inference.pred_ppg(whisper_model, argswave, argsppg)
+    # os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
     spk = np.load(argsspk)
     spk = torch.FloatTensor(spk)
         out_audio = []
         has_audio = False
+        while out_index + out_chunk < all_frame:
             has_audio = True
+            if out_index == 0:  # start frame
                 cut_s = out_index
                 cut_s_48k = 0
             else:
                 cut_s = out_index - hop_frame
                 cut_s_48k = hop_frame * hop_size
+            if out_index + out_chunk + hop_frame > all_frame:  # end frame
                 cut_e = out_index + out_chunk
                 cut_e_48k = 0
             else:
             out_audio.extend(sub_out)
             out_index = out_index + out_chunk
+        if out_index < all_frame:
+            if has_audio:
                 cut_s = out_index - hop_frame
                 cut_s_48k = hop_frame * hop_size
             else:
     return sound
+def get_dtype_max_value(dtype):
+    if np.issubdtype(dtype, np.integer):
+        info = np.iinfo(dtype)
+        return info.max
+    elif np.issubdtype(dtype, np.floating):
+        info = np.finfo(dtype)
+        return info.max
+    else:
+        raise ValueError("不支持的 dtype 类型")
 def svc_main(sid, input_audio):
     if input_audio is None:
         return "You need to upload an audio", None
     sampling_rate, audio = input_audio
+    uuid_value = uuid.uuid4()
+    uuid_string = str(uuid_value)
+    input_audio_tmp_file = f'{uuid_string}.wav'
+    tmpfile_path = '/tmp'
     #
     # prediction = separator.separate(audio)
     # vocals, accompaniment = prediction["vocals"], prediction["accompaniment"]
     soundfile.write(input_audio_tmp_file, audio, sampling_rate, format="wav")
+    if not os.path.exists(tmpfile_path):
+        os.makedirs(tmpfile_path)
+    separator.separate_to_file(input_audio_tmp_file, tmpfile_path)
+    curr_tmp_path = os.path.join(tmpfile_path, os.path.splitext(input_audio_tmp_file)[0])
+    vocals_filepath = os.path.join(curr_tmp_path, 'vocals.wav')
+    accompaniment_filepath = os.path.join(curr_tmp_path, 'accompaniment.wav')
     vocals, sampling_rate = soundfile.read(vocals_filepath)
     if len(vocals.shape) > 1:
         vocals = librosa.to_mono(vocals.transpose(1, 0))
     if sampling_rate != 16000:
     soundfile.write(wav_path, vocals, 16000, format="wav")
     out_vocals = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
+    out_vocals_filepath = os.path.join(curr_tmp_path, 'out_vocals.wav')
     soundfile.write(out_vocals_filepath, out_vocals, 48000, format="wav")
     sound1 = AudioSegment.from_file(out_vocals_filepath)
     played_togther = sound1.overlay(sound2)
+    result_path = os.path.join(curr_tmp_path, 'out_song.wav')
+    played_togther.export(result_path, format="wav")
+    result, sampling_rate = soundfile.read(result_path)
+    return "Success", (sampling_rate, result)
 def auto_search(name):
     album = random.choice(albums)
     save_path = get_random_spit(album)
     fp = save_path
+    signal, sampling_rate = soundfile.read(fp)
     return sampling_rate, signal

requirements.txt CHANGED Viewed

@@ -14,5 +14,4 @@ tqdm
 librosa
 pydub
 musicdl
-audio2numpy
 spleeter

 librosa
 pydub
 musicdl
 spleeter

whisper/inference.py CHANGED Viewed

@@ -21,7 +21,7 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audln = audio.shape[0]
     ppg_a = []
     idx_s = 0
-    while (idx_s + 25 * 16000 < audln):
         short = audio[idx_s:idx_s + 25 * 16000]
         idx_s = idx_s + 25 * 16000
         ppgln = 25 * 16000 // 320
@@ -31,7 +31,7 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
             ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
             ppg = ppg[:ppgln,]  # [length, dim=1024]
             ppg_a.extend(ppg)
-    if (idx_s < audln):
         short = audio[idx_s:audln]
         ppgln = (audln - idx_s) // 320
         # short = pad_or_trim(short)
@@ -48,6 +48,7 @@ if __name__ == "__main__":
     parser.description = 'please enter embed parameter ...'
     parser.add_argument("-w", "--wav", help="wav", dest="wav")
     parser.add_argument("-p", "--ppg", help="ppg", dest="ppg")
     args = parser.parse_args()
     print(args.wav)
     print(args.ppg)

     audln = audio.shape[0]
     ppg_a = []
     idx_s = 0
+    while idx_s + 25 * 16000 < audln:
         short = audio[idx_s:idx_s + 25 * 16000]
         idx_s = idx_s + 25 * 16000
         ppgln = 25 * 16000 // 320
             ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
             ppg = ppg[:ppgln,]  # [length, dim=1024]
             ppg_a.extend(ppg)
+    if idx_s < audln:
         short = audio[idx_s:audln]
         ppgln = (audln - idx_s) // 320
         # short = pad_or_trim(short)
     parser.description = 'please enter embed parameter ...'
     parser.add_argument("-w", "--wav", help="wav", dest="wav")
     parser.add_argument("-p", "--ppg", help="ppg", dest="ppg")
     args = parser.parse_args()
     print(args.wav)
     print(args.ppg)