import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import logging

import gradio as gr
import librosa
import numpy as np
import soundfile
import torch
import torchcrepe
from omegaconf import OmegaConf

from vits.models import SynthesizerInfer

# Silence chatty third-party loggers.
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

def load_svc_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    saved_state_dict = checkpoint_dict["model_g"]
    state_dict = model.state_dict()
    # Copy exactly the parameters the freshly built model expects; extra keys
    # in the checkpoint (e.g. optimizer or discriminator state) are ignored.
    new_state_dict = {}
    for k, v in state_dict.items():
        new_state_dict[k] = saved_state_dict[k]
    model.load_state_dict(new_state_dict)
    return model
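
# Note: the loop above raises KeyError if the checkpoint lacks any parameter
# the model defines. A more forgiving loader (a sketch, not part of the
# original script) could fall back to the model's own initialization:
#
#     for k, v in state_dict.items():
#         new_state_dict[k] = saved_state_dict.get(k, v)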


def compute_f0_nn(filename, device):
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    # Shape (1, samples), as torchcrepe expects.
    audio = torch.tensor(np.copy(audio))[None]
    # 320 samples at 16 kHz is a 20 ms hop.
    hop_length = 320
    # A sensible pitch range for speech and singing.
    fmin = 50
    fmax = 1000
    # Model capacity: "tiny" or "full".
    model = "full"
    # Pick a batch size that does not exhaust GPU memory.
    batch_size = 512
    pitch, periodicity = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=True,
    )
    # Duplicate each frame along time so the F0 track matches the doubled
    # frame rate of the ppg/vec features. Stay in torch here: np.repeat would
    # return an ndarray, which the torchcrepe filters below do not accept.
    pitch = torch.repeat_interleave(pitch, 2, dim=-1)
    periodicity = torch.repeat_interleave(periodicity, 2, dim=-1)
    # CREPE is unreliable on silence: smooth both tracks, then zero out frames
    # with low periodicity (without this mask the periodicity track would be
    # computed but never used).
    periodicity = torchcrepe.filter.median(periodicity, 9)
    pitch = torchcrepe.filter.mean(pitch, 3)
    pitch[periodicity < 0.1] = 0
    pitch = pitch.squeeze(0)
    return pitch
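
# Rough frame math: a 1 s clip at 16 kHz yields 16000 / 320 = 50 CREPE frames,
# doubled above to 100 frames, which is the same frame rate svc_change()
# produces for the whisper/hubert features via np.repeat(..., 2, 0).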


# Build the synthesizer once at startup and load the pretrained weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hp = OmegaConf.load("configs/base.yaml")
model = SynthesizerInfer(
    hp.data.filter_length // 2 + 1,
    hp.data.segment_size // hp.data.hop_length,
    hp)
load_svc_model("vits_pretrain/sovits5.0_bigvgan_mix.pth", model)
model.eval()
model.to(device)
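
# The two positional arguments passed to SynthesizerInfer above are,
# presumably, the linear-spectrogram bin count (filter_length // 2 + 1) and
# the training segment length in frames (segment_size // hop_length), as is
# conventional for VITS-style models.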


def svc_change(argswave, argsspk):
    # Extract content features by shelling out to the project's whisper (PPG)
    # and hubert (vec) inference scripts; both write .npy files to disk.
    argsppg = "svc_tmp.ppg.npy"
    os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
    argsvec = "svc_tmp.vec.npy"
    os.system(f"python hubert/inference.py -w {argswave} -v {argsvec}")

    # Speaker embedding of the target singer.
    spk = np.load(argsspk)
    spk = torch.FloatTensor(spk)

    ppg = np.load(argsppg)
    ppg = np.repeat(ppg, 2, 0)  # match the doubled F0 frame rate
    ppg = torch.FloatTensor(ppg)

    vec = np.load(argsvec)
    vec = np.repeat(vec, 2, 0)  # match the doubled F0 frame rate
    vec = torch.FloatTensor(vec)

    pit = compute_f0_nn(argswave, device)
    pit = torch.FloatTensor(pit)

    # Trim the three feature streams to a common length.
    len_pit = pit.size()[0]
    len_vec = vec.size()[0]
    len_ppg = ppg.size()[0]
    len_min = min(len_pit, len_vec)
    len_min = min(len_min, len_ppg)
    pit = pit[:len_min]
    vec = vec[:len_min, :]
    ppg = ppg[:len_min, :]

    with torch.no_grad():
        spk = spk.unsqueeze(0).to(device)
        source = pit.unsqueeze(0).to(device)
        source = model.pitch2source(source)

        # Synthesize in chunks of out_chunk frames with hop_frame frames of
        # context on each side; the context is trimmed from each chunk's
        # output so the stitched waveform lines up sample-accurately.
        hop_size = hp.data.hop_length
        all_frame = len_min
        hop_frame = 10
        out_chunk = 2500
        out_index = 0
        out_audio = []
        has_audio = False

        while out_index + out_chunk < all_frame:
            has_audio = True
            if out_index == 0:
                cut_s = out_index
                cut_s_out = 0
            else:
                cut_s = out_index - hop_frame
                cut_s_out = hop_frame * hop_size

            if out_index + out_chunk + hop_frame > all_frame:
                cut_e = out_index + out_chunk
                # Slice to the end: a literal 0 here would make the slice
                # below empty and silently drop this chunk's audio.
                cut_e_out = None
            else:
                cut_e = out_index + out_chunk + hop_frame
                cut_e_out = -1 * hop_frame * hop_size

            sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
            sub_vec = vec[cut_s:cut_e, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:cut_e * hop_size].to(device)
            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()

            sub_out = sub_out[cut_s_out:cut_e_out]
            out_audio.extend(sub_out)
            out_index = out_index + out_chunk

        # Handle the remainder that did not fill a whole chunk.
        if out_index < all_frame:
            if has_audio:
                cut_s = out_index - hop_frame
                cut_s_out = hop_frame * hop_size
            else:
                cut_s = 0
                cut_s_out = 0
            sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
            sub_vec = vec[cut_s:, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:].to(device)
            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()

            sub_out = sub_out[cut_s_out:]
            out_audio.extend(sub_out)

    out_audio = np.asarray(out_audio)
    return out_audio
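
# Hypothetical standalone usage (file names assumed, not part of the script):
#
#     wav = svc_change("some_input_16k.wav", "configs/singers/singer0047.npy")
#     soundfile.write("some_output.wav", wav, 32000)
#
# 32000 Hz is the synthesizer's output rate, matching the (sample_rate, data)
# tuple that svc_main() below returns to Gradio.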


def svc_main(sid, input_audio):
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    # Gradio delivers integer PCM; normalize to float32 in [-1, 1].
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    # Cap the input at 100 seconds to bound inference time.
    if len(audio) > 16000 * 100:
        audio = audio[:16000 * 100]
    wav_path = "temp.wav"
    soundfile.write(wav_path, audio, 16000, format="wav")
    out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
    return "Success", (32000, out_audio)


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("sovits 5.0"):
            gr.Markdown(value="""
                Based on the open-source dataset: Multi-Singer

                https://github.com/Multi-Singer/Multi-Singer.github.io

                Final release:

                1. mix_encoder: whisper + hubert, enabling cross-language
                conversion and training on plain spoken-dialogue data

                2. F0 artifacts fixed
                """)
            sid = gr.Dropdown(label="Voice", choices=["22", "33", "47", "51"], value="47")
            vc_input3 = gr.Audio(label="Upload audio")
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Status")
            vc_output2 = gr.Audio(label="Converted audio")
            vc_submit.click(svc_main, [sid, vc_input3], [vc_output1, vc_output2])
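
# launch() serves the UI on a local port; Gradio can also create a temporary
# public link via app.launch(share=True), which is not enabled here.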

app.launch()