Update GPT_SoVITS/app_colab.py

GPT_SoVITS/app_colab.py (changed, +9 −5)
```diff
@@ -140,7 +140,7 @@ else:
 
 def change_sovits_weights(sovits_path):
     global vq_model, hps
-    dict_s2 = torch.load(sovits_path
+    dict_s2 = torch.load(sovits_path)
     hps = dict_s2["config"]
     hps = DictToAttrRecursive(hps)
     hps.model.semantic_frame_rate = "25hz"
@@ -168,7 +168,7 @@ change_sovits_weights(sovits_path)
 def change_gpt_weights(gpt_path):
     global hz, max_sec, t2s_model, config
     hz = 50
-    dict_s1 = torch.load(gpt_path
+    dict_s1 = torch.load(gpt_path)
    config = dict_s1["config"]
     max_sec = config["data"]["max_sec"]
     t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
```
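The first two hunks close an unbalanced parenthesis in each `torch.load` call; as committed before this change, the file would fail with a `SyntaxError` as soon as Python imported it. Tangential to this commit, but relevant when running the same code off Colab: a minimal, hypothetical loading helper (the name `load_state` and the device fallback are assumptions, not code from app_colab.py):

```python
import torch

def load_state(ckpt_path: str) -> dict:
    # Assumption for illustration: choose the device at load time so a
    # checkpoint saved on GPU still opens on a CPU-only machine.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.load(ckpt_path, map_location=device)
```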
```diff
@@ -426,16 +426,20 @@ def vc_main(wav_path, text, language, prompt_wav, noise_scale=0.5):
     phones, word2ph, norm_text = get_cleaned_text_final(text, language)
 
     spec = get_spepc(hps, prompt_wav)
-    codes = get_code_from_wav(wav_path)[None, None]
+    spec = spec.to(device)
+    codes = get_code_from_wav(wav_path)[None, None].to(device)  # must be 3D: [n_q, B, T]
     ge = vq_model.ref_enc(spec)  # [B, D, T/1]
     quantized = vq_model.quantizer.decode(codes)  # [B, D, T]
     if hps.model.semantic_frame_rate == "25hz":
         quantized = F.interpolate(
             quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
         )
+    lengths_tensor = torch.LongTensor([quantized.shape[-1]]).to(device)
+    phones_tensor = torch.LongTensor(phones)[None].to(device)
+    phones_lengths_tensor = torch.LongTensor([len(phones)]).to(device)
+
     _, m_p, logs_p, y_mask = vq_model.enc_p(
-        quantized,
-        torch.LongTensor(phones)[None], torch.LongTensor([len(phones)]), ge
+        quantized, lengths_tensor, phones_tensor, phones_lengths_tensor, ge
     )
     z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
     z = vq_model.flow(z_p, y_mask, g=ge, reverse=True)
```
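The third hunk is the substantive fix: `spec` and `codes` are moved onto the model's device before any forward pass, and `vq_model.enc_p` now receives an explicit frame-length tensor (`lengths_tensor`) alongside the phoneme tensor and its length, instead of only the phoneme arguments. The untouched 25 Hz branch doubles the time axis of the quantized features with nearest-neighbour interpolation; a self-contained toy of that step (all shapes invented for illustration):

```python
import torch
import torch.nn.functional as F

# Toy [B, D, T] tensor: one second of features at a 25 Hz frame rate.
quantized = torch.randn(1, 4, 25)

# The same call the diff keeps: nearest-neighbour upsampling to 50 Hz.
upsampled = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
print(upsampled.shape)  # torch.Size([1, 4, 50])
```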
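Why the added `.to(device)` calls matter: in the Colab setup the model weights live on the GPU, and feeding CPU tensors into CUDA modules raises a device-mismatch `RuntimeError` at call time, the kind of failure that surfaces as a crashed app rather than a visible traceback. A generic sketch of the input-building pattern the diff adopts (the `phones` values here are made-up phoneme IDs, not from the app):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
phones = [12, 7, 33, 5]  # hypothetical phoneme IDs, for illustration only

# Mirror the diff: build every model input on the same device as the weights.
phones_tensor = torch.LongTensor(phones)[None].to(device)           # [1, T_text]
phones_lengths_tensor = torch.LongTensor([len(phones)]).to(device)  # [1]
```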