Seia-GPT-SoVITS-v2-ProPlus

Sleeping

App Files Files Community

kemuriririn commited on Jun 25

Commit

887e50c

1 Parent(s): 590b29f

update

Browse files

Files changed (13) hide show

config.py +218 -0
inference_webui.py +2 -2
requirements.txt +2 -1
tools/AP_BWE_main/24kto48k/readme.txt +11 -0
tools/AP_BWE_main/LICENSE +21 -0
tools/AP_BWE_main/README.md +91 -0
tools/AP_BWE_main/datasets1/__init__.py +1 -0
tools/AP_BWE_main/datasets1/dataset.py +108 -0
tools/AP_BWE_main/models/__init__.py +1 -0
tools/AP_BWE_main/models/model.py +464 -0
tools/assets.py +70 -0
tools/audio_sr.py +50 -0
weight.json +1 -0

config.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import os
+import re
+import sys
+import torch
+from tools.i18n.i18n import I18nAuto
+# Translator for user-facing strings; the language comes from the "language" environment variable ("Auto" if unset).
+i18n = I18nAuto(language=os.environ.get("language", "Auto"))
+# Pretrained SoVITS checkpoint path, keyed by model version tag.
+pretrained_sovits_name = {
+    "v1": "pretrained_models/s2G488k.pth",
+    "v2": "pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
+    "v3": "pretrained_models/s2Gv3.pth",  ### v3/v4 would also need their vocoder checked; never mind...
+    "v4": "pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
+    "v2Pro": "pretrained_models/v2Pro/s2Gv2Pro.pth",
+    "v2ProPlus": "pretrained_models/v2Pro/s2Gv2ProPlus.pth",
+}
+# Pretrained GPT checkpoint path, keyed by model version tag (v3, v4, v2Pro and v2ProPlus share one file).
+pretrained_gpt_name = {
+    "v1": "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
+    "v2": "pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
+    "v3": "pretrained_models/s1v3.ckpt",
+    "v4": "pretrained_models/s1v3.ckpt",
+    "v2Pro": "pretrained_models/s1v3.ckpt",
+    "v2ProPlus": "pretrained_models/s1v3.ckpt",
+}
+# Translated display name -> SoVITS base-model path. get_weights_names() lists a display name only
+# when its file exists. The key strings read "use the vX base model directly without training";
+# the v1/v3/v4 entries are disabled here.
+name2sovits_path = {
+    # i18n("不训练直接推v1底模！"): "pretrained_models/s2G488k.pth",
+    i18n("不训练直接推v2底模！"): "pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
+    # i18n("不训练直接推v3底模！"): "pretrained_models/s2Gv3.pth",
+    # i18n("不训练直接推v4底模！"): "pretrained_models/gsv-v4-pretrained/s2Gv4.pth",
+    i18n("不训练直接推v2Pro底模！"): "pretrained_models/v2Pro/s2Gv2Pro.pth",
+    i18n("不训练直接推v2ProPlus底模！"): "pretrained_models/v2Pro/s2Gv2ProPlus.pth",
+}
+# Translated display name -> GPT base-model path, used the same way as name2sovits_path.
+name2gpt_path = {
+    # i18n("不训练直接推v1底模！"):"pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
+    i18n(
+        "不训练直接推v2底模！"
+    ): "pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
+    i18n("不训练直接推v3底模！"): "pretrained_models/s1v3.ckpt",
+}
+# Directories that get_weights_names() scans for SoVITS ".pth" files.
+SoVITS_weight_root = [
+    "SoVITS_weights",
+    "SoVITS_weights_v2",
+    "SoVITS_weights_v3",
+    "SoVITS_weights_v4",
+    "SoVITS_weights_v2Pro",
+    "SoVITS_weights_v2ProPlus",
+]
+# Directories that get_weights_names() scans for GPT ".ckpt" files.
+GPT_weight_root = [
+    "GPT_weights",
+    "GPT_weights_v2",
+    "GPT_weights_v3",
+    "GPT_weights_v4",
+    "GPT_weights_v2Pro",
+    "GPT_weights_v2ProPlus",
+]
+# Model version tag -> SoVITS weight directory name.
+SoVITS_weight_version2root = {
+    "v1": "SoVITS_weights",
+    "v2": "SoVITS_weights_v2",
+    "v3": "SoVITS_weights_v3",
+    "v4": "SoVITS_weights_v4",
+    "v2Pro": "SoVITS_weights_v2Pro",
+    "v2ProPlus": "SoVITS_weights_v2ProPlus",
+}
+# Model version tag -> GPT weight directory name.
+GPT_weight_version2root = {
+    "v1": "GPT_weights",
+    "v2": "GPT_weights_v2",
+    "v3": "GPT_weights_v3",
+    "v4": "GPT_weights_v4",
+    "v2Pro": "GPT_weights_v2Pro",
+    "v2ProPlus": "GPT_weights_v2ProPlus",
+}
+def custom_sort_key(s):
+    # 使用正则表达式提取字符串中的数字部分和非数字部分
+    parts = re.split("(\d+)", s)
+    # 将数字部分转换为整数，非数字部分保持不变
+    parts = [int(part) if part.isdigit() else part for part in parts]
+    return parts
+def get_weights_names():
+    SoVITS_names = []
+    for key in name2sovits_path:
+        if os.path.exists(name2sovits_path[key]):
+            SoVITS_names.append(key)
+    for path in SoVITS_weight_root:
+        if not os.path.exists(path):
+            continue
+        for name in os.listdir(path):
+            if name.endswith(".pth"):
+                SoVITS_names.append("%s/%s" % (path, name))
+    if not SoVITS_names:
+        SoVITS_names = [""]
+    GPT_names = []
+    for key in name2gpt_path:
+        if os.path.exists(name2gpt_path[key]):
+            GPT_names.append(key)
+    for path in GPT_weight_root:
+        if not os.path.exists(path):
+            continue
+        for name in os.listdir(path):
+            if name.endswith(".ckpt"):
+                GPT_names.append("%s/%s" % (path, name))
+    SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
+    GPT_names = sorted(GPT_names, key=custom_sort_key)
+    if not GPT_names:
+        GPT_names = [""]
+    return SoVITS_names, GPT_names
+def change_choices():
+    SoVITS_names, GPT_names = get_weights_names()
+    return {"choices": SoVITS_names, "__type__": "update"}, {
+        "choices": GPT_names,
+        "__type__": "update",
+    }
+# Model paths designated for inference (empty means none selected here)
+sovits_path = ""
+gpt_path = ""
+# Boolean switches read from environment variables; only the string "true" (any letter case) enables them.
+# NOTE: is_half is assigned again further down from the detected device dtypes, so the value
+# parsed here is only provisional.
+is_half_str = os.environ.get("is_half", "True")
+is_half = True if is_half_str.lower() == "true" else False
+is_share_str = os.environ.get("is_share", "False")
+is_share = True if is_share_str.lower() == "true" else False
+# Locations of the auxiliary pretrained models and the default (v1) base checkpoints
+cnhubert_path = "pretrained_models/chinese-hubert-base"
+bert_path = "pretrained_models/chinese-roberta-wwm-ext-large"
+pretrained_sovits_path = "pretrained_models/s2G488k.pth"
+pretrained_gpt_path = "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+exp_root = "logs"
+# Interpreter used to launch helper scripts; falls back to "python" when sys.executable is empty
+python_exec = sys.executable or "python"
+# Port numbers for the individual web UIs and the API server
+webui_port_main = 9874
+webui_port_uvr5 = 9873
+webui_port_infer_tts = 9872
+webui_port_subfix = 9871
+api_port = 9880
+# Thanks to the contribution of @Karasukaigan and @XXXXRT666
+def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
+    cpu = torch.device("cpu")
+    cuda = torch.device(f"cuda:{idx}")
+    if not torch.cuda.is_available():
+        return cpu, torch.float32, 0.0, 0.0
+    device_idx = idx
+    capability = torch.cuda.get_device_capability(device_idx)
+    name = torch.cuda.get_device_name(device_idx)
+    mem_bytes = torch.cuda.get_device_properties(device_idx).total_memory
+    mem_gb = mem_bytes / (1024**3) + 0.4
+    major, minor = capability
+    sm_version = major + minor / 10.0
+    is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
+    if mem_gb < 4 or sm_version < 5.3:
+        return cpu, torch.float32, 0.0, 0.0
+    if sm_version == 6.1 or is_16_series == True:
+        return cuda, torch.float32, sm_version, mem_gb
+    if sm_version > 6.1:
+        return cuda, torch.float16, sm_version, mem_gb
+    return cpu, torch.float32, 0.0, 0.0
+# Import-time hardware probe: fills the GPU_* tables and picks the inference device and precision.
+IS_GPU = True
+GPU_INFOS: list[str] = []  # "<index>\t<device name>" for every usable GPU
+GPU_INDEX: set[int] = set()  # indices of the usable GPUs
+GPU_COUNT = torch.cuda.device_count()
+CPU_INFO: str = "0\tCPU " + i18n("CPU训练,较慢")
+tmp: list[tuple[torch.device, torch.dtype, float, float]] = []  # one probe result per device index
+memset: set[float] = set()  # distinct mem_gb values seen across the probe results
+# Probe at least index 0 so that tmp is never empty, even without any CUDA device
+for i in range(max(GPU_COUNT, 1)):
+    tmp.append(get_device_dtype_sm(i))
+for j in tmp:
+    device = j[0]
+    memset.add(j[3])
+    if device.type != "cpu":
+        GPU_INFOS.append(f"{device.index}\t{torch.cuda.get_device_name(device.index)}")
+        GPU_INDEX.add(device.index)
+# No usable GPU: advertise a single CPU entry under index 0 instead
+if not GPU_INFOS:
+    IS_GPU = False
+    GPU_INFOS.append(CPU_INFO)
+    GPU_INDEX.add(0)
+# Inference runs on the probe result with the highest sm_version, ties broken by memory
+infer_device = max(tmp, key=lambda x: (x[2], x[3]))[0]
+# Half precision is enabled as soon as any probed device selected float16; this replaces the
+# value parsed from the "is_half" environment variable above
+is_half = any(dtype == torch.float16 for _, dtype, _, _ in tmp)
+class Config:
+    """Snapshot of this module's settings, copied onto instance attributes at construction time."""
+    def __init__(self):
+        # Each attribute mirrors the module-level variable of the same name as it is when the
+        # instance is created; later changes to the module globals are not reflected.
+        self.sovits_path = sovits_path
+        self.gpt_path = gpt_path
+        self.is_half = is_half
+        self.cnhubert_path = cnhubert_path
+        self.bert_path = bert_path
+        self.pretrained_sovits_path = pretrained_sovits_path
+        self.pretrained_gpt_path = pretrained_gpt_path
+        self.exp_root = exp_root
+        self.python_exec = python_exec
+        self.infer_device = infer_device
+        self.webui_port_main = webui_port_main
+        self.webui_port_uvr5 = webui_port_uvr5
+        self.webui_port_infer_tts = webui_port_infer_tts
+        self.webui_port_subfix = webui_port_subfix
+        self.api_port = api_port

inference_webui.py CHANGED Viewed

@@ -29,7 +29,7 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
 logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
 warnings.simplefilter(action="ignore", category=FutureWarning)
-version = model_version = os.environ.get("version", "v2")
 from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
@@ -88,7 +88,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path
 import random
-from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
 def set_seed(seed):

 logging.getLogger("multipart.multipart").setLevel(logging.ERROR)
 warnings.simplefilter(action="ignore", category=FutureWarning)
+version = model_version = os.environ.get("version", "v2ProPlus")
 from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
 import random
+from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
 def set_seed(seed):

requirements.txt CHANGED Viewed

@@ -33,4 +33,5 @@ torch==2.4
 pydantic<=2.10.6
 torchmetrics<=1.5
 fast_langdetect
-split_lang

 pydantic<=2.10.6
 torchmetrics<=1.5
 fast_langdetect
+split_lang
+peft

tools/AP_BWE_main/24kto48k/readme.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model.
+对于v3模型的推理，如果你发现生成的音频比较闷，可以尝试这个音频超分模型。
+put g_24kto48k.zip and config.json in this folder
+把g_24kto48k.zip and config.json下到这个文件夹
+download link 下载链接:
+https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link
+audio sr project page 音频超分项目主页:
+https://github.com/yxlu-0102/AP-BWE

tools/AP_BWE_main/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Ye-Xin Lu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

tools/AP_BWE_main/README.md ADDED Viewed

	@@ -0,0 +1,91 @@

+# Towards High-Quality and Efficient Speech Bandwidth Extension with Parallel Amplitude and Phase Prediction
+### Ye-Xin Lu, Yang Ai, Hui-Peng Du, Zhen-Hua Ling
+**Abstract:**
+Speech bandwidth extension (BWE) refers to widening the frequency bandwidth range of speech signals, enhancing the speech quality towards brighter and fuller.
+This paper proposes a generative adversarial network (GAN) based BWE model with parallel prediction of Amplitude and Phase spectra, named AP-BWE, which achieves both high-quality and efficient wideband speech waveform generation.
+The proposed AP-BWE generator is entirely based on convolutional neural networks (CNNs).
+It features a dual-stream architecture with mutual interaction, where the amplitude stream and the phase stream communicate with each other and respectively extend the high-frequency components from the input narrowband amplitude and phase spectra.
+To improve the naturalness of the extended speech signals, we employ a multi-period discriminator at the waveform level and design a pair of multi-resolution amplitude and phase discriminators at the spectral level, respectively.
+Experimental results demonstrate that our proposed AP-BWE achieves state-of-the-art performance in terms of speech quality for BWE tasks targeting sampling rates of both 16 kHz and 48 kHz.
+In terms of generation efficiency, due to the all-convolutional architecture and all-frame-level operations, the proposed AP-BWE can generate 48 kHz waveform samples 292.3 times faster than real-time on a single RTX 4090 GPU and 18.1 times faster than real-time on a single CPU.
+Notably, to our knowledge, AP-BWE is the first to achieve the direct extension of the high-frequency phase spectrum, which is beneficial for improving the effectiveness of existing BWE methods.
+**We provide our implementation as open source in this repository. Audio samples can be found at the [demo website](http://yxlu-0102.github.io/AP-BWE).**
+## Pre-requisites
+0. Python >= 3.9.
+0. Clone this repository.
+0. Install Python requirements. Please refer to [requirements.txt](requirements.txt).
+0. Download datasets
+    1. Download and extract the [VCTK-0.92 dataset](https://datashare.ed.ac.uk/handle/10283/3443), and move its `wav48` directory into [VCTK-Corpus-0.92](VCTK-Corpus-0.92) and rename it as `wav48_origin`.
+    1. Trim the silence of the dataset, and the trimmed files will be saved to `wav48_silence_trimmed`.
+       ```
+       cd VCTK-Corpus-0.92
+       python flac2wav.py
+       ```
+    1. Move all the trimmed training files from `wav48_silence_trimmed` to [wav48/train](wav48/train) following the indexes in [training.txt](VCTK-Corpus-0.92/training.txt), and move all the untrimmed test files from  `wav48_origin` to [wav48/test](wav48/test) following the indexes in [test.txt](VCTK-Corpus-0.92/test.txt).
+## Training
+```
+cd train
+CUDA_VISIBLE_DEVICES=0 python train_16k.py --config [config file path]
+CUDA_VISIBLE_DEVICES=0 python train_48k.py --config [config file path]
+```
+Checkpoints and copies of the configuration file are saved in the `cp_model` directory by default.<br>
+You can change the path by using the `--checkpoint_path` option.
+Here is an example:
+```
+CUDA_VISIBLE_DEVICES=0 python train_16k.py --config ../configs/config_2kto16k.json --checkpoint_path ../checkpoints/AP-BWE_2kto16k
+```
+## Inference
+```
+cd inference
+python inference_16k.py --checkpoint_file [generator checkpoint file path]
+python inference_48k.py --checkpoint_file [generator checkpoint file path]
+```
+You can download the [pretrained weights](https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link) we provide and move all the files to the `checkpoints` directory.
+<br>
+Generated wav files are saved in `generated_files` by default.
+You can change the path by adding `--output_dir` option.
+Here is an example:
+```
+python inference_16k.py --checkpoint_file ../checkpoints/2kto16k/g_2kto16k --output_dir ../generated_files/2kto16k
+```
+## Model Structure
+![model](Figures/model.png)
+## Comparison with other speech BWE methods
+### 2k/4k/8kHz to 16kHz
+<p align="center">
+<img src="Figures/table_16k.png" alt="comparison" width="90%"/>
+</p>
+### 8k/12k/16k/24kHz to 48kHz
+<p align="center">
+<img src="Figures/table_48k.png" alt="comparison" width="100%"/>
+</p>
+## Acknowledgements
+We referred to [HiFi-GAN](https://github.com/jik876/hifi-gan) and [NSPP](https://github.com/YangAi520/NSPP) to implement this.
+## Citation
+```
+@article{lu2024towards,
+  title={Towards high-quality and efficient speech bandwidth extension with parallel amplitude and phase prediction},
+  author={Lu, Ye-Xin and Ai, Yang and Du, Hui-Peng and Ling, Zhen-Hua},
+  journal={arXiv preprint arXiv:2401.06387},
+  year={2024}
+}
+@inproceedings{lu2024multi,
+  title={Multi-Stage Speech Bandwidth Extension with Flexible Sampling Rate Control},
+  author={Lu, Ye-Xin and Ai, Yang and Sheng, Zheng-Yan and Ling, Zhen-Hua},
+  booktitle={Proc. Interspeech},
+  pages={2270--2274},
+  year={2024}
+}
+```

tools/AP_BWE_main/datasets1/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

tools/AP_BWE_main/datasets1/dataset.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import os
+import random
+import torch
+import torchaudio
+import torch.utils.data
+import torchaudio.functional as aF
+def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True):
+    hann_window = torch.hann_window(win_size).to(audio.device)
+    stft_spec = torch.stft(
+        audio,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window,
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        return_complex=True,
+    )
+    log_amp = torch.log(torch.abs(stft_spec) + 1e-4)
+    pha = torch.angle(stft_spec)
+    com = torch.stack((torch.exp(log_amp) * torch.cos(pha), torch.exp(log_amp) * torch.sin(pha)), dim=-1)
+    return log_amp, pha, com
+def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True):
+    amp = torch.exp(log_amp)
+    com = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha))
+    hann_window = torch.hann_window(win_size).to(com.device)
+    audio = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center)
+    return audio
+def get_dataset_filelist(a):
+    with open(a.input_training_file, "r", encoding="utf-8") as fi:
+        training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
+    with open(a.input_validation_file, "r", encoding="utf-8") as fi:
+        validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]
+    return training_indexes, validation_indexes
+class Dataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        training_indexes,
+        wavs_dir,
+        segment_size,
+        hr_sampling_rate,
+        lr_sampling_rate,
+        split=True,
+        shuffle=True,
+        n_cache_reuse=1,
+        device=None,
+    ):
+        self.audio_indexes = training_indexes
+        random.seed(1234)
+        if shuffle:
+            random.shuffle(self.audio_indexes)
+        self.wavs_dir = wavs_dir
+        self.segment_size = segment_size
+        self.hr_sampling_rate = hr_sampling_rate
+        self.lr_sampling_rate = lr_sampling_rate
+        self.split = split
+        self.cached_wav = None
+        self.n_cache_reuse = n_cache_reuse
+        self._cache_ref_count = 0
+        self.device = device
+    def __getitem__(self, index):
+        filename = self.audio_indexes[index]
+        if self._cache_ref_count == 0:
+            audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + ".wav"))
+            self.cached_wav = audio
+            self._cache_ref_count = self.n_cache_reuse
+        else:
+            audio = self.cached_wav
+            self._cache_ref_count -= 1
+        if orig_sampling_rate == self.hr_sampling_rate:
+            audio_hr = audio
+        else:
+            audio_hr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.hr_sampling_rate)
+        audio_lr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.lr_sampling_rate)
+        audio_lr = aF.resample(audio_lr, orig_freq=self.lr_sampling_rate, new_freq=self.hr_sampling_rate)
+        audio_lr = audio_lr[:, : audio_hr.size(1)]
+        if self.split:
+            if audio_hr.size(1) >= self.segment_size:
+                max_audio_start = audio_hr.size(1) - self.segment_size
+                audio_start = random.randint(0, max_audio_start)
+                audio_hr = audio_hr[:, audio_start : audio_start + self.segment_size]
+                audio_lr = audio_lr[:, audio_start : audio_start + self.segment_size]
+            else:
+                audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), "constant")
+                audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), "constant")
+        return (audio_hr.squeeze(), audio_lr.squeeze())
+    def __len__(self):
+        return len(self.audio_indexes)

tools/AP_BWE_main/models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

tools/AP_BWE_main/models/model.py ADDED Viewed

	@@ -0,0 +1,464 @@

+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn.utils import weight_norm, spectral_norm
+# from utils import init_weights, get_padding
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+import numpy as np
+from typing import Tuple, List
+LRELU_SLOPE = 0.1
+class ConvNeXtBlock(nn.Module):
+    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
+            Defaults to None.
+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
+            None means non-conditional LayerNorm. Defaults to None.
+    """
+    def __init__(
+        self,
+        dim: int,
+        layer_scale_init_value=None,
+        adanorm_num_embeddings=None,
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+        self.adanorm = adanorm_num_embeddings is not None
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, dim * 3)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(dim * 3, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value > 0
+            else None
+        )
+    def forward(self, x, cond_embedding_id=None):
+        residual = x
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
+        if self.adanorm:
+            assert cond_embedding_id is not None
+            x = self.norm(x, cond_embedding_id)
+        else:
+            x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)
+        x = residual + x
+        return x
+class APNet_BWE_Model(torch.nn.Module):
+    """Two-stream generator that predicts a wideband log-amplitude and phase spectrum.
+    An amplitude stream and a phase stream, each a stack of ConvNeXt blocks, exchange their
+    features before every block. ``h`` must provide ``n_fft``, ``ConvNeXt_channels`` and
+    ``ConvNeXt_layers``.
+    """
+    def __init__(self, h):
+        super(APNet_BWE_Model, self).__init__()
+        self.h = h
+        self.adanorm_num_embeddings = None
+        # Every ConvNeXt block starts with a layer scale of 1 / number of layers
+        layer_scale_init_value = 1 / h.ConvNeXt_layers
+        self.conv_pre_mag = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1))
+        self.norm_pre_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
+        self.conv_pre_pha = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1))
+        self.norm_pre_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
+        self.convnext_mag = nn.ModuleList(
+            [
+                ConvNeXtBlock(
+                    dim=h.ConvNeXt_channels,
+                    layer_scale_init_value=layer_scale_init_value,
+                    adanorm_num_embeddings=self.adanorm_num_embeddings,
+                )
+                for _ in range(h.ConvNeXt_layers)
+            ]
+        )
+        self.convnext_pha = nn.ModuleList(
+            [
+                ConvNeXtBlock(
+                    dim=h.ConvNeXt_channels,
+                    layer_scale_init_value=layer_scale_init_value,
+                    adanorm_num_embeddings=self.adanorm_num_embeddings,
+                )
+                for _ in range(h.ConvNeXt_layers)
+            ]
+        )
+        self.norm_post_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
+        self.norm_post_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
+        # Re-initialises the Conv1d/Linear modules registered so far. The three output
+        # projections below are created afterwards and therefore keep PyTorch's default init.
+        self.apply(self._init_weights)
+        self.linear_post_mag = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1)
+        self.linear_post_pha_r = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1)
+        self.linear_post_pha_i = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1)
+    def _init_weights(self, m):
+        # Truncated-normal weights (std 0.02) and zero bias for Conv1d and Linear modules
+        if isinstance(m, (nn.Conv1d, nn.Linear)):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            nn.init.constant_(m.bias, 0)
+    def forward(self, mag_nb, pha_nb):
+        """Map narrowband spectra of shape (B, n_fft // 2 + 1, T) to wideband ones.
+        Returns ``(mag_wb, pha_wb, com_wb)``; ``com_wb`` stacks the real and imaginary parts of
+        the predicted spectrum on a new last axis.
+        """
+        x_mag = self.conv_pre_mag(mag_nb)
+        x_pha = self.conv_pre_pha(pha_nb)
+        # LayerNorm normalises the last axis, hence the transposes around it
+        x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2)
+        x_pha = self.norm_pre_pha(x_pha.transpose(1, 2)).transpose(1, 2)
+        for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha):
+            # Stream interaction; note that x_pha receives the already updated x_mag
+            x_mag = x_mag + x_pha
+            x_pha = x_pha + x_mag
+            x_mag = conv_block_mag(x_mag, cond_embedding_id=None)
+            x_pha = conv_block_pha(x_pha, cond_embedding_id=None)
+        x_mag = self.norm_post_mag(x_mag.transpose(1, 2))
+        # The amplitude stream predicts a residual that is added to the narrowband input
+        mag_wb = mag_nb + self.linear_post_mag(x_mag).transpose(1, 2)
+        x_pha = self.norm_post_pha(x_pha.transpose(1, 2))
+        # The phase is obtained as atan2 of two separate linear projections
+        x_pha_r = self.linear_post_pha_r(x_pha)
+        x_pha_i = self.linear_post_pha_i(x_pha)
+        pha_wb = torch.atan2(x_pha_i, x_pha_r).transpose(1, 2)
+        com_wb = torch.stack((torch.exp(mag_wb) * torch.cos(pha_wb), torch.exp(mag_wb) * torch.sin(pha_wb)), dim=-1)
+        return mag_wb, pha_wb, com_wb
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+                norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
+            ]
+        )
+        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+    def forward(self, x):
+        fmap = []
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for i, l in enumerate(self.convs):
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            if i > 0:
+                fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(MultiPeriodDiscriminator, self).__init__()
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorP(2),
+                DiscriminatorP(3),
+                DiscriminatorP(5),
+                DiscriminatorP(7),
+                DiscriminatorP(11),
+            ]
+        )
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class MultiResolutionAmplitudeDiscriminator(nn.Module):
+    def __init__(
+        self,
+        resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
+        num_embeddings: int = None,
+    ):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorAR(resolution=r, num_embeddings=num_embeddings) for r in resolutions]
+        )
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for d in self.discriminators:
+            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
+            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorAR(nn.Module):
+    """Discriminator that scores the amplitude spectrogram of a waveform at one STFT resolution.
+    ``resolution`` is ``(n_fft, hop_length, win_length)``. When ``num_embeddings`` is given, an
+    embedding table (zero-initialised) is created for conditioning the score.
+    """
+    def __init__(
+        self,
+        resolution: Tuple[int, int, int],
+        channels: int = 64,
+        in_channels: int = 1,
+        num_embeddings: int = None,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
+            ]
+        )
+        # self.emb only exists when num_embeddings is given; passing cond_embedding_id to
+        # forward() without it raises AttributeError
+        if num_embeddings is not None:
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
+            torch.nn.init.zeros_(self.emb.weight)
+        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))
+    def forward(
+        self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """Return ``(scores, fmap)``: the flattened score map and the list of intermediate feature maps."""
+        fmap = []
+        # Drop the channel axis (if it has size 1) before the STFT, then add one back for Conv2d
+        x = x.squeeze(1)
+        x = self.spectrogram(x)
+        x = x.unsqueeze(1)
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)
+            # Channel-wise product with the embedding, summed over channels
+            # (the view to (1, C, 1, 1) presumably expects a single embedding id; TODO confirm)
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        # In-place add: the tensor just appended to fmap is the same object, so the last
+        # feature map also receives the conditioning term
+        x += h
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+    def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the magnitude of the STFT of ``x`` at this discriminator's resolution."""
+        n_fft, hop_length, win_length = self.resolution
+        amplitude_spectrogram = torch.stft(
+            x,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=None,  # interestingly rectangular window kind of works here
+            center=True,
+            return_complex=True,
+        ).abs()
+        return amplitude_spectrogram
+class MultiResolutionPhaseDiscriminator(nn.Module):
+    def __init__(
+        self,
+        resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
+        num_embeddings: int = None,
+    ):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorPR(resolution=r, num_embeddings=num_embeddings) for r in resolutions]
+        )
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for d in self.discriminators:
+            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
+            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorPR(nn.Module):
+    def __init__(
+        self,
+        resolution: Tuple[int, int, int],
+        channels: int = 64,
+        in_channels: int = 1,
+        num_embeddings: int = None,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
+                weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
+            ]
+        )
+        if num_embeddings is not None:
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
+            torch.nn.init.zeros_(self.emb.weight)
+        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))
+    def forward(
+        self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        fmap = []
+        x = x.squeeze(1)
+        x = self.spectrogram(x)
+        x = x.unsqueeze(1)
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        x += h
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+    def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
+        n_fft, hop_length, win_length = self.resolution
+        phase_spectrogram = torch.stft(
+            x,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=None,  # interestingly rectangular window kind of works here
+            center=True,
+            return_complex=True,
+        ).angle()
+        return phase_spectrogram
+def feature_loss(fmap_r, fmap_g):
+    loss = 0
+    for dr, dg in zip(fmap_r, fmap_g):
+        for rl, gl in zip(dr, dg):
+            loss += torch.mean(torch.abs(rl - gl))
+    return loss
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    loss = 0
+    r_losses = []
+    g_losses = []
+    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+        r_loss = torch.mean(torch.clamp(1 - dr, min=0))
+        g_loss = torch.mean(torch.clamp(1 + dg, min=0))
+        loss += r_loss + g_loss
+        r_losses.append(r_loss.item())
+        g_losses.append(g_loss.item())
+    return loss, r_losses, g_losses
+def generator_loss(disc_outputs):
+    loss = 0
+    gen_losses = []
+    for dg in disc_outputs:
+        l = torch.mean(torch.clamp(1 - dg, min=0))
+        gen_losses.append(l)
+        loss += l
+    return loss, gen_losses
+def phase_losses(phase_r, phase_g):
+    ip_loss = torch.mean(anti_wrapping_function(phase_r - phase_g))
+    gd_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=1) - torch.diff(phase_g, dim=1)))
+    iaf_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=2) - torch.diff(phase_g, dim=2)))
+    return ip_loss, gd_loss, iaf_loss
+def anti_wrapping_function(x):
+    return torch.abs(x - torch.round(x / (2 * np.pi)) * 2 * np.pi)
+def stft_mag(audio, n_fft=2048, hop_length=512):
+    hann_window = torch.hann_window(n_fft).to(audio.device)
+    stft_spec = torch.stft(audio, n_fft, hop_length, window=hann_window, return_complex=True)
+    stft_mag = torch.abs(stft_spec)
+    return stft_mag
+def cal_snr(pred, target):
+    snr = (20 * torch.log10(torch.norm(target, dim=-1) / torch.norm(pred - target, dim=-1).clamp(min=1e-8))).mean()
+    return snr
+def cal_lsd(pred, target):
+    sp = torch.log10(stft_mag(pred).square().clamp(1e-8))
+    st = torch.log10(stft_mag(target).square().clamp(1e-8))
+    return (sp - st).square().mean(dim=1).sqrt().mean()

tools/assets.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# JavaScript snippet defining deleteTheme(): when the page URL carries a
+# `__theme` query parameter, it removes that parameter and navigates to the
+# resulting URL with window.location.replace.
+js = """
+function deleteTheme() {
+const params = new URLSearchParams(window.location.search);
+if (params.has('__theme')) {
+    params.delete('__theme');
+    const newUrl = `${window.location.pathname}?${params.toString()}`;
+    window.location.replace(newUrl);
+}
+}
+"""
+# Stylesheet string: pads `.markdown` blocks and colours them according to the
+# light/dark colour scheme, sets the text-selection highlight colour, and gives
+# the footer a fixed height while hiding all of its child elements.
+css = """
+/* CSSStyleRule */
+.markdown {
+    padding: 6px 10px;
+}
+@media (prefers-color-scheme: light) {
+    .markdown {
+        background-color: lightblue;
+        color: #000;
+    }
+}
+@media (prefers-color-scheme: dark) {
+    .markdown {
+        background-color: #4b4b4b;
+        color: rgb(244, 244, 245);
+    }
+}
+::selection {
+    background: #ffc078 !important;
+}
+footer {
+    height: 50px !important;           /* 设置页脚高度 */
+    background-color: transparent !important; /* 背景透明 */
+    display: flex;
+    justify-content: center;           /* 居中对齐 */
+    align-items: center;               /* 垂直居中 */
+}
+footer * {
+    display: none !important;          /* 隐藏所有子元素 */
+}
+"""
+# Centered HTML header. The `{}` inside the first inner <div> is left unfilled
+# here (presumably substituted by the caller, e.g. via str.format -- confirm).
+# Below it sits a row of badge links: GitHub repository, Chinese docs, English
+# docs and the licence.
+# NOTE(review): the Chinese and English documentation badges link to the same URL.
+top_html = """
+<div align="center">
+    <div style="margin-bottom: 5px; font-size: 15px;">{}</div>
+    <div style="display: flex; gap: 80px; justify-content: center;">
+        <a href="https://github.com/RVC-Boss/GPT-SoVITS" target="_blank">
+            <img src="https://img.shields.io/badge/GitHub-GPT--SoVITS-blue.svg?style=for-the-badge&logo=github" style="width: auto; height: 30px;">
+        </a>
+        <a href="https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e" target="_blank">
+            <img src="https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white" style="width: auto; height: 30px;">
+        </a>
+        <a href="https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e" target="_blank">
+            <img src="https://img.shields.io/badge/English-READ%20DOCS-blue?style=for-the-badge&logo=googledocs&logoColor=white" style="width: auto; height: 30px;">
+        </a>
+        <a href="https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE" target="_blank">
+            <img src="https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative" style="width: auto; height: 30px;">
+        </a>
+    </div>
+</div>
+"""

tools/audio_sr.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from __future__ import absolute_import, division, print_function, unicode_literals
+import sys
+import os
+AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
+sys.path.append(AP_BWE_main_dir_path)
+import json
+import torch
+import torchaudio.functional as aF
+# from attrdict import AttrDict####will be bug in py3.10
+from datasets1.dataset import amp_pha_stft, amp_pha_istft
+from models.model import APNet_BWE_Model
+class AP_BWE:
+    def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
+        if checkpoint_file == None:
+            checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path)
+            if os.path.exists(checkpoint_file) == False:
+                raise FileNotFoundError
+        config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
+        with open(config_file) as f:
+            data = f.read()
+        json_config = json.loads(data)
+        # h = AttrDict(json_config)
+        h = DictToAttrRecursive(json_config)
+        model = APNet_BWE_Model(h).to(device)
+        state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False)
+        model.load_state_dict(state_dict["generator"])
+        model.eval()
+        self.device = device
+        self.model = model
+        self.h = h
+    def to(self, *arg, **kwargs):
+        self.model.to(*arg, **kwargs)
+        self.device = self.model.conv_pre_mag.weight.device
+        return self
+    def __call__(self, audio, orig_sampling_rate):
+        with torch.no_grad():
+            # audio, orig_sampling_rate = torchaudio.load(inp_path)
+            # audio = audio.to(self.device)
+            audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate)
+            amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size)
+            amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb)
+            audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size)
+            # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16')
+            return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate

weight.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"GPT": {}, "SoVITS": {}}