kevinwang676 committed on
Commit b8292ba · verified · 1 Parent(s): c2d2c2b

Delete GPT-SoVITS-models

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. GPT-SoVITS-models/.gitattributes +0 -44
  2. GPT-SoVITS-models/GPT-SoVITS/.ipynb_checkpoints/del-checkpoint.sh +0 -12
  3. GPT-SoVITS-models/GPT-SoVITS/.ipynb_checkpoints/webui-checkpoint.py +0 -719
  4. GPT-SoVITS-models/GPT-SoVITS/.ipynb_checkpoints/启动webui-checkpoint.sh +0 -2
  5. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/.ipynb_checkpoints/inference_webui-checkpoint.py +0 -270
  6. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/__init__.py +0 -0
  7. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/__init__.py +0 -0
  8. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/bucket_sampler.py +0 -157
  9. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/data_module.py +0 -66
  10. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/dataset.py +0 -302
  11. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/__init__.py +0 -0
  12. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/BEATs.py +0 -179
  13. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/README.md +0 -127
  14. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/Tokenizers.py +0 -172
  15. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/__init__.py +0 -2
  16. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/backbone.py +0 -791
  17. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/config.py +0 -19
  18. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/modules.py +0 -220
  19. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/ontology.json +0 -0
  20. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/quantizer.py +0 -235
  21. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_beats_librilight.py +0 -321
  22. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_phones.py +0 -232
  23. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_phones_librilight.py +0 -198
  24. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_txt_librilight.py +0 -255
  25. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/split_train_val.py +0 -35
  26. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/t2s.py +0 -197
  27. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/test.py +0 -139
  28. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/text.txt +0 -10
  29. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/train.py +0 -103
  30. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/train_librilight_6k.py +0 -170
  31. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/__init__.py +0 -0
  32. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/t2s_lightning_module.py +0 -128
  33. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/t2s_model.py +0 -298
  34. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/utils.py +0 -164
  35. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/__init__.py +0 -0
  36. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/activation.py +0 -397
  37. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/embedding.py +0 -78
  38. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/lr_schedulers.py +0 -85
  39. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/optim.py +0 -622
  40. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/patched_mha_with_cache.py +0 -388
  41. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/scaling.py +0 -319
  42. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/transformer.py +0 -347
  43. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/text_processing/__init__.py +0 -0
  44. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/text_processing/phonemizer.py +0 -80
  45. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/text_processing/symbols.py +0 -9
  46. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/utils/__init__.py +0 -37
  47. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/utils/initialize.py +0 -38
  48. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/utils/io.py +0 -32
  49. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/configs/s1.yaml +0 -31
  50. GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/configs/s1big.yaml +0 -31
GPT-SoVITS-models/.gitattributes DELETED
@@ -1,44 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/2bbf387613664982acc3847e4b4970fc6bf09120/audio.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/4e25df8e5470697bd435cc94559e1c34f09bab16/audio.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/6b579cccde8715941d9b9b06a1b9787ce0fdb4db/audio.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/873c1f03462a87c00222fd2422a8b328244f45da/audio.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/d2c38e2d7f131cfc51fe07c541177b0f5a061cc3/audio.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/e6f05e0d768171ac3b7355d968cb1badf9d84864/wyxy_101-0-100.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/gradio/e6f05e0d768171ac3b7355d968cb1badf9d84864/wyxy_101.wav filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/TEMP/jieba.cache filter=lfs diff=lfs merge=lfs -text
- GPT-SoVITS/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav filter=lfs diff=lfs merge=lfs -text
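Note: the entries deleted above are standard Git LFS rules; each pattern routes matching files through the lfs filter instead of plain Git. As a rough illustration only (not part of this repository, and fnmatch only approximates real .gitattributes matching), one could check which paths such rules would send to LFS:

from fnmatch import fnmatch

# A hypothetical subset of the deleted patterns (basename-style globs only).
LFS_PATTERNS = ["*.ckpt", "*.pth", "*.bin", "*.safetensors", "*.zip", "*tfevents*"]

def routed_to_lfs(path: str) -> bool:
    # Rough check: does any LFS pattern match the file's basename?
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch(name, pat) for pat in LFS_PATTERNS)

for p in ["GPT_SoVITS/pretrained_models/s2G488k.pth", "webui.py"]:
    print(p, "->", "LFS" if routed_to_lfs(p) else "plain git")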
 
GPT-SoVITS-models/GPT-SoVITS/.ipynb_checkpoints/del-checkpoint.sh DELETED
@@ -1,12 +0,0 @@
- #!/bin/bash
- cd /root/autodl-tmp/workdir/GPT-SoVITS
- rm -rf GPT_weights/*
- rm -rf SoVITS_weights/*
-
- rm -rf input/*
- rm -rf output/asr_opt/*
- rm -rf output/slicer_opt/*
- rm -rf output/uvr5_opt/*
- rm -rf logs/*
-
- echo 初始化完成
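Note: del.sh above is a hard reset that empties the weight, input, output, and log directories under the work directory. A hedged Python sketch of the same cleanup (paths copied from the script; not part of the repository) that skips missing directories instead of relying on rm -rf:

import shutil
from pathlib import Path

# Directories emptied by the deleted del.sh (taken from the script above).
BASE = Path("/root/autodl-tmp/workdir/GPT-SoVITS")
TARGETS = ["GPT_weights", "SoVITS_weights", "input",
           "output/asr_opt", "output/slicer_opt", "output/uvr5_opt", "logs"]

def reset_workdir(base: Path = BASE) -> None:
    # Delete the contents of each target directory but keep the directory itself.
    for rel in TARGETS:
        d = base / rel
        if not d.is_dir():
            continue
        for child in d.iterdir():
            if child.is_dir():
                shutil.rmtree(child)
            else:
                child.unlink()
    print("初始化完成 (reset complete)")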
 
GPT-SoVITS-models/GPT-SoVITS/.ipynb_checkpoints/webui-checkpoint.py DELETED
@@ -1,719 +0,0 @@
1
- import json,yaml,warnings,torch
2
- warnings.filterwarnings("ignore")
3
- torch.manual_seed(233333)
4
- import os,pdb,sys
5
- now_dir = os.getcwd()
6
- tmp = os.path.join(now_dir, "TEMP")
7
- os.makedirs(tmp, exist_ok=True)
8
- os.environ["TEMP"] = tmp
9
- import site
10
- site_packages_root="%s/root/miniconda3/lib/python3.10/site-packages"%now_dir
11
- for path in site.getsitepackages():
12
- if("site-packages"in path):site_packages_root=path
13
- os.environ["OPENBLAS_NUM_THREADS"] = "4"
14
- os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
15
- with open("%s/users.pth"%(site_packages_root),"w")as f:
16
- f.write("%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"%(now_dir,now_dir,now_dir,now_dir,now_dir))
17
- import traceback
18
- sys.path.append(now_dir)
19
- import shutil
20
- import pdb
21
- import gradio as gr
22
- from subprocess import Popen
23
- import signal
24
- from config import python_exec,infer_device,is_half,exp_root
25
- from i18n.i18n import I18nAuto
26
- i18n = I18nAuto()
27
- from scipy.io import wavfile
28
- from tools.my_utils import load_audio
29
- from multiprocessing import cpu_count
30
- n_cpu=cpu_count()
31
-
32
- # 判断是否有能用来训练和加速推理的N卡
33
- ngpu = torch.cuda.device_count()
34
- gpu_infos = []
35
- mem = []
36
- if_gpu_ok = False
37
-
38
- if torch.cuda.is_available() or ngpu != 0:
39
- for i in range(ngpu):
40
- gpu_name = torch.cuda.get_device_name(i)
41
- if any(value in gpu_name.upper()for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L"]):
42
- # A10#A100#V100#A40#P40#M40#K80#A4500
43
- if_gpu_ok = True # 至少有一张能用的N卡
44
- gpu_infos.append("%s\t%s" % (i, gpu_name))
45
- mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
46
- if if_gpu_ok and len(gpu_infos) > 0:
47
- gpu_info = "\n".join(gpu_infos)
48
- default_batch_size = min(mem) // 2
49
- else:
50
- gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
51
- default_batch_size = 1
52
- gpus = "-".join([i[0] for i in gpu_infos])
53
-
54
- pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
55
- pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
56
- def get_weights_names():
57
- SoVITS_names = [pretrained_sovits_name]
58
- for name in os.listdir(SoVITS_weight_root):
59
- if name.endswith(".pth"):SoVITS_names.append(name)
60
- GPT_names = [pretrained_gpt_name]
61
- for name in os.listdir(GPT_weight_root):
62
- if name.endswith(".ckpt"): GPT_names.append(name)
63
- return SoVITS_names,GPT_names
64
- SoVITS_weight_root="SoVITS_weights"
65
- GPT_weight_root="GPT_weights"
66
- SoVITS_names,GPT_names = get_weights_names()
67
-
68
- def change_choices():
69
- SoVITS_names, GPT_names = get_weights_names()
70
- return {"choices": sorted(SoVITS_names), "__type__": "update"}, {"choices": sorted(GPT_names), "__type__": "update"}
71
-
72
- p_label=None
73
- p_uvr5=None
74
- p_asr=None
75
- p_tts_inference=None
76
-
77
- def kill_process(pid):
78
- os.system("taskkill /t /f /pid %s" % pid) # todo:识别linux用kill -9
79
- # os.kill(p_label.pid,19)#主进程#控制台进程#python子进程###不好使,连主进程的webui一起关了,辣鸡
80
-
81
- def change_label(if_label,path_list):
82
- global p_label
83
- if(if_label==True and p_label==None):
84
- cmd = '"%s" tools/subfix_webui.py --load_list "%s"'%(python_exec,path_list)
85
- yield "打标工具WebUI已开启"
86
- print(cmd)
87
- p_label = Popen(cmd, shell=True)
88
- elif(if_label==False and p_label!=None):
89
- kill_process(p_label.pid)
90
- p_label=None
91
- yield "打标工具WebUI已关闭"
92
-
93
- def change_uvr5(if_uvr5):
94
- global p_uvr5
95
- if(if_uvr5==True and p_uvr5==None):
96
- cmd = '"%s" tools/uvr5/webui.py "%s" %s'%(python_exec,infer_device,is_half)
97
- yield "UVR5已开启"
98
- print(cmd)
99
- p_uvr5 = Popen(cmd, shell=True)
100
- elif(if_uvr5==False and p_uvr5!=None):
101
- kill_process(p_uvr5.pid)
102
- p_uvr5=None
103
- yield "UVR5已关闭"
104
-
105
- def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
106
- global p_tts_inference
107
- if(if_tts==True and p_tts_inference==None):
108
- os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
109
- os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
110
- os.environ["cnhubert_base_path"]=cnhubert_base_path
111
- os.environ["bert_path"]=bert_path
112
- os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
113
- os.environ["is_half"]=str(is_half)
114
- cmd = '"%s" GPT_SoVITS/inference_webui.py'%(python_exec)
115
- yield "TTS推理进程已开启"
116
- print(cmd)
117
- p_tts_inference = Popen(cmd, shell=True)
118
- elif(if_tts==False and p_tts_inference!=None):
119
- kill_process(p_tts_inference.pid)
120
- p_tts_inference=None
121
- yield "TTS推理进程已关闭"
122
-
123
-
124
- def open_asr(asr_inp_dir):
125
- global p_asr
126
- if(p_asr==None):
127
- cmd = '"%s" tools/damo_asr/cmd-asr.py "%s"'%(python_exec,asr_inp_dir)
128
- yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
129
- print(cmd)
130
- p_asr = Popen(cmd, shell=True)
131
- p_asr.wait()
132
- p_asr=None
133
- yield "ASR任务完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
134
- else:
135
- yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
136
-
137
- def close_asr():
138
- global p_asr
139
- if(p_asr!=None):
140
- kill_process(p_asr.pid)
141
- p_asr=None
142
- return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
143
-
144
- '''
145
- button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Bb,button1Ba_open,button1Ba_close])
146
- button1Ba_close.click(close1Ba, [], [info1Bb,button1Ba_open,button1Ba_close])
147
- '''
148
- p_train_SoVITS=None
149
- def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
150
- global p_train_SoVITS
151
- if(p_train_SoVITS==None):
152
- with open("GPT_SoVITS/configs/s2.json")as f:
153
- data=f.read()
154
- data=json.loads(data)
155
- s2_dir="%s/%s"%(exp_root,exp_name)
156
- os.makedirs("%s/logs_s2"%(s2_dir),exist_ok=True)
157
- data["train"]["batch_size"]=batch_size
158
- data["train"]["epochs"]=total_epoch
159
- data["train"]["text_low_lr_rate"]=text_low_lr_rate
160
- data["train"]["pretrained_s2G"]=pretrained_s2G
161
- data["train"]["pretrained_s2D"]=pretrained_s2D
162
- data["train"]["if_save_latest"]=if_save_latest
163
- data["train"]["if_save_every_weights"]=if_save_every_weights
164
- data["train"]["save_every_epoch"]=save_every_epoch
165
- data["train"]["gpu_numbers"]=gpu_numbers1Ba
166
- data["data"]["exp_dir"]=data["s2_ckpt_dir"]=s2_dir
167
- data["save_weight_dir"]=SoVITS_weight_root
168
- data["name"]=exp_name
169
- tmp_config_path="TEMP/tmp_s2.json"
170
- with open(tmp_config_path,"w")as f:f.write(json.dumps(data))
171
-
172
- cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path)
173
- yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
174
- print(cmd)
175
- p_train_SoVITS = Popen(cmd, shell=True)
176
- p_train_SoVITS.wait()
177
- p_train_SoVITS=None
178
- yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
179
- else:
180
- yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
181
-
182
- def close1Ba():
183
- global p_train_SoVITS
184
- if(p_train_SoVITS!=None):
185
- kill_process(p_train_SoVITS.pid)
186
- p_train_SoVITS=None
187
- return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
188
-
189
- p_train_GPT=None
190
- def open1Bb(batch_size,total_epoch,exp_name,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
191
- global p_train_GPT
192
- if(p_train_GPT==None):
193
- with open("GPT_SoVITS/configs/s1longer.yaml")as f:
194
- data=f.read()
195
- data=yaml.load(data, Loader=yaml.FullLoader)
196
- s1_dir="%s/%s"%(exp_root,exp_name)
197
- os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
198
- data["train"]["batch_size"]=batch_size
199
- data["train"]["epochs"]=total_epoch
200
- data["pretrained_s1"]=pretrained_s1
201
- data["train"]["save_every_n_epoch"]=save_every_epoch
202
- data["train"]["if_save_every_weights"]=if_save_every_weights
203
- data["train"]["if_save_latest"]=if_save_latest
204
- data["train"]["half_weights_save_dir"]=GPT_weight_root
205
- data["train"]["exp_name"]=exp_name
206
- data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
207
- data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
208
- data["output_dir"]="%s/logs_s1"%s1_dir
209
-
210
- os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
211
- os.environ["hz"]="25hz"
212
- tmp_config_path="TEMP/tmp_s1.yaml"
213
- with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False))
214
- # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir)
215
- cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
216
- yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
217
- print(cmd)
218
- p_train_GPT = Popen(cmd, shell=True)
219
- p_train_GPT.wait()
220
- p_train_GPT=None
221
- yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
222
- else:
223
- yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
224
-
225
- def close1Bb():
226
- global p_train_GPT
227
- if(p_train_GPT!=None):
228
- kill_process(p_train_GPT.pid)
229
- p_train_GPT=None
230
- return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
231
-
232
- ps_slice=[]
233
- def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
234
- global ps_slice
235
- if(os.path.exists(inp)==False):
236
- yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
237
- return
238
- if os.path.isfile(inp):n_parts=1
239
- elif os.path.isdir(inp):pass
240
- else:
241
- yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
242
- return
243
- if (ps_slice == []):
244
- for i_part in range(n_parts):
245
- cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s''' % (python_exec,inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)
246
- print(cmd)
247
- p = Popen(cmd, shell=True)
248
- ps_slice.append(p)
249
- yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
250
- for p in ps_slice:
251
- p.wait()
252
- ps_slice=[]
253
- yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
254
- else:
255
- yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
256
-
257
- def close_slice():
258
- global ps_slice
259
- if (ps_slice != []):
260
- for p_slice in ps_slice:
261
- try:
262
- kill_process(p_slice.pid)
263
- except:
264
- traceback.print_exc()
265
- ps_slice=[]
266
- return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
267
-
268
- '''
269
- inp_text= os.environ.get("inp_text")
270
- inp_wav_dir= os.environ.get("inp_wav_dir")
271
- exp_name= os.environ.get("exp_name")
272
- i_part= os.environ.get("i_part")
273
- all_parts= os.environ.get("all_parts")
274
- os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES")
275
- opt_dir= os.environ.get("opt_dir")#"/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
276
- bert_pretrained_dir= os.environ.get("bert_pretrained_dir")#"/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"
277
- '''
278
- ps1a=[]
279
- def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
280
- global ps1a
281
- if (ps1a == []):
282
- config={
283
- "inp_text":inp_text,
284
- "inp_wav_dir":inp_wav_dir,
285
- "exp_name":exp_name,
286
- "opt_dir":"%s/%s"%(exp_root,exp_name),
287
- "bert_pretrained_dir":bert_pretrained_dir,
288
- }
289
- gpu_names=gpu_numbers.split("-")
290
- all_parts=len(gpu_names)
291
- for i_part in range(all_parts):
292
- config.update(
293
- {
294
- "i_part": str(i_part),
295
- "all_parts": str(all_parts),
296
- "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
297
- "is_half": str(is_half)
298
- }
299
- )
300
- os.environ.update(config)
301
- cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
302
- print(cmd)
303
- p = Popen(cmd, shell=True)
304
- ps1a.append(p)
305
- yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
306
- for p in ps1a:
307
- p.wait()
308
- ps1a=[]
309
- yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
310
- else:
311
- yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
312
-
313
- def close1a():
314
- global ps1a
315
- if (ps1a != []):
316
- for p1a in ps1a:
317
- try:
318
- kill_process(p1a.pid)
319
- except:
320
- traceback.print_exc()
321
- ps1a=[]
322
- return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
323
- '''
324
- inp_text= os.environ.get("inp_text")
325
- inp_wav_dir= os.environ.get("inp_wav_dir")
326
- exp_name= os.environ.get("exp_name")
327
- i_part= os.environ.get("i_part")
328
- all_parts= os.environ.get("all_parts")
329
- os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES")
330
- opt_dir= os.environ.get("opt_dir")
331
- cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir")
332
- '''
333
- ps1b=[]
334
- def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
335
- global ps1b
336
- if (ps1b == []):
337
- config={
338
- "inp_text":inp_text,
339
- "inp_wav_dir":inp_wav_dir,
340
- "exp_name":exp_name,
341
- "opt_dir":"%s/%s"%(exp_root,exp_name),
342
- "cnhubert_base_dir":ssl_pretrained_dir,
343
- "is_half": str(is_half)
344
- }
345
- gpu_names=gpu_numbers.split("-")
346
- all_parts=len(gpu_names)
347
- for i_part in range(all_parts):
348
- config.update(
349
- {
350
- "i_part": str(i_part),
351
- "all_parts": str(all_parts),
352
- "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
353
- }
354
- )
355
- os.environ.update(config)
356
- cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
357
- print(cmd)
358
- p = Popen(cmd, shell=True)
359
- ps1b.append(p)
360
- yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
361
- for p in ps1b:
362
- p.wait()
363
- ps1b=[]
364
- yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
365
- else:
366
- yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
367
-
368
- def close1b():
369
- global ps1b
370
- if (ps1b != []):
371
- for p1b in ps1b:
372
- try:
373
- kill_process(p1b.pid)
374
- except:
375
- traceback.print_exc()
376
- ps1b=[]
377
- return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
378
- '''
379
- inp_text= os.environ.get("inp_text")
380
- exp_name= os.environ.get("exp_name")
381
- i_part= os.environ.get("i_part")
382
- all_parts= os.environ.get("all_parts")
383
- os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES")
384
- opt_dir= os.environ.get("opt_dir")
385
- pretrained_s2G= os.environ.get("pretrained_s2G")
386
- '''
387
- ps1c=[]
388
- def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
389
- global ps1c
390
- if (ps1c == []):
391
- config={
392
- "inp_text":inp_text,
393
- "exp_name":exp_name,
394
- "opt_dir":"%s/%s"%(exp_root,exp_name),
395
- "pretrained_s2G":pretrained_s2G_path,
396
- "s2config_path":"GPT_SoVITS/configs/s2.json",
397
- "is_half": str(is_half)
398
- }
399
- gpu_names=gpu_numbers.split("-")
400
- all_parts=len(gpu_names)
401
- for i_part in range(all_parts):
402
- config.update(
403
- {
404
- "i_part": str(i_part),
405
- "all_parts": str(all_parts),
406
- "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
407
- }
408
- )
409
- os.environ.update(config)
410
- cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
411
- print(cmd)
412
- p = Popen(cmd, shell=True)
413
- ps1c.append(p)
414
- yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
415
- for p in ps1c:
416
- p.wait()
417
- ps1c=[]
418
- yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
419
- else:
420
- yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
421
-
422
- def close1c():
423
- global ps1c
424
- if (ps1c != []):
425
- for p1c in ps1c:
426
- try:
427
- kill_process(p1c.pid)
428
- except:
429
- traceback.print_exc()
430
- ps1c=[]
431
- return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
432
- #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
433
- ps1abc=[]
434
- def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
435
- global ps1abc
436
- if (ps1abc == []):
437
- opt_dir="%s/%s"%(exp_root,exp_name)
438
- try:
439
- #############################1a
440
- path_text="%s/2-name2text.txt" % opt_dir
441
- if(os.path.exists(path_text)==False):
442
- config={
443
- "inp_text":inp_text,
444
- "inp_wav_dir":inp_wav_dir,
445
- "exp_name":exp_name,
446
- "opt_dir":opt_dir,
447
- "bert_pretrained_dir":bert_pretrained_dir,
448
- "is_half": str(is_half)
449
- }
450
- gpu_names=gpu_numbers1a.split("-")
451
- all_parts=len(gpu_names)
452
- for i_part in range(all_parts):
453
- config.update(
454
- {
455
- "i_part": str(i_part),
456
- "all_parts": str(all_parts),
457
- "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
458
- }
459
- )
460
- os.environ.update(config)
461
- cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
462
- print(cmd)
463
- p = Popen(cmd, shell=True)
464
- ps1abc.append(p)
465
- yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
466
- for p in ps1abc:p.wait()
467
-
468
- opt = []
469
- for i_part in range(all_parts):#txt_path="%s/2-name2text-%s.txt"%(opt_dir,i_part)
470
- txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
471
- with open(txt_path, "r",encoding="utf8") as f:
472
- opt += f.read().strip("\n").split("\n")
473
- os.remove(txt_path)
474
- with open(path_text, "w",encoding="utf8") as f:
475
- f.write("\n".join(opt) + "\n")
476
-
477
- yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
478
- ps1abc=[]
479
- #############################1b
480
- config={
481
- "inp_text":inp_text,
482
- "inp_wav_dir":inp_wav_dir,
483
- "exp_name":exp_name,
484
- "opt_dir":opt_dir,
485
- "cnhubert_base_dir":ssl_pretrained_dir,
486
- }
487
- gpu_names=gpu_numbers1Ba.split("-")
488
- all_parts=len(gpu_names)
489
- for i_part in range(all_parts):
490
- config.update(
491
- {
492
- "i_part": str(i_part),
493
- "all_parts": str(all_parts),
494
- "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
495
- }
496
- )
497
- os.environ.update(config)
498
- cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
499
- print(cmd)
500
- p = Popen(cmd, shell=True)
501
- ps1abc.append(p)
502
- yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
503
- for p in ps1abc:p.wait()
504
- yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
505
- ps1abc=[]
506
- #############################1c
507
- path_semantic = "%s/6-name2semantic.tsv" % opt_dir
508
- if(os.path.exists(path_semantic)==False):
509
- config={
510
- "inp_text":inp_text,
511
- "exp_name":exp_name,
512
- "opt_dir":opt_dir,
513
- "pretrained_s2G":pretrained_s2G_path,
514
- "s2config_path":"GPT_SoVITS/configs/s2.json",
515
- }
516
- gpu_names=gpu_numbers1c.split("-")
517
- all_parts=len(gpu_names)
518
- for i_part in range(all_parts):
519
- config.update(
520
- {
521
- "i_part": str(i_part),
522
- "all_parts": str(all_parts),
523
- "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
524
- }
525
- )
526
- os.environ.update(config)
527
- cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
528
- print(cmd)
529
- p = Popen(cmd, shell=True)
530
- ps1abc.append(p)
531
- yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
532
- for p in ps1abc:p.wait()
533
-
534
- opt = ["item_name semantic_audio"]
535
- for i_part in range(all_parts):
536
- semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
537
- with open(semantic_path, "r",encoding="utf8") as f:
538
- opt += f.read().strip("\n").split("\n")
539
- os.remove(semantic_path)
540
- with open(path_semantic, "w",encoding="utf8") as f:
541
- f.write("\n".join(opt) + "\n")
542
- yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
543
- ps1abc = []
544
- yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
545
- except:
546
- traceback.print_exc()
547
- close1abc()
548
- yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
549
- else:
550
- yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
551
-
552
- def close1abc():
553
- global ps1abc
554
- if (ps1abc != []):
555
- for p1abc in ps1abc:
556
- try:
557
- kill_process(p1abc.pid)
558
- except:
559
- traceback.print_exc()
560
- ps1abc=[]
561
- return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
562
-
563
- with gr.Blocks(title="GPT-SoVITS WebUI") as app:
564
- gr.Markdown(
565
- value=
566
- "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>."
567
- )
568
- with gr.Tabs():
569
- with gr.TabItem("0-前置数据集获取工具"):#提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
570
- gr.Markdown(value="0a-UVR5人声伴奏分离&去混响去延迟工具")
571
- with gr.Row():
572
- if_uvr5 = gr.Checkbox(label="是否开启UVR5-WebUI",show_label=True)
573
- uvr5_info = gr.Textbox(label="UVR5进程输出信息")
574
- gr.Markdown(value="0b-语音切分工具")
575
- with gr.Row():
576
- with gr.Row():
577
- slice_inp_path=gr.Textbox(label="音频自动切分输入路径,可文件可文件夹",value="")
578
- slice_opt_root=gr.Textbox(label="切分后的子音频的输出根目录",value="output/slicer_opt")
579
- threshold=gr.Textbox(label="threshold:音量小于这个值视作静音的备选切割点",value="-34")
580
- min_length=gr.Textbox(label="min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值",value="4000")
581
- min_interval=gr.Textbox(label="min_interval:最短切割间隔",value="300")
582
- hop_size=gr.Textbox(label="hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",value="10")
583
- max_sil_kept=gr.Textbox(label="max_sil_kept:切完后静音最多留多长",value="500")
584
- with gr.Row():
585
- open_slicer_button=gr.Button("开启语音切割", variant="primary",visible=True)
586
- close_slicer_button=gr.Button("终止语音切割", variant="primary",visible=False)
587
- _max=gr.Slider(minimum=0,maximum=1,step=0.05,label="max:归一化后最大值多少",value=0.9,interactive=True)
588
- alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label="alpha_mix:混多少比例归一化后音频进来",value=0.25,interactive=True)
589
- n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label="切割使用的进程数",value=4,interactive=True)
590
- slicer_info = gr.Textbox(label="语音切割进程输出信息")
591
- gr.Markdown(value="0c-中文批量离线ASR工具")
592
- with gr.Row():
593
- open_asr_button = gr.Button("开启离线批量ASR", variant="primary",visible=True)
594
- close_asr_button = gr.Button("终止ASR进程", variant="primary",visible=False)
595
- asr_inp_dir = gr.Textbox(
596
- label="批量ASR(中文only)输入文件夹路径",
597
- value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx",
598
- interactive=True,
599
- )
600
- asr_info = gr.Textbox(label="ASR进程输出信息")
601
- gr.Markdown(value="0d-语音文本校对标注工具")
602
- with gr.Row():
603
- if_label = gr.Checkbox(label="是否开启打标WebUI",show_label=True)
604
- path_list = gr.Textbox(
605
- label="打标数据标注文件路径",
606
- value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list",
607
- interactive=True,
608
- )
609
- label_info = gr.Textbox(label="打标工具进程输出信息")
610
- if_label.change(change_label, [if_label,path_list], [label_info])
611
- if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
612
- open_asr_button.click(open_asr, [asr_inp_dir], [asr_info,open_asr_button,close_asr_button])
613
- close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
614
- open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
615
- close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
616
- with gr.TabItem("1-GPT-SoVITS-TTS"):
617
- with gr.Row():
618
- exp_name = gr.Textbox(label="*实验/模型名", value="xxx", interactive=True)
619
- gpu_info = gr.Textbox(label="显卡信息", value=gpu_info, visible=True, interactive=False)
620
- pretrained_s2G = gr.Textbox(label="预训练的SoVITS-G模型路径", value="GPT_SoVITS/pretrained_models/s2G488k.pth", interactive=True)
621
- pretrained_s2D = gr.Textbox(label="预训练的SoVITS-D模型路径", value="GPT_SoVITS/pretrained_models/s2D488k.pth", interactive=True)
622
- pretrained_s1 = gr.Textbox(label="预训练的GPT模型路径", value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", interactive=True)
623
- with gr.TabItem("1A-训练集格式化工具"):
624
- gr.Markdown(value="输出logs/实验名目录下应有23456开头的文件和文件夹")
625
- with gr.Row():
626
- inp_text = gr.Textbox(label="*文本标注文件",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list",interactive=True)
627
- inp_wav_dir = gr.Textbox(label="*训练集音频文件目录",value=r"D:\RVC1006\GPT-SoVITS\raw\xxx",interactive=True)
628
- gr.Markdown(value="1Aa-文本内容")
629
- with gr.Row():
630
- gpu_numbers1a = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True)
631
- bert_pretrained_dir = gr.Textbox(label="预训练的中文BERT模型路径",value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",interactive=False)
632
- button1a_open = gr.Button("开启文本获取", variant="primary",visible=True)
633
- button1a_close = gr.Button("终止文本获取进程", variant="primary",visible=False)
634
- info1a=gr.Textbox(label="文本进程输出信息")
635
- gr.Markdown(value="1Ab-SSL自监督特征提取")
636
- with gr.Row():
637
- gpu_numbers1Ba = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True)
638
- cnhubert_base_dir = gr.Textbox(label="预训练的SSL模型路径",value="GPT_SoVITS/pretrained_models/chinese-hubert-base",interactive=False)
639
- button1b_open = gr.Button("开启SSL提取", variant="primary",visible=True)
640
- button1b_close = gr.Button("终止SSL提取进程", variant="primary",visible=False)
641
- info1b=gr.Textbox(label="SSL进程输出信息")
642
- gr.Markdown(value="1Ac-语义token提取")
643
- with gr.Row():
644
- gpu_numbers1c = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程",value="%s-%s"%(gpus,gpus),interactive=True)
645
- button1c_open = gr.Button("开启语义token提取", variant="primary",visible=True)
646
- button1c_close = gr.Button("终止语义token提取进程", variant="primary",visible=False)
647
- info1c=gr.Textbox(label="语义token提取进程输出信息")
648
- gr.Markdown(value="1Aabc-训练集格式化一键三连")
649
- with gr.Row():
650
- button1abc_open = gr.Button("开启一键三连", variant="primary",visible=True)
651
- button1abc_close = gr.Button("终止一键三连", variant="primary",visible=False)
652
- info1abc=gr.Textbox(label="一键三连进程输出信息")
653
- button1a_open.click(open1a, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,bert_pretrained_dir], [info1a,button1a_open,button1a_close])
654
- button1a_close.click(close1a, [], [info1a,button1a_open,button1a_close])
655
- button1b_open.click(open1b, [inp_text,inp_wav_dir,exp_name,gpu_numbers1Ba,cnhubert_base_dir], [info1b,button1b_open,button1b_close])
656
- button1b_close.click(close1b, [], [info1b,button1b_open,button1b_close])
657
- button1c_open.click(open1c, [inp_text,exp_name,gpu_numbers1c,pretrained_s2G], [info1c,button1c_open,button1c_close])
658
- button1c_close.click(close1c, [], [info1c,button1c_open,button1c_close])
659
- button1abc_open.click(open1abc, [inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G], [info1abc,button1abc_open,button1abc_close])
660
- button1abc_close.click(close1abc, [], [info1abc,button1abc_open,button1abc_close])
661
- with gr.TabItem("1B-微调训练"):
662
- gr.Markdown(value="1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。")
663
- with gr.Row():
664
- batch_size = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
665
- total_epoch = gr.Slider(minimum=2,maximum=100,step=1,label=i18n("总训练轮数total_epoch,不建议太高"),value=10,interactive=True)
666
- text_low_lr_rate = gr.Slider(minimum=0.2,maximum=0.6,step=0.05,label="文本模块学习率权重",value=0.4,interactive=True)
667
- save_every_epoch = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
668
- if_save_latest = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
669
- if_save_every_weights = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
670
- gpu_numbers1Ba = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程", value="%s" % (gpus), interactive=True)
671
- with gr.Row():
672
- button1Ba_open = gr.Button("开启SoVITS训练", variant="primary",visible=True)
673
- button1Ba_close = gr.Button("终止SoVITS训练", variant="primary",visible=False)
674
- info1Ba=gr.Textbox(label="SoVITS训练进程输出信息")
675
- gr.Markdown(value="1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。")
676
- with gr.Row():
677
- batch_size1Bb = gr.Slider(minimum=1,maximum=40,step=1,label=i18n("每张显卡的batch_size"),value=default_batch_size,interactive=True)
678
- total_epoch1Bb = gr.Slider(minimum=2,maximum=200,step=1,label=i18n("总训练轮数total_epoch"),value=15,interactive=True)
679
- if_save_latest1Bb = gr.Checkbox(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), value=True, interactive=True, show_label=True)
680
- if_save_every_weights1Bb = gr.Checkbox(label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), value=True, interactive=True, show_label=True)
681
- save_every_epoch1Bb = gr.Slider(minimum=1,maximum=50,step=1,label=i18n("保存频率save_every_epoch"),value=5,interactive=True)
682
- gpu_numbers1Bb = gr.Textbox(label="GPU卡号以-分割,每个卡号一个进程", value="%s" % (gpus), interactive=True)
683
- with gr.Row():
684
- button1Bb_open = gr.Button("开启GPT训练", variant="primary",visible=True)
685
- button1Bb_close = gr.Button("终止GPT训练", variant="primary",visible=False)
686
- info1Bb=gr.Textbox(label="GPT训练进程输出信息")
687
- button1Ba_open.click(open1Ba, [batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D], [info1Ba,button1Ba_open,button1Ba_close])
688
- button1Ba_close.click(close1Ba, [], [info1Ba,button1Ba_open,button1Ba_close])
689
- button1Bb_open.click(open1Bb, [batch_size1Bb,total_epoch1Bb,exp_name,if_save_latest1Bb,if_save_every_weights1Bb,save_every_epoch1Bb,gpu_numbers1Bb,pretrained_s1], [info1Bb,button1Bb_open,button1Bb_close])
690
- button1Bb_close.click(close1Bb, [], [info1Bb,button1Bb_open,button1Bb_close])
691
- with gr.TabItem("1C-推理"):
692
- gr.Markdown(value="选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。")
693
- with gr.Row():
694
- GPT_dropdown = gr.Dropdown(label="*GPT模型列表", choices=sorted(GPT_names),value=pretrained_gpt_name)
695
- SoVITS_dropdown = gr.Dropdown(label="*SoVITS模型列表", choices=sorted(SoVITS_names),value=pretrained_sovits_name)
696
- gpu_number_1C=gr.Textbox(label="GPU卡号,只能填1个整数", value=gpus, interactive=True)
697
- refresh_button = gr.Button("刷新模型路径", variant="primary")
698
- refresh_button.click(fn=change_choices,inputs=[],outputs=[SoVITS_dropdown,GPT_dropdown])
699
- with gr.Row():
700
- if_tts = gr.Checkbox(label="是否开启TTS推理WebUI", show_label=True)
701
- tts_info = gr.Textbox(label="TTS推理WebUI进程输出信息")
702
- if_tts.change(change_tts_inference, [if_tts,bert_pretrained_dir,cnhubert_base_dir,gpu_number_1C,GPT_dropdown,SoVITS_dropdown], [tts_info])
703
- with gr.TabItem("2-GPT-SoVITS-变声"):gr.Markdown(value="施工中,请静候佳音")
704
-
705
- '''
706
- os.environ["gpt_path"]=gpt_path
707
- os.environ["sovits_path"]=sovits_path#bert_pretrained_dir
708
- os.environ["cnhubert_base_path"]=cnhubert_base_path#cnhubert_base_dir
709
- os.environ["bert_path"]=bert_path
710
- os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_number
711
- '''
712
-
713
- app.queue(concurrency_count=511, max_size=1022).launch(
714
- share=True,
715
- server_name="0.0.0.0",
716
- inbrowser=True,
717
- server_port=7890,
718
- quiet=True,
719
- )
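Note: the deleted webui.py terminates helper processes with a Windows-only taskkill call and leaves a TODO for Linux (kill -9). A generic cross-platform sketch, offered as an assumption about one possible approach rather than the project's actual fix, is shown below; it presumes each child is launched with Popen(..., start_new_session=True) so it owns its process group on POSIX:

import os
import signal
import subprocess
import sys

def kill_process_tree(pid: int) -> None:
    # Terminate a process and its children on both Windows and POSIX.
    if sys.platform == "win32":
        subprocess.run(["taskkill", "/t", "/f", "/pid", str(pid)], check=False)
    else:
        try:
            os.killpg(os.getpgid(pid), signal.SIGKILL)  # requires the child to lead its own process group
        except ProcessLookupError:
            pass  # process already gone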
 
GPT-SoVITS-models/GPT-SoVITS/.ipynb_checkpoints/启动webui-checkpoint.sh DELETED
@@ -1,2 +0,0 @@
- #!/bin/bash
- python /root/autodl-tmp/workdir/GPT-SoVITS/webui.py
 
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/.ipynb_checkpoints/inference_webui-checkpoint.py DELETED
@@ -1,270 +0,0 @@
1
- import os
2
- gpt_path=os.environ.get("gpt_path","pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
3
- sovits_path=os.environ.get("sovits_path","pretrained_models/s2G488k.pth")
4
- cnhubert_base_path=os.environ.get("cnhubert_base_path","pretrained_models/chinese-hubert-base")
5
- bert_path=os.environ.get("bert_path","pretrained_models/chinese-roberta-wwm-ext-large")
6
- if("_CUDA_VISIBLE_DEVICES"in os.environ):
7
- os.environ["CUDA_VISIBLE_DEVICES"]=os.environ["_CUDA_VISIBLE_DEVICES"]
8
- is_half=eval(os.environ.get("is_half","True"))
9
- import gradio as gr
10
- from transformers import AutoModelForMaskedLM, AutoTokenizer
11
- import sys,torch,numpy as np
12
- from pathlib import Path
13
- import os,pdb,utils,librosa,math,traceback,requests,argparse,torch,multiprocessing,pandas as pd,torch.multiprocessing as mp,soundfile
14
- # torch.backends.cuda.sdp_kernel("flash")
15
- # torch.backends.cuda.enable_flash_sdp(True)
16
- # torch.backends.cuda.enable_mem_efficient_sdp(True) # Not avaliable if torch version is lower than 2.0
17
- # torch.backends.cuda.enable_math_sdp(True)
18
- from random import shuffle
19
- from AR.utils import get_newest_ckpt
20
- from glob import glob
21
- from tqdm import tqdm
22
- from feature_extractor import cnhubert
23
- cnhubert.cnhubert_base_path=cnhubert_base_path
24
- from io import BytesIO
25
- from module.models import SynthesizerTrn
26
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
27
- from AR.utils.io import load_yaml_config
28
- from text import cleaned_text_to_sequence
29
- from text.cleaner import text_to_sequence, clean_text
30
- from time import time as ttime
31
- from module.mel_processing import spectrogram_torch
32
- from my_utils import load_audio
33
-
34
- device="cuda"
35
- tokenizer = AutoTokenizer.from_pretrained(bert_path)
36
- bert_model=AutoModelForMaskedLM.from_pretrained(bert_path)
37
- if(is_half==True):bert_model=bert_model.half().to(device)
38
- else:bert_model=bert_model.to(device)
39
- # bert_model=bert_model.to(device)
40
- def get_bert_feature(text, word2ph):
41
- with torch.no_grad():
42
- inputs = tokenizer(text, return_tensors="pt")
43
- for i in inputs:
44
- inputs[i] = inputs[i].to(device)#####输入是long不用管精度问题,精度随bert_model
45
- res = bert_model(**inputs, output_hidden_states=True)
46
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
47
- assert len(word2ph) == len(text)
48
- phone_level_feature = []
49
- for i in range(len(word2ph)):
50
- repeat_feature = res[i].repeat(word2ph[i], 1)
51
- phone_level_feature.append(repeat_feature)
52
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
53
- # if(is_half==True):phone_level_feature=phone_level_feature.half()
54
- return phone_level_feature.T
55
-
56
- n_semantic = 1024
57
- dict_s2=torch.load(sovits_path,map_location="cpu")
58
- hps=dict_s2["config"]
59
- class DictToAttrRecursive:
60
- def __init__(self, input_dict):
61
- for key, value in input_dict.items():
62
- if isinstance(value, dict):
63
- # 如果值是字典,递归调用构造函数
64
- setattr(self, key, DictToAttrRecursive(value))
65
- else:
66
- setattr(self, key, value)
67
-
68
- hps = DictToAttrRecursive(hps)
69
- hps.model.semantic_frame_rate="25hz"
70
- dict_s1=torch.load(gpt_path,map_location="cpu")
71
- config=dict_s1["config"]
72
- ssl_model=cnhubert.get_model()
73
- if(is_half==True):ssl_model=ssl_model.half().to(device)
74
- else:ssl_model=ssl_model.to(device)
75
-
76
- vq_model = SynthesizerTrn(
77
- hps.data.filter_length // 2 + 1,
78
- hps.train.segment_size // hps.data.hop_length,
79
- n_speakers=hps.data.n_speakers,
80
- **hps.model)
81
- if(is_half==True):vq_model=vq_model.half().to(device)
82
- else:vq_model=vq_model.to(device)
83
- vq_model.eval()
84
- print(vq_model.load_state_dict(dict_s2["weight"],strict=False))
85
- hz = 50
86
- max_sec = config['data']['max_sec']
87
- # t2s_model = Text2SemanticLightningModule.load_from_checkpoint(checkpoint_path=gpt_path, config=config, map_location="cpu")#########todo
88
- t2s_model = Text2SemanticLightningModule(config,"ojbk",is_train=False)
89
- t2s_model.load_state_dict(dict_s1["weight"])
90
- if(is_half==True):t2s_model=t2s_model.half()
91
- t2s_model=t2s_model.to(device)
92
- t2s_model.eval()
93
- total = sum([param.nelement() for param in t2s_model.parameters()])
94
- print("Number of parameter: %.2fM" % (total / 1e6))
95
- def get_spepc(hps, filename):
96
- audio=load_audio(filename,int(hps.data.sampling_rate))
97
- audio=torch.FloatTensor(audio)
98
- audio_norm = audio
99
- audio_norm = audio_norm.unsqueeze(0)
100
- spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
101
- return spec
102
-
103
- dict_language={
104
- "中文":"zh",
105
- "英文":"en",
106
- "日文":"ja"
107
- }
108
- def get_tts_wav(ref_wav_path,prompt_text,prompt_language,text,text_language):
109
- t0 = ttime()
110
- prompt_text=prompt_text.strip("\n")
111
- prompt_language,text=prompt_language,text.strip("\n")
112
- with torch.no_grad():
113
- wav16k, sr = librosa.load(ref_wav_path, sr=16000) # 派蒙
114
- wav16k = torch.from_numpy(wav16k)
115
- if(is_half==True):wav16k=wav16k.half().to(device)
116
- else:wav16k=wav16k.to(device)
117
- ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
118
- codes = vq_model.extract_latent(ssl_content)
119
- prompt_semantic = codes[0, 0]
120
- t1 = ttime()
121
- prompt_language=dict_language[prompt_language]
122
- text_language=dict_language[text_language]
123
- phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
124
- phones1=cleaned_text_to_sequence(phones1)
125
- texts=text.split("\n")
126
- audio_opt = []
127
- zero_wav=np.zeros(int(hps.data.sampling_rate*0.3),dtype=np.float16 if is_half==True else np.float32)
128
- for text in texts:
129
- phones2, word2ph2, norm_text2 = clean_text(text, text_language)
130
- phones2 = cleaned_text_to_sequence(phones2)
131
- if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1)
132
- else:bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
133
- if(text_language=="zh"):bert2 = get_bert_feature(norm_text2, word2ph2)
134
- else:bert2 = torch.zeros((1024, len(phones2))).to(bert1)
135
- bert = torch.cat([bert1, bert2], 1)
136
-
137
- all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
138
- bert = bert.to(device).unsqueeze(0)
139
- all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
140
- prompt = prompt_semantic.unsqueeze(0).to(device)
141
- t2 = ttime()
142
- with torch.no_grad():
143
- # pred_semantic = t2s_model.model.infer(
144
- pred_semantic,idx = t2s_model.model.infer_panel(
145
- all_phoneme_ids,
146
- all_phoneme_len,
147
- prompt,
148
- bert,
149
- # prompt_phone_len=ph_offset,
150
- top_k=config['inference']['top_k'],
151
- early_stop_num=hz * max_sec)
152
- t3 = ttime()
153
- # print(pred_semantic.shape,idx)
154
- pred_semantic = pred_semantic[:,-idx:].unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次
155
- refer = get_spepc(hps, ref_wav_path)#.to(device)
156
- if(is_half==True):refer=refer.half().to(device)
157
- else:refer=refer.to(device)
158
- # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
159
- audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer).detach().cpu().numpy()[0, 0]###试试重建不带上prompt部分
160
- audio_opt.append(audio)
161
- audio_opt.append(zero_wav)
162
- t4 = ttime()
163
- print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
164
- yield hps.data.sampling_rate,(np.concatenate(audio_opt,0)*32768).astype(np.int16)
165
-
166
-
167
- splits={",","。","?","!",",",".","?","!","~",":",":","—","…",}#不考虑省略号
168
- def split(todo_text):
169
- todo_text = todo_text.replace("……", "。").replace("——", ",")
170
- if (todo_text[-1] not in splits): todo_text += "。"
171
- i_split_head = i_split_tail = 0
172
- len_text = len(todo_text)
173
- todo_texts = []
174
- while (1):
175
- if (i_split_head >= len_text): break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入
176
- if (todo_text[i_split_head] in splits):
177
- i_split_head += 1
178
- todo_texts.append(todo_text[i_split_tail:i_split_head])
179
- i_split_tail = i_split_head
180
- else:
181
- i_split_head += 1
182
- return todo_texts
183
- def cut1(inp):
184
- inp=inp.strip("\n")
185
- inps=split(inp)
186
- split_idx=list(range(0,len(inps),5))
187
- split_idx[-1]=None
188
- if(len(split_idx)>1):
189
- opts=[]
190
- for idx in range(len(split_idx)-1):
191
- opts.append("".join(inps[split_idx[idx]:split_idx[idx+1]]))
192
- else:
193
- opts=[inp]
194
- return "\n".join(opts)
195
-
196
- def cut2(inp):
197
- inp=inp.strip("\n")
198
- inps=split(inp)
199
- if(len(inps)<2):return [inp]
200
- opts=[]
201
- summ=0
202
- tmp_str=""
203
- for i in range(len(inps)):
204
- summ+=len(inps[i])
205
- tmp_str+=inps[i]
206
- if(summ>50):
207
- summ=0
208
- opts.append(tmp_str)
209
- tmp_str=""
210
- if(tmp_str!=""):opts.append(tmp_str)
211
- if(len(opts[-1])<50):##如果最后一个太短了,和前一个合一起
212
- opts[-2]=opts[-2]+opts[-1]
213
- opts=opts[:-1]
214
- return "\n".join(opts)
215
-
216
- def cut3(inp):
217
- inp=inp.strip("\n")
218
- return "\n".join(["%s。"%item for item in inp.strip("。").split("。")])
219
-
220
- with gr.Blocks(title="GPT-SoVITS WebUI") as app:
221
- gr.Markdown(
222
- value=
223
- "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>."
224
- )
225
- # with gr.Tabs():
226
- # with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
227
- with gr.Group():
228
- gr.Markdown(
229
- value=
230
- "*请上传并填写参考信息"
231
- )
232
- with gr.Row():
233
- inp_ref = gr.Audio(label="请上传参考音频", type="filepath")
234
- prompt_text= gr.Textbox(label="参考音频的文本",value="")
235
- prompt_language= gr.Dropdown(label="参考音频的语种",choices=["中文","英文","日文"])
236
- gr.Markdown(
237
- value=
238
- "*请填写需要合成的目标文本"
239
- )
240
- with gr.Row():
241
- text=gr.Textbox(label="需要合成的文本",value="")
242
- text_language = gr.Dropdown(label="需要合成的语种", choices=["中文", "英文", "日文"])
243
- inference_button=gr.Button("合成语音", variant="primary")
244
- output = gr.Audio(label="输出的语音")
245
- inference_button.click(get_tts_wav, [inp_ref, prompt_text,prompt_language, text,text_language], [output])
246
-
247
- gr.Markdown(
248
- value=
249
- "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"
250
- )
251
- with gr.Row():
252
- text_inp=gr.Textbox(label="需要合成的切分前文本",value="")
253
- button1 = gr.Button("凑五句一切", variant="primary")
254
- button2 = gr.Button("凑50字一切", variant="primary")
255
- button3 = gr.Button("按中文句号。切", variant="primary")
256
- text_opt = gr.Textbox(label="切分后文本", value="")
257
- button1.click(cut1,[text_inp],[text_opt])
258
- button2.click(cut2,[text_inp],[text_opt])
259
- button3.click(cut3,[text_inp],[text_opt])
260
- gr.Markdown(
261
- value=
262
- "后续将支持混合语种编码文本输入。"
263
- )
264
-
265
- app.queue(concurrency_count=511, max_size=1022).launch(
266
- server_name="0.0.0.0",
267
- inbrowser=True,
268
- server_port=6006,
269
- quiet=True,
270
- )
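Note: the deleted inference_webui.py includes small text-splitting helpers (split, cut1, cut2, cut3) that break long input at CJK/Latin punctuation and regroup it before synthesis. The sketch below restates the core of split plus the roughly-50-character grouping of cut2 in simplified form (it omits the original's merge of a short trailing chunk); it is an illustration, not the deleted code verbatim:

# Break text at punctuation, then pack consecutive pieces into ~50-character chunks.
SPLITS = set("，。？！,.?!~:：—…")

def split_sentences(text: str) -> list[str]:
    pieces, start = [], 0
    for i, ch in enumerate(text):
        if ch in SPLITS:
            pieces.append(text[start:i + 1])
            start = i + 1
    if start < len(text):  # trailing text without closing punctuation
        pieces.append(text[start:])
    return pieces

def pack_chunks(text: str, limit: int = 50) -> list[str]:
    chunks, cur = [], ""
    for piece in split_sentences(text):
        cur += piece
        if len(cur) > limit:
            chunks.append(cur)
            cur = ""
    if cur:
        chunks.append(cur)
    return chunks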
 
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/__init__.py DELETED
File without changes
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/__init__.py DELETED
File without changes
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/bucket_sampler.py DELETED
@@ -1,157 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/bucketsampler.py
2
- import itertools
3
- import math
4
- import random
5
- from random import shuffle
6
- from typing import Iterator
7
- from typing import Optional
8
- from typing import TypeVar
9
-
10
- import torch
11
- import torch.distributed as dist
12
- from torch.utils.data import Dataset
13
- from torch.utils.data import Sampler
14
-
15
- __all__ = [
16
- "DistributedBucketSampler",
17
- ]
18
-
19
- T_co = TypeVar('T_co', covariant=True)
20
-
21
-
22
- class DistributedBucketSampler(Sampler[T_co]):
23
- r"""
24
- sort the dataset wrt. input length
25
- divide samples into buckets
26
- sort within buckets
27
- divide buckets into batches
28
- sort batches
29
- """
30
-
31
- def __init__(self,
32
- dataset: Dataset,
33
- num_replicas: Optional[int]=None,
34
- rank: Optional[int]=None,
35
- shuffle: bool=True,
36
- seed: int=0,
37
- drop_last: bool=False,
38
- batch_size: int=32) -> None:
39
- if num_replicas is None:
40
- if not dist.is_available():
41
- raise RuntimeError(
42
- "Requires distributed package to be available")
43
- num_replicas = dist.get_world_size()
44
- if rank is None:
45
- if not dist.is_available():
46
- raise RuntimeError(
47
- "Requires distributed package to be available")
48
- rank = dist.get_rank()
49
- torch.cuda.set_device(rank)
50
- if rank >= num_replicas or rank < 0:
51
- raise ValueError("Invalid rank {}, rank should be in the interval"
52
- " [0, {}]".format(rank, num_replicas - 1))
53
- self.dataset = dataset
54
- self.num_replicas = num_replicas
55
- self.rank = rank
56
- self.epoch = 0
57
- self.drop_last = drop_last
58
- # If the dataset length is evenly divisible by # of replicas, then there
59
- # is no need to drop any data, since the dataset will be split equally.
60
- if self.drop_last and len(
61
- self.
62
- dataset) % self.num_replicas != 0: # type: ignore[arg-type]
63
- # Split to nearest available length that is evenly divisible.
64
- # This is to ensure each rank receives the same amount of data when
65
- # using this Sampler.
66
- self.num_samples = math.ceil(
67
- (len(self.dataset) - self.num_replicas) /
68
- self.num_replicas # type: ignore[arg-type]
69
- )
70
- else:
71
- self.num_samples = math.ceil(
72
- len(self.dataset) / self.num_replicas) # type: ignore[arg-type]
73
- self.total_size = self.num_samples * self.num_replicas
74
- self.shuffle = shuffle
75
- self.seed = seed
76
- self.batch_size = batch_size
77
- self.id_with_length = self._get_sample_lengths()
78
- self.id_buckets = self.make_buckets(bucket_width=2.0)
79
-
80
- def _get_sample_lengths(self):
81
- id_with_lengths = []
82
- for i in range(len(self.dataset)):
83
- id_with_lengths.append((i, self.dataset.get_sample_length(i)))
84
- id_with_lengths.sort(key=lambda x: x[1])
85
- return id_with_lengths
86
-
87
- def make_buckets(self, bucket_width: float=2.0):
88
- buckets = []
89
- cur = []
90
- max_sec = bucket_width
91
- for id, sec in self.id_with_length:
92
- if sec < max_sec:
93
- cur.append(id)
94
- else:
95
- buckets.append(cur)
96
- cur = [id]
97
- max_sec += bucket_width
98
- if len(cur) > 0:
99
- buckets.append(cur)
100
- return buckets
101
-
102
- def __iter__(self) -> Iterator[T_co]:
103
- if self.shuffle:
104
- # deterministically shuffle based on epoch and seed
105
- g = torch.Generator()
106
- g.manual_seed(self.seed + self.epoch)
107
- random.seed(self.epoch + self.seed)
108
- shuffled_bucket = []
109
- for buc in self.id_buckets:
110
- buc_copy = buc.copy()
111
- shuffle(buc_copy)
112
- shuffled_bucket.append(buc_copy)
113
- grouped_batch_size = self.batch_size * self.num_replicas
114
- shuffled_bucket = list(itertools.chain(*shuffled_bucket))
115
- n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size))
116
- batches = [
117
- shuffled_bucket[b * grouped_batch_size:(b + 1) *
118
- grouped_batch_size] for b in range(n_batch)
119
- ]
120
- shuffle(batches)
121
- indices = list(itertools.chain(*batches))
122
- else:
123
- # type: ignore[arg-type]
124
- indices = list(range(len(self.dataset)))
125
-
126
- if not self.drop_last:
127
- # add extra samples to make it evenly divisible
128
- padding_size = self.total_size - len(indices)
129
- if padding_size <= len(indices):
130
- indices += indices[:padding_size]
131
- else:
132
- indices += (indices * math.ceil(padding_size /
133
- len(indices)))[:padding_size]
134
- else:
135
- # remove tail of data to make it evenly divisible.
136
- indices = indices[:self.total_size]
137
- assert len(indices) == self.total_size
138
-
139
- # subsample
140
- indices = indices[self.rank:self.total_size:self.num_replicas]
141
- assert len(indices) == self.num_samples
142
-
143
- return iter(indices)
144
-
145
- def __len__(self) -> int:
146
- return self.num_samples
147
-
148
- def set_epoch(self, epoch: int) -> None:
149
- r"""
150
- Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
151
- use a different random ordering for each epoch. Otherwise, the next iteration of this
152
- sampler will yield the same ordering.
153
-
154
- Args:
155
- epoch (int): Epoch number.
156
- """
157
- self.epoch = epoch
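As a rough standalone sketch of the bucketing idea the sampler above implements (sort clips by length, bucket in 2-second steps, slice into batches, then shuffle the batches so each batch holds similar-length clips), the snippet below does not import the deleted class; the toy durations and the `batch_size` of 8 are made up for illustration.

```python
import random

# toy (index, duration-in-seconds) pairs standing in for dataset.get_sample_length(i)
lengths = [(i, random.uniform(0.5, 12.0)) for i in range(100)]
lengths.sort(key=lambda x: x[1])  # sort the dataset w.r.t. input length

# group into buckets of width 2 s, mirroring make_buckets(bucket_width=2.0)
buckets, cur, max_sec = [], [], 2.0
for idx, sec in lengths:
    if sec < max_sec:
        cur.append(idx)
    else:
        buckets.append(cur)
        cur = [idx]
        max_sec += 2.0
if cur:
    buckets.append(cur)

# shuffle inside each bucket, flatten, slice into batches, shuffle the batches:
# every batch then holds clips of similar length, so padding waste stays small
batch_size = 8
for b in buckets:
    random.shuffle(b)
flat = [i for b in buckets for i in b]
batches = [flat[k:k + batch_size] for k in range(0, len(flat), batch_size)]
random.shuffle(batches)
print(batches[0])
```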
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/data_module.py DELETED
@@ -1,66 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/data_module.py
2
- from pytorch_lightning import LightningDataModule
3
- from AR.data.bucket_sampler import DistributedBucketSampler
4
- from AR.data.dataset import Text2SemanticDataset
5
- from torch.utils.data import DataLoader
6
-
7
-
8
- class Text2SemanticDataModule(LightningDataModule):
9
- def __init__(self, config, train_semantic_path, train_phoneme_path,dev_semantic_path=None, dev_phoneme_path=None):
10
- super().__init__()
11
- self.config = config
12
- self.train_semantic_path = train_semantic_path
13
- self.train_phoneme_path = train_phoneme_path
14
- self.dev_semantic_path = dev_semantic_path
15
- self.dev_phoneme_path = dev_phoneme_path
16
- self.num_workers = self.config['data']['num_workers']
17
-
18
- def prepare_data(self):
19
- pass
20
-
21
- def setup(self, stage=None, output_logs=False):
22
- self._train_dataset = Text2SemanticDataset(
23
- phoneme_path=self.train_phoneme_path,
24
- semantic_path=self.train_semantic_path,
25
- max_sec=self.config['data']['max_sec'],
26
- pad_val=self.config['data']['pad_val'])
27
- self._dev_dataset = self._train_dataset
28
- # self._dev_dataset = Text2SemanticDataset(
29
- # phoneme_path=self.dev_phoneme_path,
30
- # semantic_path=self.dev_semantic_path,
31
- # max_sample=self.config['data']['max_eval_sample'],
32
- # max_sec=self.config['data']['max_sec'],
33
- # pad_val=self.config['data']['pad_val'])
34
-
35
- def train_dataloader(self):
36
- batch_size = self.config['train']['batch_size']
37
- sampler = DistributedBucketSampler(
38
- self._train_dataset, batch_size=batch_size)
39
- return DataLoader(
40
- self._train_dataset,
41
- batch_size=batch_size,
42
- sampler=sampler,
43
- collate_fn=self._train_dataset.collate,
44
- num_workers=self.num_workers,
45
- persistent_workers=True,
46
- prefetch_factor=16
47
- )
48
-
49
- def val_dataloader(self):
50
- return DataLoader(
51
- self._dev_dataset,
52
- batch_size=1,
53
- shuffle=False,
54
- collate_fn=self._train_dataset.collate,
55
- num_workers=max(self.num_workers,12),
56
- persistent_workers=True,
57
- prefetch_factor=16
58
- )
59
-
60
- # is this ever actually used?
61
- def test_dataloader(self):
62
- return DataLoader(
63
- self._dev_dataset,
64
- batch_size=1,
65
- shuffle=False,
66
- collate_fn=self._train_dataset.collate)
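For reference, the data module above only reads a handful of config keys. Below is a minimal sketch of that config shape: the numeric values are placeholders, only the key names come from the code, and the paths in the commented-out call follow the `2-name2text.txt` / `6-name2semantic.tsv` naming mentioned in `dataset.py`.

```python
# minimal config shape read by Text2SemanticDataModule; the values below are placeholders
config = {
    "data": {
        "num_workers": 4,   # DataLoader workers
        "max_sec": 54,      # longest clip kept, in seconds
        "pad_val": 1024,    # padding id for semantic tokens
    },
    "train": {
        "batch_size": 8,    # per-replica batch size handed to DistributedBucketSampler
    },
}

# hypothetical call; the real paths are produced by the data-prep stage of the pipeline
# dm = Text2SemanticDataModule(config,
#                              train_semantic_path="logs/exp/6-name2semantic.tsv",
#                              train_phoneme_path="logs/exp/2-name2text.txt")
```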
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/data/dataset.py DELETED
@@ -1,302 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/t2s_dataset.py
2
- import pdb
3
- import sys
4
- # sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert")
5
- import traceback,os
6
- from typing import Dict
7
- from typing import List
8
-
9
- import numpy as np
10
- import pandas as pd
11
- import torch,json
12
- from torch.utils.data import DataLoader
13
- from torch.utils.data import Dataset
14
- from transformers import AutoTokenizer
15
-
16
- from text import cleaned_text_to_sequence
17
- # from config import exp_dir
18
-
19
- def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0):
20
- seq = sequences[0]
21
- ndim = seq.ndim
22
- if axis < 0:
23
- axis += ndim
24
- dtype = seq.dtype
25
- pad_value = dtype.type(pad_value)
26
- seq_lengths = [seq.shape[axis] for seq in sequences]
27
- max_length = np.max(seq_lengths)
28
-
29
- padded_sequences = []
30
- for seq, length in zip(sequences, seq_lengths):
31
- padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
32
- ndim - axis - 1)
33
- padded_seq = np.pad(
34
- seq, padding, mode='constant', constant_values=pad_value)
35
- padded_sequences.append(padded_seq)
36
- batch = np.stack(padded_sequences)
37
- return batch
38
-
39
- class Text2SemanticDataset(Dataset):
40
- """dataset class for text tokens to semantic model training."""
41
-
42
- def __init__(self,
43
- phoneme_path: str,
44
- semantic_path: str,
45
- max_sample: int = None,
46
- max_sec: int = 100,
47
- pad_val: int = 1024,
48
- # min value of phoneme/sec
49
- min_ps_ratio: int = 3,
50
- # max value of phoneme/sec
51
- max_ps_ratio: int = 25) -> None:
52
- super().__init__()
53
-
54
- self.semantic_data = pd.read_csv(semantic_path, delimiter='\t', encoding="utf-8")
55
- # get dict
56
- self.path2=phoneme_path#"%s/2-name2text.txt"%exp_dir#phoneme_path
57
- self.path3="%s/3-bert"%(os.path.dirname(phoneme_path))#"%s/3-bert"%exp_dir#bert_dir
58
- self.path6=semantic_path#"%s/6-name2semantic.tsv"%exp_dir#semantic_path
59
- assert os.path.exists(self.path2)
60
- assert os.path.exists(self.path6)
61
- self.phoneme_data={}
62
- with open(self.path2,"r",encoding="utf8")as f:
63
- lines=f.read().strip("\n").split("\n")
64
-
65
- for line in lines:
66
- tmp=line.split("\t")
67
- if(len(tmp)!=4):continue
68
- self.phoneme_data[tmp[0]]=[tmp[1],tmp[2],tmp[3]]
69
-
70
- # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item()
71
- # pad for semantic tokens
72
- self.PAD: int = pad_val
73
- # self.hz = 25
74
- # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f:data = f.read()
75
- # data=json.loads(data)["model"]["semantic_frame_rate"]#50hz
76
- # self.hz=int(data[:-2])#
77
- self.hz=int(os.environ.get("hz","25hz")[:-2])
78
-
79
- # max seconds of semantic token
80
- self.max_sec = max_sec
81
- self.min_ps_ratio = min_ps_ratio
82
- self.max_ps_ratio = max_ps_ratio
83
-
84
- if max_sample is not None:
85
- self.semantic_data = self.semantic_data[:max_sample]
86
-
87
- # {idx: (semantic, phoneme)}
88
- # semantic list, phoneme list
89
- self.semantic_phoneme = []
90
- self.item_names = []
91
-
92
- self.inited = False
93
-
94
- if not self.inited:
95
- # call the initialization routine
96
- self.init_batch()
97
- self.inited = True
98
- del self.semantic_data
99
- del self.phoneme_data
100
- # self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
101
- # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large")
102
-
103
-
104
- def init_batch(self):
105
- semantic_data_len = len(self.semantic_data)
106
- phoneme_data_len = len(self.phoneme_data.keys())
107
- print("semantic_data_len:", semantic_data_len)
108
- print("phoneme_data_len:", phoneme_data_len)
109
- idx = 0
110
- num_not_in = 0
111
- num_deleted_bigger = 0
112
- num_deleted_ps = 0
113
- for i in range(semantic_data_len):
114
- # iterate through the entries in order first
115
- # get str
116
- item_name = self.semantic_data['item_name'][i]
117
- # print(self.phoneme_data)
118
- try:
119
- phoneme, word2ph, text = self.phoneme_data[item_name]
120
- except Exception:
121
- traceback.print_exc()
122
- # print(f"{item_name} not in self.phoneme_data !")
123
- num_not_in += 1
124
- continue
125
-
126
- semantic_str = self.semantic_data['semantic_audio'][i]
127
- # get token list
128
- semantic_ids = [int(idx) for idx in semantic_str.split(' ')]
129
- # (T); does it need to become (1, T)? -> no, because we need to take len()
130
- # filter out samples that are too long
131
- if len(semantic_ids) > self.max_sec * self.hz:#########1### estimate the total duration from the token count and drop clips longer than max_sec (60 s in the config)#40*25=1k
132
- num_deleted_bigger += 1
133
- continue
134
- # (T,); this is not slow, so it can be handled up front instead of per item in __getitem__ ####
135
- phoneme = phoneme.split(' ')
136
-
137
- try:
138
- phoneme_ids = cleaned_text_to_sequence(phoneme)
139
- except:
140
- traceback.print_exc()
141
- # print(f"{item_name} not in self.phoneme_data !")
142
- num_not_in += 1
143
- continue
144
- # if len(phoneme_ids) >400:###########2: change to a fixed cap of semantic_len/2.5 instead
145
- if len(phoneme_ids) >self.max_sec * self.hz/2.5:###########2: change to a fixed cap of semantic_len/2.5 instead
146
- num_deleted_ps += 1
147
- continue
148
- # if len(semantic_ids) > 1000:###########3
149
- # num_deleted_bigger += 1
150
- # continue
151
-
152
- ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz)
153
-
154
- if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio:##########4# 3~25 # phonemes per second
155
- num_deleted_ps += 1
156
- # print(item_name)
157
- continue
158
-
159
- self.semantic_phoneme.append((semantic_ids, phoneme_ids))
160
- idx += 1
161
- self.item_names.append(item_name)
162
-
163
- min_num=100# with 20, no repetition-padding at all; with 30, padded but the ckpt is still not saved
164
- leng =len(self.semantic_phoneme)
165
- if(leng<min_num):
166
- tmp1=self.semantic_phoneme
167
- tmp2=self.item_names
168
- self.semantic_phoneme=[]
169
- self.item_names=[]
170
- for _ in range(max(2,int(min_num/leng))):
171
- self.semantic_phoneme+=tmp1
172
- self.item_names+=tmp2
173
- if num_not_in > 0:
174
- print(f"there are {num_not_in} semantic entries not found in the phoneme data")
175
- if num_deleted_bigger > 0:
176
- print(
177
- f"deleted {num_deleted_bigger} audios whose duration is longer than {self.max_sec} seconds"
178
- )
179
- if num_deleted_ps > 0:
180
- # 4702 for LibriTTS; LibriTTS is labelled data, does it still need filtering? => yes, there are extreme values of 100
181
- print(
182
- f"deleted {num_deleted_ps} audios whose phoneme/sec is higher than {self.max_ps_ratio} or lower than {self.min_ps_ratio}"
183
- )
184
- '''
185
- there are 31 semantic datas not in phoneme datas
186
- deleted 34 audios who's duration are bigger than 54 seconds
187
- deleted 3190 audios who's phoneme/sec are bigger than 25 or smaller than 3
188
- dataset.__len__(): 366463
189
-
190
- '''
191
- # 345410 for LibriTTS
192
- print("dataset.__len__():", self.__len__())
193
-
194
- def __get_item_names__(self) -> List[str]:
195
- return self.item_names
196
-
197
- def __len__(self) -> int:
198
- return len(self.semantic_phoneme)
199
-
200
- def __getitem__(self, idx: int) -> Dict:
201
- semantic_ids, phoneme_ids = self.semantic_phoneme[idx]
202
- item_name = self.item_names[idx]
203
- phoneme_ids_len = len(phoneme_ids)
204
- # semantic tokens target
205
- semantic_ids_len = len(semantic_ids)
206
-
207
- flag=0
208
- path_bert = "%s/%s.pt" % (self.path3, item_name)
209
- if(os.path.exists(path_bert)==True):bert_feature = torch.load(path_bert,map_location="cpu")
210
- else:flag=1
211
- if(flag==1):
212
- # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
213
- bert_feature=None
214
- else:
215
- assert bert_feature.shape[-1] == len(phoneme_ids)
216
- return {
217
- 'idx': idx,
218
- 'phoneme_ids': phoneme_ids,
219
- 'phoneme_ids_len': phoneme_ids_len,
220
- 'semantic_ids': semantic_ids,
221
- 'semantic_ids_len': semantic_ids_len,
222
- 'bert_feature': bert_feature,
223
- }
224
-
225
- def get_sample_length(self, idx: int):
226
- semantic_ids = self.semantic_phoneme[idx][0]
227
- sec = 1.0 * len(semantic_ids) / self.hz
228
- return sec
229
-
230
- def collate(self, examples: List[Dict]) -> Dict:
231
- sample_index: List[int] = []
232
- phoneme_ids: List[torch.Tensor] = []
233
- phoneme_ids_lens: List[int] = []
234
- semantic_ids: List[torch.Tensor] = []
235
- semantic_ids_lens: List[int] = []
236
- # return
237
-
238
-
239
- for item in examples:
240
- sample_index.append(item["idx"])
241
- phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64))
242
- semantic_ids.append(np.array(item["semantic_ids"], dtype=np.int64))
243
- phoneme_ids_lens.append(item["phoneme_ids_len"])
244
- semantic_ids_lens.append(item["semantic_ids_len"])
245
-
246
- # pad 0
247
- phoneme_ids = batch_sequences(phoneme_ids)
248
- semantic_ids = batch_sequences(semantic_ids, pad_value=self.PAD)
249
-
250
- # # convert each batch to torch.tensor
251
- phoneme_ids = torch.tensor(phoneme_ids)
252
- semantic_ids = torch.tensor(semantic_ids)
253
- phoneme_ids_lens = torch.tensor(phoneme_ids_lens)
254
- semantic_ids_lens = torch.tensor(semantic_ids_lens)
255
- bert_padded = torch.FloatTensor(len(examples), 1024, max(phoneme_ids_lens))
256
- bert_padded.zero_()
257
-
258
- for idx, item in enumerate(examples):
259
- bert = item['bert_feature']
260
- if(bert!=None):
261
- bert_padded[idx, :, :bert.shape[-1]] = bert
262
-
263
- return {
264
- # List[int]
265
- "ids": sample_index,
266
- # torch.Tensor (B, max_phoneme_length)
267
- "phoneme_ids": phoneme_ids,
268
- # torch.Tensor (B)
269
- "phoneme_ids_len": phoneme_ids_lens,
270
- # torch.Tensor (B, max_semantic_ids_length)
271
- "semantic_ids": semantic_ids,
272
- # torch.Tensor (B)
273
- "semantic_ids_len": semantic_ids_lens,
274
- # torch.Tensor (B, 1024, max_phoneme_length)
275
- "bert_feature": bert_padded,
276
- }
277
-
278
-
279
- if __name__ == '__main__':
280
- root_dir = '/data/docker/liujing04/gpt-vits/prepare/dump_mix/'
281
- dataset = Text2SemanticDataset(
282
- phoneme_path=root_dir + 'phoneme_train.npy',
283
- semantic_path=root_dir + 'semantic_train.tsv')
284
-
285
- batch_size = 12
286
- dataloader = DataLoader(
287
- dataset,
288
- batch_size=batch_size,
289
- collate_fn=dataset.collate,
290
- shuffle=False)
291
- for i, batch in enumerate(dataloader):
292
- if(i%1000==0):print(i)
293
- # if i == 0:
294
- # print('batch["ids"]:', batch["ids"])
295
- # print('batch["phoneme_ids"]:', batch["phoneme_ids"],
296
- # batch["phoneme_ids"].shape)
297
- # print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"],
298
- # batch["phoneme_ids_len"].shape)
299
- # print('batch["semantic_ids"]:', batch["semantic_ids"],
300
- # batch["semantic_ids"].shape)
301
- # print('batch["semantic_ids_len"]:', batch["semantic_ids_len"],
302
- # batch["semantic_ids_len"].shape)
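The collate path above right-pads variable-length id sequences before stacking them into a batch. Below is a small self-contained sketch of that padding step; it does not import the deleted module, the toy sequences are made up, and the pad value of 1024 mirrors the `pad_val` default above.

```python
import numpy as np
import torch

# toy variable-length id sequences, like the ones collate() receives
seqs = [np.array([5, 7, 9], dtype=np.int64),
        np.array([2, 4], dtype=np.int64),
        np.array([8, 8, 8, 8], dtype=np.int64)]

# right-pad to the longest sequence, which is what batch_sequences() does with np.pad
pad_value = 1024  # semantic PAD id above; phoneme ids are padded with 0 instead
max_len = max(s.shape[0] for s in seqs)
batch = np.stack([np.pad(s, (0, max_len - s.shape[0]),
                         mode="constant", constant_values=pad_value) for s in seqs])

lengths = torch.tensor([s.shape[0] for s in seqs])
print(torch.tensor(batch))  # (3, 4) padded batch
print(lengths)              # kept alongside the batch so the model can mask the padding
```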
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/__init__.py DELETED
File without changes
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/BEATs.py DELETED
@@ -1,179 +0,0 @@
1
- # --------------------------------------------------------
2
- # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
- # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
- # Copyright (c) 2022 Microsoft
5
- # Licensed under The MIT License [see LICENSE for details]
6
- # Based on fairseq code bases
7
- # https://github.com/pytorch/fairseq
8
- # --------------------------------------------------------
9
- import logging
10
- from typing import Optional
11
-
12
- import torch
13
- import torch.nn as nn
14
- import torchaudio.compliance.kaldi as ta_kaldi
15
- from torch.nn import LayerNorm
16
-
17
- from .backbone import TransformerEncoder
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class BEATsConfig:
23
- def __init__(self, cfg=None):
24
- self.input_patch_size: int = -1 # path size of patch embedding
25
- self.embed_dim: int = 512 # patch embedding dimension
26
- self.conv_bias: bool = False # include bias in conv encoder
27
-
28
- self.encoder_layers: int = 12 # num encoder layers in the transformer
29
- self.encoder_embed_dim: int = 768 # encoder embedding dimension
30
- self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
31
- self.encoder_attention_heads: int = 12 # num encoder attention heads
32
- self.activation_fn: str = "gelu" # activation function to use
33
-
34
- self.layer_wise_gradient_decay_ratio: float = 1.0 # ratio for layer-wise gradient decay
35
- self.layer_norm_first: bool = False # apply layernorm first in the transformer
36
- self.deep_norm: bool = False # apply deep_norm first in the transformer
37
-
38
- # dropouts
39
- self.dropout: float = 0.1 # dropout probability for the transformer
40
- self.attention_dropout: float = 0.1 # dropout probability for attention weights
41
- self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
42
- self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer
43
- self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
44
-
45
- # positional embeddings
46
- self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
47
- self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
48
-
49
- # relative position embedding
50
- self.relative_position_embedding: bool = False # apply relative position embedding
51
- self.num_buckets: int = 320 # number of buckets for relative position embedding
52
- self.max_distance: int = 1280 # maximum distance for relative position embedding
53
- self.gru_rel_pos: bool = False # apply gated relative position embedding
54
-
55
- # label predictor
56
- self.finetuned_model: bool = False # whether the model is a fine-tuned model.
57
- self.predictor_dropout: float = 0.1 # dropout probability for the predictor
58
- self.predictor_class: int = 527 # target class number for the predictor
59
-
60
- if cfg is not None:
61
- self.update(cfg)
62
-
63
- def update(self, cfg: dict):
64
- self.__dict__.update(cfg)
65
-
66
-
67
- class BEATs(nn.Module):
68
- def __init__(
69
- self,
70
- cfg: BEATsConfig, ) -> None:
71
- super().__init__()
72
- logger.info(f"BEATs Config: {cfg.__dict__}")
73
-
74
- self.cfg = cfg
75
-
76
- self.embed = cfg.embed_dim
77
- self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
78
- if self.embed != cfg.encoder_embed_dim else
79
- None)
80
-
81
- self.input_patch_size = cfg.input_patch_size
82
- self.patch_embedding = nn.Conv2d(
83
- 1,
84
- self.embed,
85
- kernel_size=self.input_patch_size,
86
- stride=self.input_patch_size,
87
- bias=cfg.conv_bias)
88
-
89
- self.dropout_input = nn.Dropout(cfg.dropout_input)
90
-
91
- assert not cfg.deep_norm or not cfg.layer_norm_first
92
- self.encoder = TransformerEncoder(cfg)
93
- self.layer_norm = LayerNorm(self.embed)
94
-
95
- if cfg.finetuned_model:
96
- self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
97
- self.predictor = nn.Linear(cfg.encoder_embed_dim,
98
- cfg.predictor_class)
99
- else:
100
- self.predictor = None
101
-
102
- def forward_padding_mask(
103
- self,
104
- features: torch.Tensor,
105
- padding_mask: torch.Tensor, ) -> torch.Tensor:
106
- extra = padding_mask.size(1) % features.size(1)
107
- if extra > 0:
108
- padding_mask = padding_mask[:, :-extra]
109
- padding_mask = padding_mask.view(
110
- padding_mask.size(0), features.size(1), -1)
111
- padding_mask = padding_mask.all(-1)
112
- return padding_mask
113
-
114
- def preprocess(
115
- self,
116
- source: torch.Tensor,
117
- fbank_mean: float=15.41663,
118
- fbank_std: float=6.55582, ) -> torch.Tensor:
119
- fbanks = []
120
- for waveform in source:
121
- waveform = waveform.unsqueeze(0) * 2**15
122
- fbank = ta_kaldi.fbank(
123
- waveform,
124
- num_mel_bins=128,
125
- sample_frequency=16000,
126
- frame_length=25,
127
- frame_shift=10)
128
- fbanks.append(fbank)
129
- fbank = torch.stack(fbanks, dim=0)
130
- fbank = (fbank - fbank_mean) / (2 * fbank_std)
131
- return fbank
132
-
133
- def extract_features(
134
- self,
135
- source: torch.Tensor,
136
- padding_mask: Optional[torch.Tensor]=None,
137
- fbank_mean: float=15.41663,
138
- fbank_std: float=6.55582, ):
139
- fbank = self.preprocess(
140
- source, fbank_mean=fbank_mean, fbank_std=fbank_std)
141
-
142
- if padding_mask is not None:
143
- padding_mask = self.forward_padding_mask(fbank, padding_mask)
144
-
145
- fbank = fbank.unsqueeze(1)
146
- features = self.patch_embedding(fbank)
147
- features = features.reshape(features.shape[0], features.shape[1], -1)
148
- features = features.transpose(1, 2)
149
- features = self.layer_norm(features)
150
-
151
- if padding_mask is not None:
152
- padding_mask = self.forward_padding_mask(features, padding_mask)
153
-
154
- if self.post_extract_proj is not None:
155
- features = self.post_extract_proj(features)
156
-
157
- x = self.dropout_input(features)
158
-
159
- x, layer_results = self.encoder(
160
- x,
161
- padding_mask=padding_mask, )
162
-
163
- if self.predictor is not None:
164
- x = self.predictor_dropout(x)
165
- logits = self.predictor(x)
166
-
167
- if padding_mask is not None and padding_mask.any():
168
- logits[padding_mask] = 0
169
- logits = logits.sum(dim=1)
170
- logits = logits / (~padding_mask).sum(
171
- dim=1).unsqueeze(-1).expand_as(logits)
172
- else:
173
- logits = logits.mean(dim=1)
174
-
175
- lprobs = torch.sigmoid(logits)
176
-
177
- return lprobs, padding_mask
178
- else:
179
- return x, padding_mask
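`preprocess()` above turns raw 16 kHz audio into a 128-bin Kaldi fbank, which the patch embedding then cuts into square patches. A quick sketch of the resulting shapes follows, assuming torchaudio is installed and an `input_patch_size` of 16 (the value is not fixed in this config class; 16 is assumed here for illustration).

```python
import torch
import torchaudio.compliance.kaldi as ta_kaldi

# one second of 16 kHz audio, scaled the same way preprocess() scales it
wav = torch.randn(1, 16000) * 2 ** 15
fbank = ta_kaldi.fbank(wav, num_mel_bins=128, sample_frequency=16000,
                       frame_length=25, frame_shift=10)
print(fbank.shape)  # roughly (98, 128): 10 ms hop, 25 ms window, 128 mel bins

# the Conv2d patch embedding tiles this spectrogram with non-overlapping square patches,
# so with an assumed patch size of 16, a 1 s clip gives about (98 // 16) * (128 // 16) = 48 tokens
patch = 16
print((fbank.shape[0] // patch) * (fbank.shape[1] // patch))
```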
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/README.md DELETED
@@ -1,127 +0,0 @@
1
-
2
- # BEATs
3
-
4
- [**BEATs**](https://arxiv.org/abs/2212.09058): **Audio Pre-Training with Acoustic Tokenizers**
5
-
6
- Official PyTorch implementation and pretrained models of BEATs
7
-
8
- ## Pre-Trained and Fine-Tuned Tokenizers and Models
9
- Iterations | Tokenizer | Pre-Trained Model | AudioSet Fine-Tuned Model 1 | AudioSet Fine-Tuned Model 2
10
- |---|---|---|---|---
11
- Iter1 | Random Projection | [BEATs_iter1](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
12
- Iter2 | [Tokenizer_iter2](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter2](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
13
- Iter3 | [Tokenizer_iter3](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
14
- Iter3+ | [Tokenizer_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
15
- Iter3+ | [Tokenizer_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
16
-
17
-
18
- ### Load Tokenizers
19
-
20
- ```python
21
- import torch
22
- from Tokenizers import TokenizersConfig, Tokenizers
23
-
24
- # load the pre-trained checkpoints
25
- checkpoint = torch.load('/path/to/tokenizer.pt')
26
-
27
- cfg = TokenizersConfig(checkpoint['cfg'])
28
- BEATs_tokenizer = Tokenizers(cfg)
29
- BEATs_tokenizer.load_state_dict(checkpoint['model'])
30
- BEATs_tokenizer.eval()
31
-
32
- # tokenize the audio and generate the labels
33
- audio_input_16khz = torch.randn(1, 10000)
34
- padding_mask = torch.zeros(1, 10000).bool()
35
-
36
- labels = BEATs_tokenizer.extract_labels(audio_input_16khz, padding_mask=padding_mask)
37
- ```
38
-
39
-
40
- ### Load Pre-Trained Models
41
-
42
- ```python
43
- import torch
44
- from BEATs import BEATs, BEATsConfig
45
-
46
- # load the pre-trained checkpoints
47
- checkpoint = torch.load('/path/to/model.pt')
48
-
49
- cfg = BEATsConfig(checkpoint['cfg'])
50
- BEATs_model = BEATs(cfg)
51
- BEATs_model.load_state_dict(checkpoint['model'])
52
- BEATs_model.eval()
53
-
54
- # extract the the audio representation
55
- audio_input_16khz = torch.randn(1, 10000)
56
- padding_mask = torch.zeros(1, 10000).bool()
57
-
58
- representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
59
- ```
60
-
61
-
62
- ### Load Fine-tuned Models
63
-
64
- ```python
65
- import torch
66
- from BEATs import BEATs, BEATsConfig
67
-
68
- # load the fine-tuned checkpoints
69
- checkpoint = torch.load('/path/to/model.pt')
70
-
71
- cfg = BEATsConfig(checkpoint['cfg'])
72
- BEATs_model = BEATs(cfg)
73
- BEATs_model.load_state_dict(checkpoint['model'])
74
- BEATs_model.eval()
75
-
76
- # predict the classification probability of each class
77
- audio_input_16khz = torch.randn(3, 10000)
78
- padding_mask = torch.zeros(3, 10000).bool()
79
-
80
- probs = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
81
-
82
- for i, (top5_label_prob, top5_label_idx) in enumerate(zip(*probs.topk(k=5))):
83
- top5_label = [checkpoint['label_dict'][label_idx.item()] for label_idx in top5_label_idx]
84
- print(f'Top 5 predicted labels of the {i}th audio are {top5_label} with probability of {top5_label_prob}')
85
- ```
86
-
87
- ## Evaluation Results
88
-
89
- ### Comparing with the SOTA Single Models
90
- ![alt text](Evaluation_Results/Comparing_with_the_SOTA_Single_Models.png)
91
-
92
-
93
- ### Comparing with the SOTA Ensemble Models
94
- ![alt text](Evaluation_Results/Comparing_with_the_SOTA_Ensemble_Models.png)
95
-
96
-
97
- ### Comparing Different BEATS Tokenizers
98
- ![alt text](Evaluation_Results/Comparing_Different_BEATS_Tokenizers.png)
99
-
100
-
101
- ### Comparing Different Pre-Training Targets
102
- ![alt text](Evaluation_Results/Comparing_Different_Pre-Training_Targets.png)
103
-
104
-
105
- ## License
106
- This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
107
- Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) and [VQGAN](https://github.com/CompVis/taming-transformers) project.
108
-
109
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
110
-
111
-
112
- ### Reference
113
- If you find our work is useful in your research, please cite the following paper:
114
- ``` latex
115
- @article{Chen2022beats,
116
- title = {BEATs: Audio Pre-Training with Acoustic Tokenizers},
117
- author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Shujie Liu and Daniel Tompkins and Zhuo Chen and Furu Wei},
118
- eprint={2212.09058},
119
- archivePrefix={arXiv},
120
- year={2022}
121
- }
122
- ```
123
- ### Contact Information
124
-
125
- For help or issues using BEATs models, please submit a GitHub issue.
126
-
127
- For other communications related to BEATs, please contact Yu Wu (`[email protected]`).
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/Tokenizers.py DELETED
@@ -1,172 +0,0 @@
1
- # --------------------------------------------------------
2
- # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
- # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
- # Copyright (c) 2022 Microsoft
5
- # Licensed under The MIT License [see LICENSE for details]
6
- # Based on fairseq code bases
7
- # https://github.com/pytorch/fairseq
8
- # --------------------------------------------------------
9
- import logging
10
- from typing import Optional
11
-
12
- import torch
13
- import torch.nn as nn
14
- import torchaudio.compliance.kaldi as ta_kaldi
15
- from backbone import (
16
- TransformerEncoder, )
17
- from quantizer import (
18
- NormEMAVectorQuantizer, )
19
- from torch.nn import LayerNorm
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
- class TokenizersConfig:
25
- def __init__(self, cfg=None):
26
- self.input_patch_size: int = -1 # path size of patch embedding
27
- self.embed_dim: int = 512 # patch embedding dimension
28
- self.conv_bias: bool = False # include bias in conv encoder
29
-
30
- self.encoder_layers: int = 12 # num encoder layers in the transformer
31
- self.encoder_embed_dim: int = 768 # encoder embedding dimension
32
- self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
33
- self.encoder_attention_heads: int = 12 # num encoder attention heads
34
- self.activation_fn: str = "gelu" # activation function to use
35
-
36
- self.layer_norm_first: bool = False # apply layernorm first in the transformer
37
- self.deep_norm: bool = False # apply deep_norm first in the transformer
38
-
39
- # dropouts
40
- self.dropout: float = 0.1 # dropout probability for the transformer
41
- self.attention_dropout: float = 0.1 # dropout probability for attention weights
42
- self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
43
- self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer
44
- self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
45
-
46
- # positional embeddings
47
- self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
48
- self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
49
-
50
- # relative position embedding
51
- self.relative_position_embedding: bool = False # apply relative position embedding
52
- self.num_buckets: int = 320 # number of buckets for relative position embedding
53
- self.max_distance: int = 1280 # maximum distance for relative position embedding
54
- self.gru_rel_pos: bool = False # apply gated relative position embedding
55
-
56
- # quantizer
57
- self.quant_n: int = 1024 # codebook number in quantizer
58
- self.quant_dim: int = 256 # codebook dimension in quantizer
59
-
60
- if cfg is not None:
61
- self.update(cfg)
62
-
63
- def update(self, cfg: dict):
64
- self.__dict__.update(cfg)
65
-
66
-
67
- class Tokenizers(nn.Module):
68
- def __init__(
69
- self,
70
- cfg: TokenizersConfig, ) -> None:
71
- super().__init__()
72
- logger.info(f"Tokenizers Config: {cfg.__dict__}")
73
-
74
- self.cfg = cfg
75
-
76
- self.embed = cfg.embed_dim
77
- self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
78
- if self.embed != cfg.encoder_embed_dim else
79
- None)
80
-
81
- self.input_patch_size = cfg.input_patch_size
82
- self.patch_embedding = nn.Conv2d(
83
- 1,
84
- self.embed,
85
- kernel_size=self.input_patch_size,
86
- stride=self.input_patch_size,
87
- bias=cfg.conv_bias)
88
-
89
- self.dropout_input = nn.Dropout(cfg.dropout_input)
90
-
91
- assert not cfg.deep_norm or not cfg.layer_norm_first
92
- self.encoder = TransformerEncoder(cfg)
93
- self.layer_norm = LayerNorm(self.embed)
94
-
95
- self.quantize = NormEMAVectorQuantizer(
96
- n_embed=cfg.quant_n,
97
- embedding_dim=cfg.quant_dim,
98
- beta=1.0,
99
- kmeans_init=True,
100
- decay=0.99, )
101
- self.quant_n = cfg.quant_n
102
- self.quantize_layer = nn.Sequential(
103
- nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
104
- nn.Tanh(),
105
- nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim) # for quantize
106
- )
107
-
108
- def forward_padding_mask(
109
- self,
110
- features: torch.Tensor,
111
- padding_mask: torch.Tensor, ) -> torch.Tensor:
112
- extra = padding_mask.size(1) % features.size(1)
113
- if extra > 0:
114
- padding_mask = padding_mask[:, :-extra]
115
- padding_mask = padding_mask.view(
116
- padding_mask.size(0), features.size(1), -1)
117
- padding_mask = padding_mask.all(-1)
118
- return padding_mask
119
-
120
- def preprocess(
121
- self,
122
- source: torch.Tensor,
123
- fbank_mean: float=15.41663,
124
- fbank_std: float=6.55582, ) -> torch.Tensor:
125
- fbanks = []
126
- for waveform in source:
127
- waveform = waveform.unsqueeze(0) * 2**15
128
- fbank = ta_kaldi.fbank(
129
- waveform,
130
- num_mel_bins=128,
131
- sample_frequency=16000,
132
- frame_length=25,
133
- frame_shift=10)
134
- fbanks.append(fbank)
135
- fbank = torch.stack(fbanks, dim=0)
136
- fbank = (fbank - fbank_mean) / (2 * fbank_std)
137
- return fbank
138
-
139
- def extract_labels(
140
- self,
141
- source: torch.Tensor,
142
- padding_mask: Optional[torch.Tensor]=None,
143
- fbank_mean: float=15.41663,
144
- fbank_std: float=6.55582, ):
145
- fbank = self.preprocess(
146
- source, fbank_mean=fbank_mean, fbank_std=fbank_std)
147
-
148
- if padding_mask is not None:
149
- padding_mask = self.forward_padding_mask(fbank, padding_mask)
150
-
151
- fbank = fbank.unsqueeze(1)
152
- features = self.patch_embedding(fbank)
153
- features = features.reshape(features.shape[0], features.shape[1], -1)
154
- features = features.transpose(1, 2)
155
- features = self.layer_norm(features)
156
-
157
- if padding_mask is not None:
158
- padding_mask = self.forward_padding_mask(features, padding_mask)
159
-
160
- if self.post_extract_proj is not None:
161
- features = self.post_extract_proj(features)
162
-
163
- x = self.dropout_input(features)
164
-
165
- x, layer_results = self.encoder(
166
- x,
167
- padding_mask=padding_mask, )
168
-
169
- quantize_input = self.quantize_layer(x)
170
- quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input)
171
-
172
- return embed_ind
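`extract_labels()` above ends by mapping each frame embedding to a discrete codebook index. Below is a standalone sketch of that nearest-code lookup on normalized vectors; it is only an approximation of what `NormEMAVectorQuantizer` does at inference, the codebook here is random, and the sizes simply reuse the `quant_n` / `quant_dim` defaults from the config.

```python
import torch
import torch.nn.functional as F

quant_n, quant_dim = 1024, 256  # cfg.quant_n / cfg.quant_dim defaults above

# random stand-in for the learned codebook and for 48 encoder frames, both L2-normalized
codebook = F.normalize(torch.randn(quant_n, quant_dim), dim=-1)
frames = F.normalize(torch.randn(1, 48, quant_dim), dim=-1)  # (batch, time, quant_dim)

# cosine similarity against every code, argmax -> one discrete label per frame,
# analogous to the embed_ind returned by extract_labels()
sims = frames @ codebook.t()      # (1, 48, 1024)
embed_ind = sims.argmax(dim=-1)   # (1, 48) integer ids in [0, quant_n)
print(embed_ind.shape, int(embed_ind.min()), int(embed_ind.max()))
```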
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- # this folder is modified from https://github.com/microsoft/unilm/tree/master/beats
2
- # ontology.json is from https://github.com/audioset/ontology/
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/backbone.py DELETED
@@ -1,791 +0,0 @@
1
- # --------------------------------------------------------
2
- # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
- # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
- # Copyright (c) 2022 Microsoft
5
- # Licensed under The MIT License [see LICENSE for details]
6
- # Based on fairseq code bases
7
- # https://github.com/pytorch/fairseq
8
- # --------------------------------------------------------
9
- import math
10
- from typing import Dict
11
- from typing import Optional
12
- from typing import Tuple
13
-
14
- import numpy as np
15
- import torch
16
- import torch.nn.functional as F
17
- from torch import nn
18
- from torch import Tensor
19
- from torch.nn import LayerNorm
20
- from torch.nn import Parameter
21
-
22
- from .modules import get_activation_fn
23
- from .modules import GLU_Linear
24
- from .modules import GradMultiply
25
- from .modules import quant_noise
26
- from .modules import SamePad
27
-
28
-
29
- class TransformerEncoder(nn.Module):
30
- def __init__(self, args):
31
- super().__init__()
32
-
33
- self.dropout = args.dropout
34
- self.embedding_dim = args.encoder_embed_dim
35
-
36
- self.pos_conv = nn.Conv1d(
37
- self.embedding_dim,
38
- self.embedding_dim,
39
- kernel_size=args.conv_pos,
40
- padding=args.conv_pos // 2,
41
- groups=args.conv_pos_groups, )
42
- dropout = 0
43
- std = math.sqrt(
44
- (4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
45
- nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
46
- nn.init.constant_(self.pos_conv.bias, 0)
47
-
48
- self.pos_conv = nn.utils.weight_norm(
49
- self.pos_conv, name="weight", dim=2)
50
- self.pos_conv = nn.Sequential(self.pos_conv,
51
- SamePad(args.conv_pos), nn.GELU())
52
-
53
- if hasattr(args, "relative_position_embedding"):
54
- self.relative_position_embedding = args.relative_position_embedding
55
- self.num_buckets = args.num_buckets
56
- self.max_distance = args.max_distance
57
- else:
58
- self.relative_position_embedding = False
59
- self.num_buckets = 0
60
- self.max_distance = 0
61
-
62
- self.layers = nn.ModuleList([
63
- TransformerSentenceEncoderLayer(
64
- embedding_dim=self.embedding_dim,
65
- ffn_embedding_dim=args.encoder_ffn_embed_dim,
66
- num_attention_heads=args.encoder_attention_heads,
67
- dropout=self.dropout,
68
- attention_dropout=args.attention_dropout,
69
- activation_dropout=args.activation_dropout,
70
- activation_fn=args.activation_fn,
71
- layer_norm_first=args.layer_norm_first,
72
- deep_norm=args.deep_norm,
73
- has_relative_attention_bias=self.relative_position_embedding,
74
- num_buckets=self.num_buckets,
75
- max_distance=self.max_distance,
76
- gru_rel_pos=args.gru_rel_pos,
77
- encoder_layers=args.encoder_layers, )
78
- for i in range(args.encoder_layers)
79
- ])
80
- if self.relative_position_embedding:
81
- for i in range(1, args.encoder_layers):
82
- del self.layers[i].self_attn.relative_attention_bias
83
- self.layers[i].self_attn.relative_attention_bias = self.layers[
84
- 0].self_attn.relative_attention_bias
85
-
86
- self.layer_norm_first = args.layer_norm_first
87
- self.layer_norm = LayerNorm(self.embedding_dim)
88
- self.layerdrop = args.encoder_layerdrop
89
-
90
- self.apply(init_bert_params)
91
-
92
- if args.deep_norm:
93
- deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4)
94
- for i in range(args.encoder_layers):
95
- nn.init.xavier_normal_(
96
- self.layers[i].self_attn.k_proj.weight, gain=1)
97
- nn.init.xavier_normal_(
98
- self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta)
99
- nn.init.xavier_normal_(
100
- self.layers[i].self_attn.q_proj.weight, gain=1)
101
- nn.init.xavier_normal_(
102
- self.layers[i].self_attn.out_proj.weight,
103
- gain=deep_norm_beta)
104
- nn.init.xavier_normal_(
105
- self.layers[i].fc1.weight, gain=deep_norm_beta)
106
- nn.init.xavier_normal_(
107
- self.layers[i].fc2.weight, gain=deep_norm_beta)
108
-
109
- self.layer_wise_gradient_decay_ratio = getattr(
110
- args, "layer_wise_gradient_decay_ratio", 1)
111
-
112
- def forward(self, x, padding_mask=None, layer=None):
113
- x, layer_results = self.extract_features(x, padding_mask, layer)
114
-
115
- if self.layer_norm_first and layer is None:
116
- x = self.layer_norm(x)
117
-
118
- return x, layer_results
119
-
120
- def extract_features(self, x, padding_mask=None, tgt_layer=None):
121
-
122
- if padding_mask is not None:
123
- x[padding_mask] = 0
124
-
125
- x_conv = self.pos_conv(x.transpose(1, 2))
126
- x_conv = x_conv.transpose(1, 2)
127
- x = x + x_conv
128
-
129
- if not self.layer_norm_first:
130
- x = self.layer_norm(x)
131
-
132
- x = F.dropout(x, p=self.dropout, training=self.training)
133
-
134
- # B x T x C -> T x B x C
135
- x = x.transpose(0, 1)
136
-
137
- layer_results = []
138
- z = None
139
- if tgt_layer is not None:
140
- layer_results.append((x, z))
141
- r = None
142
- pos_bias = None
143
- for i, layer in enumerate(self.layers):
144
- if self.layer_wise_gradient_decay_ratio != 1.0:
145
- x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
146
- dropout_probability = np.random.random()
147
- if not self.training or (dropout_probability > self.layerdrop):
148
- x, z, pos_bias = layer(
149
- x,
150
- self_attn_padding_mask=padding_mask,
151
- need_weights=False,
152
- pos_bias=pos_bias)
153
- if tgt_layer is not None:
154
- layer_results.append((x, z))
155
- if i == tgt_layer:
156
- r = x
157
- break
158
-
159
- if r is not None:
160
- x = r
161
-
162
- # T x B x C -> B x T x C
163
- x = x.transpose(0, 1)
164
-
165
- return x, layer_results
166
-
167
-
168
- class TransformerSentenceEncoderLayer(nn.Module):
169
- def __init__(
170
- self,
171
- embedding_dim: float=768,
172
- ffn_embedding_dim: float=3072,
173
- num_attention_heads: float=8,
174
- dropout: float=0.1,
175
- attention_dropout: float=0.1,
176
- activation_dropout: float=0.1,
177
- activation_fn: str="relu",
178
- layer_norm_first: bool=False,
179
- deep_norm: bool=False,
180
- has_relative_attention_bias: bool=False,
181
- num_buckets: int=0,
182
- max_distance: int=0,
183
- rescale_init: bool=False,
184
- gru_rel_pos: bool=False,
185
- encoder_layers: int=0, ) -> None:
186
-
187
- super().__init__()
188
- self.embedding_dim = embedding_dim
189
- self.dropout = dropout
190
- self.activation_dropout = activation_dropout
191
-
192
- self.activation_name = activation_fn
193
- self.activation_fn = get_activation_fn(activation_fn)
194
- self.self_attn = MultiheadAttention(
195
- self.embedding_dim,
196
- num_attention_heads,
197
- dropout=attention_dropout,
198
- self_attention=True,
199
- has_relative_attention_bias=has_relative_attention_bias,
200
- num_buckets=num_buckets,
201
- max_distance=max_distance,
202
- rescale_init=rescale_init,
203
- gru_rel_pos=gru_rel_pos, )
204
-
205
- self.dropout1 = nn.Dropout(dropout)
206
- self.dropout2 = nn.Dropout(self.activation_dropout)
207
- self.dropout3 = nn.Dropout(dropout)
208
-
209
- self.layer_norm_first = layer_norm_first
210
-
211
- self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
212
-
213
- if self.activation_name == "glu":
214
- self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim,
215
- "swish")
216
- else:
217
- self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
218
- self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
219
-
220
- self.final_layer_norm = LayerNorm(self.embedding_dim)
221
-
222
- self.deep_norm = deep_norm
223
- if self.deep_norm:
224
- self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
225
- else:
226
- self.deep_norm_alpha = 1
227
-
228
- def forward(self,
229
- x: torch.Tensor,
230
- self_attn_mask: torch.Tensor=None,
231
- self_attn_padding_mask: torch.Tensor=None,
232
- need_weights: bool=False,
233
- pos_bias=None):
234
- residual = x
235
-
236
- if self.layer_norm_first:
237
- x = self.self_attn_layer_norm(x)
238
- x, attn, pos_bias = self.self_attn(
239
- query=x,
240
- key=x,
241
- value=x,
242
- key_padding_mask=self_attn_padding_mask,
243
- need_weights=False,
244
- attn_mask=self_attn_mask,
245
- position_bias=pos_bias)
246
- x = self.dropout1(x)
247
- x = residual + x
248
-
249
- residual = x
250
- x = self.final_layer_norm(x)
251
- if self.activation_name == "glu":
252
- x = self.fc1(x)
253
- else:
254
- x = self.activation_fn(self.fc1(x))
255
- x = self.dropout2(x)
256
- x = self.fc2(x)
257
- x = self.dropout3(x)
258
- x = residual + x
259
- else:
260
- x, attn, pos_bias = self.self_attn(
261
- query=x,
262
- key=x,
263
- value=x,
264
- key_padding_mask=self_attn_padding_mask,
265
- need_weights=need_weights,
266
- attn_mask=self_attn_mask,
267
- position_bias=pos_bias)
268
-
269
- x = self.dropout1(x)
270
- x = residual * self.deep_norm_alpha + x
271
-
272
- x = self.self_attn_layer_norm(x)
273
-
274
- residual = x
275
- if self.activation_name == "glu":
276
- x = self.fc1(x)
277
- else:
278
- x = self.activation_fn(self.fc1(x))
279
- x = self.dropout2(x)
280
- x = self.fc2(x)
281
- x = self.dropout3(x)
282
- x = residual * self.deep_norm_alpha + x
283
- x = self.final_layer_norm(x)
284
-
285
- return x, attn, pos_bias
286
-
287
-
288
- class MultiheadAttention(nn.Module):
289
- """Multi-headed attention.
290
-
291
- See "Attention Is All You Need" for more details.
292
- """
293
-
294
- def __init__(
295
- self,
296
- embed_dim,
297
- num_heads,
298
- kdim=None,
299
- vdim=None,
300
- dropout=0.0,
301
- bias=True,
302
- add_bias_kv=False,
303
- add_zero_attn=False,
304
- self_attention=False,
305
- encoder_decoder_attention=False,
306
- q_noise=0.0,
307
- qn_block_size=8,
308
- has_relative_attention_bias=False,
309
- num_buckets=32,
310
- max_distance=128,
311
- gru_rel_pos=False,
312
- rescale_init=False, ):
313
- super().__init__()
314
- self.embed_dim = embed_dim
315
- self.kdim = kdim if kdim is not None else embed_dim
316
- self.vdim = vdim if vdim is not None else embed_dim
317
- self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
318
-
319
- self.num_heads = num_heads
320
- self.dropout_module = nn.Dropout(dropout)
321
-
322
- self.has_relative_attention_bias = has_relative_attention_bias
323
- self.num_buckets = num_buckets
324
- self.max_distance = max_distance
325
- if self.has_relative_attention_bias:
326
- self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)
327
-
328
- self.head_dim = embed_dim // num_heads
329
- self.q_head_dim = self.head_dim
330
- self.k_head_dim = self.head_dim
331
- assert (self.head_dim * num_heads == self.embed_dim
332
- ), "embed_dim must be divisible by num_heads"
333
- self.scaling = self.head_dim**-0.5
334
-
335
- self.self_attention = self_attention
336
- self.encoder_decoder_attention = encoder_decoder_attention
337
-
338
- assert not self.self_attention or self.qkv_same_dim, (
339
- "Self-attention requires query, key and "
340
- "value to be of the same size")
341
-
342
- k_bias = True
343
- if rescale_init:
344
- k_bias = False
345
-
346
- k_embed_dim = embed_dim
347
- q_embed_dim = embed_dim
348
-
349
- self.k_proj = quant_noise(
350
- nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise,
351
- qn_block_size)
352
- self.v_proj = quant_noise(
353
- nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size)
354
- self.q_proj = quant_noise(
355
- nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise,
356
- qn_block_size)
357
-
358
- self.out_proj = quant_noise(
359
- nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
360
-
361
- if add_bias_kv:
362
- self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
363
- self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
364
- else:
365
- self.bias_k = self.bias_v = None
366
-
367
- self.add_zero_attn = add_zero_attn
368
-
369
- self.gru_rel_pos = gru_rel_pos
370
- if self.gru_rel_pos:
371
- self.grep_linear = nn.Linear(self.q_head_dim, 8)
372
- self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))
373
-
374
- self.reset_parameters()
375
-
376
- def reset_parameters(self):
377
- if self.qkv_same_dim:
378
- # Empirically observed the convergence to be much better with
379
- # the scaled initialization
380
- nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
381
- nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
382
- nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
383
- else:
384
- nn.init.xavier_uniform_(self.k_proj.weight)
385
- nn.init.xavier_uniform_(self.v_proj.weight)
386
- nn.init.xavier_uniform_(self.q_proj.weight)
387
-
388
- nn.init.xavier_uniform_(self.out_proj.weight)
389
- if self.out_proj.bias is not None:
390
- nn.init.constant_(self.out_proj.bias, 0.0)
391
- if self.bias_k is not None:
392
- nn.init.xavier_normal_(self.bias_k)
393
- if self.bias_v is not None:
394
- nn.init.xavier_normal_(self.bias_v)
395
- if self.has_relative_attention_bias:
396
- nn.init.xavier_normal_(self.relative_attention_bias.weight)
397
-
398
- def _relative_positions_bucket(self, relative_positions,
399
- bidirectional=True):
400
- num_buckets = self.num_buckets
401
- max_distance = self.max_distance
402
- relative_buckets = 0
403
-
404
- if bidirectional:
405
- num_buckets = num_buckets // 2
406
- relative_buckets += (
407
- relative_positions > 0).to(torch.long) * num_buckets
408
- relative_positions = torch.abs(relative_positions)
409
- else:
410
- relative_positions = -torch.min(
411
- relative_positions, torch.zeros_like(relative_positions))
412
-
413
- max_exact = num_buckets // 2
414
- is_small = relative_positions < max_exact
415
-
416
- relative_postion_if_large = max_exact + (
417
- torch.log(relative_positions.float() / max_exact) / math.log(
418
- max_distance / max_exact) *
419
- (num_buckets - max_exact)).to(torch.long)
420
- relative_postion_if_large = torch.min(
421
- relative_postion_if_large,
422
- torch.full_like(relative_postion_if_large, num_buckets - 1))
423
-
424
- relative_buckets += torch.where(is_small, relative_positions,
425
- relative_postion_if_large)
426
- return relative_buckets
427
-
428
- def compute_bias(self, query_length, key_length):
429
- context_position = torch.arange(query_length, dtype=torch.long)[:, None]
430
- memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
431
- relative_position = memory_position - context_position
432
- relative_position_bucket = self._relative_positions_bucket(
433
- relative_position, bidirectional=True)
434
- relative_position_bucket = relative_position_bucket.to(
435
- self.relative_attention_bias.weight.device)
436
- values = self.relative_attention_bias(relative_position_bucket)
437
- values = values.permute([2, 0, 1])
438
- return values
439
-
440
- def forward(self,
441
- query,
442
- key: Optional[Tensor],
443
- value: Optional[Tensor],
444
- key_padding_mask: Optional[Tensor]=None,
445
- incremental_state: Optional[Dict[str, Dict[str, Optional[
446
- Tensor]]]]=None,
447
- need_weights: bool=True,
448
- static_kv: bool=False,
449
- attn_mask: Optional[Tensor]=None,
450
- before_softmax: bool=False,
451
- need_head_weights: bool=False,
452
- position_bias: Optional[Tensor]=None
453
- ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
454
- """Input shape: Time x Batch x Channel
455
-
456
- Args:
457
- key_padding_mask (ByteTensor, optional): mask to exclude
458
- keys that are pads, of shape `(batch, src_len)`, where
459
- padding elements are indicated by 1s.
460
- need_weights (bool, optional): return the attention weights,
461
- averaged over heads (default: False).
462
- attn_mask (ByteTensor, optional): typically used to
463
- implement causal attention, where the mask prevents the
464
- attention from looking forward in time (default: None).
465
- before_softmax (bool, optional): return the raw attention
466
- weights and values before the attention softmax.
467
- need_head_weights (bool, optional): return the attention
468
- weights for each head. Implies *need_weights*. Default:
469
- return the average attention weights over all heads.
470
- """
471
- if need_head_weights:
472
- need_weights = True
473
-
474
- is_tpu = query.device.type == "xla"
475
-
476
- tgt_len, bsz, embed_dim = query.size()
477
- src_len = tgt_len
478
- assert embed_dim == self.embed_dim
479
- assert list(query.size()) == [tgt_len, bsz, embed_dim]
480
- if key is not None:
481
- src_len, key_bsz, _ = key.size()
482
- if not torch.jit.is_scripting():
483
- assert key_bsz == bsz
484
- assert value is not None
485
- assert (src_len, bsz) == value.shape[:2]
486
-
487
- if self.has_relative_attention_bias and position_bias is None:
488
- position_bias = self.compute_bias(tgt_len, src_len)
489
- position_bias = position_bias.unsqueeze(0).repeat(
490
- bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)
491
-
492
- if incremental_state is not None:
493
- saved_state = self._get_input_buffer(incremental_state)
494
- if saved_state is not None and "prev_key" in saved_state:
495
- # previous time steps are cached - no need to recompute
496
- # key and value if they are static
497
- if static_kv:
498
- assert self.encoder_decoder_attention and not self.self_attention
499
- key = value = None
500
- else:
501
- saved_state = None
502
-
503
- if self.self_attention:
504
- q = self.q_proj(query)
505
- k = self.k_proj(query)
506
- v = self.v_proj(query)
507
- elif self.encoder_decoder_attention:
508
- # encoder-decoder attention
509
- q = self.q_proj(query)
510
- if key is None:
511
- assert value is None
512
- k = v = None
513
- else:
514
- k = self.k_proj(key)
515
- v = self.v_proj(key)
516
-
517
- else:
518
- assert key is not None and value is not None
519
- q = self.q_proj(query)
520
- k = self.k_proj(key)
521
- v = self.v_proj(value)
522
- q *= self.scaling
523
- alpha = 32
524
- q *= 1 / alpha
525
-
526
- if self.bias_k is not None:
527
- assert self.bias_v is not None
528
- k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
529
- v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
530
- if attn_mask is not None:
531
- attn_mask = torch.cat(
532
- [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
533
- dim=1)
534
- if key_padding_mask is not None:
535
- key_padding_mask = torch.cat(
536
- [
537
- key_padding_mask,
538
- key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
539
- ],
540
- dim=1, )
541
-
542
- q = (q.contiguous().view(tgt_len, bsz * self.num_heads, self.q_head_dim)
543
- .transpose(0, 1))
544
- if k is not None:
545
- k = (k.contiguous().view(-1, bsz * self.num_heads, self.k_head_dim)
546
- .transpose(0, 1))
547
- if v is not None:
548
- v = (v.contiguous().view(-1, bsz * self.num_heads, self.head_dim)
549
- .transpose(0, 1))
550
-
551
- if saved_state is not None:
552
- # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
553
- if "prev_key" in saved_state:
554
- _prev_key = saved_state["prev_key"]
555
- assert _prev_key is not None
556
- prev_key = _prev_key.view(bsz * self.num_heads, -1,
557
- self.head_dim)
558
- if static_kv:
559
- k = prev_key
560
- else:
561
- assert k is not None
562
- k = torch.cat([prev_key, k], dim=1)
563
- src_len = k.size(1)
564
- if "prev_value" in saved_state:
565
- _prev_value = saved_state["prev_value"]
566
- assert _prev_value is not None
567
- prev_value = _prev_value.view(bsz * self.num_heads, -1,
568
- self.head_dim)
569
- if static_kv:
570
- v = prev_value
571
- else:
572
- assert v is not None
573
- v = torch.cat([prev_value, v], dim=1)
574
- prev_key_padding_mask: Optional[Tensor] = None
575
- if "prev_key_padding_mask" in saved_state:
576
- prev_key_padding_mask = saved_state["prev_key_padding_mask"]
577
- assert k is not None and v is not None
578
- key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
579
- key_padding_mask=key_padding_mask,
580
- prev_key_padding_mask=prev_key_padding_mask,
581
- batch_size=bsz,
582
- src_len=k.size(1),
583
- static_kv=static_kv, )
584
-
585
- saved_state["prev_key"] = k.view(bsz, self.num_heads, -1,
586
- self.head_dim)
587
- saved_state["prev_value"] = v.view(bsz, self.num_heads, -1,
588
- self.head_dim)
589
- saved_state["prev_key_padding_mask"] = key_padding_mask
590
- # In this branch incremental_state is never None
591
- assert incremental_state is not None
592
- incremental_state = self._set_input_buffer(incremental_state,
593
- saved_state)
594
- assert k is not None
595
- assert k.size(1) == src_len
596
-
597
- # This is part of a workaround to get around fork/join parallelism
598
- # not supporting Optional types.
599
- if key_padding_mask is not None and key_padding_mask.dim() == 0:
600
- key_padding_mask = None
601
-
602
- if key_padding_mask is not None:
603
- assert key_padding_mask.size(0) == bsz
604
- assert key_padding_mask.size(1) == src_len
605
-
606
- if self.add_zero_attn:
607
- assert v is not None
608
- src_len += 1
609
- k = torch.cat(
610
- [k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
611
- v = torch.cat(
612
- [v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
613
- if attn_mask is not None:
614
- attn_mask = torch.cat(
615
- [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
616
- dim=1)
617
- if key_padding_mask is not None:
618
- key_padding_mask = torch.cat(
619
- [
620
- key_padding_mask,
621
- torch.zeros(key_padding_mask.size(0),
622
- 1).type_as(key_padding_mask),
623
- ],
624
- dim=1, )
625
-
626
- attn_weights = torch.bmm(q, k.transpose(1, 2))
627
- attn_weights = (
628
- attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
629
- attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
630
- bsz)
631
-
632
- assert list(
633
- attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
634
-
635
- if attn_mask is not None:
636
- attn_mask = attn_mask.unsqueeze(0)
637
- attn_weights += attn_mask
638
-
639
- if key_padding_mask is not None:
640
- # don't attend to padding symbols
641
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
642
- src_len)
643
- if not is_tpu:
644
- attn_weights = attn_weights.masked_fill(
645
- key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
646
- float("-inf"), )
647
- else:
648
- attn_weights = attn_weights.transpose(0, 2)
649
- attn_weights = attn_weights.masked_fill(key_padding_mask,
650
- float("-inf"))
651
- attn_weights = attn_weights.transpose(0, 2)
652
- attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
653
- src_len)
654
-
655
- if before_softmax:
656
- return attn_weights, v, position_bias
657
-
658
- if position_bias is not None:
659
- attn_mask_rel_pos = position_bias
660
- if self.gru_rel_pos == 1:
661
- query_layer = q.view(bsz, self.num_heads, tgt_len,
662
- self.q_head_dim) * alpha / self.scaling
663
- _B, _H, _L, __ = query_layer.size()
664
- gate_a, gate_b = torch.sigmoid(
665
- self.grep_linear(query_layer).view(_B, _H, _L, 2, 4).sum(
666
- -1, keepdim=False)).chunk(
667
- 2, dim=-1)
668
- gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
669
- attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len,
670
- 1) * position_bias
671
-
672
- attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())
673
-
674
- attn_weights = attn_weights + attn_mask_rel_pos
675
-
676
- attn_weights_float = F.softmax(attn_weights, dim=-1)
677
- attn_weights = attn_weights_float.type_as(attn_weights)
678
- attn_probs = self.dropout_module(attn_weights)
679
-
680
- assert v is not None
681
- attn = torch.bmm(attn_probs, v)
682
- assert list(
683
- attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
684
- attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
685
- attn = self.out_proj(attn)
686
- attn_weights: Optional[Tensor] = None
687
- if need_weights:
688
- attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len,
689
- src_len).transpose(1, 0)
690
- if not need_head_weights:
691
- # average attention weights over heads
692
- attn_weights = attn_weights.mean(dim=0)
693
-
694
- return attn, attn_weights, position_bias
695
-
696
- @staticmethod
697
- def _append_prev_key_padding_mask(
698
- key_padding_mask: Optional[Tensor],
699
- prev_key_padding_mask: Optional[Tensor],
700
- batch_size: int,
701
- src_len: int,
702
- static_kv: bool, ) -> Optional[Tensor]:
703
- # saved key padding masks have shape (bsz, seq_len)
704
- if prev_key_padding_mask is not None and static_kv:
705
- new_key_padding_mask = prev_key_padding_mask
706
- elif prev_key_padding_mask is not None and key_padding_mask is not None:
707
- new_key_padding_mask = torch.cat(
708
- [prev_key_padding_mask.float(), key_padding_mask.float()],
709
- dim=1)
710
- # During incremental decoding, as the padding token enters and
711
- # leaves the frame, there will be a time when prev or current
712
- # is None
713
- elif prev_key_padding_mask is not None:
714
- if src_len > prev_key_padding_mask.size(1):
715
- filler = torch.zeros(
716
- (batch_size, src_len - prev_key_padding_mask.size(1)),
717
- device=prev_key_padding_mask.device, )
718
- new_key_padding_mask = torch.cat(
719
- [prev_key_padding_mask.float(), filler.float()], dim=1)
720
- else:
721
- new_key_padding_mask = prev_key_padding_mask.float()
722
- elif key_padding_mask is not None:
723
- if src_len > key_padding_mask.size(1):
724
- filler = torch.zeros(
725
- (batch_size, src_len - key_padding_mask.size(1)),
726
- device=key_padding_mask.device, )
727
- new_key_padding_mask = torch.cat(
728
- [filler.float(), key_padding_mask.float()], dim=1)
729
- else:
730
- new_key_padding_mask = key_padding_mask.float()
731
- else:
732
- new_key_padding_mask = prev_key_padding_mask
733
- return new_key_padding_mask
734
-
735
- def _get_input_buffer(
736
- self,
737
- incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
738
- ) -> Dict[str, Optional[Tensor]]:
739
- result = self.get_incremental_state(incremental_state, "attn_state")
740
- if result is not None:
741
- return result
742
- else:
743
- empty_result: Dict[str, Optional[Tensor]] = {}
744
- return empty_result
745
-
746
- def _set_input_buffer(
747
- self,
748
- incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
749
- buffer: Dict[str, Optional[Tensor]], ):
750
- return self.set_incremental_state(incremental_state, "attn_state",
751
- buffer)
752
-
753
- def apply_sparse_mask(self,
754
- attn_weights,
755
- tgt_len: int,
756
- src_len: int,
757
- bsz: int):
758
- return attn_weights
759
-
760
-
761
- def init_bert_params(module):
762
- """
763
- Initialize the weights specific to the BERT Model.
764
- This overrides the default initializations depending on the specified arguments.
765
- 1. If normal_init_linear_weights is set then weights of linear
766
- layer will be initialized using the normal distribution and
767
- bias will be set to the specified value.
768
- 2. If normal_init_embed_weights is set then weights of embedding
769
- layer will be initialized using the normal distribution.
770
- 3. If normal_init_proj_weights is set then weights of
771
- in_project_weight for MultiHeadAttention initialized using
772
- the normal distribution (to be validated).
773
- """
774
-
775
- def normal_(data):
776
- # with FSDP, module params will be on CUDA, so we cast them back to CPU
777
- # so that the RNG is consistent with and without FSDP
778
- data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
779
-
780
- if isinstance(module, nn.Linear):
781
- normal_(module.weight.data)
782
- if module.bias is not None:
783
- module.bias.data.zero_()
784
- if isinstance(module, nn.Embedding):
785
- normal_(module.weight.data)
786
- if module.padding_idx is not None:
787
- module.weight.data[module.padding_idx].zero_()
788
- if isinstance(module, MultiheadAttention):
789
- normal_(module.q_proj.weight.data)
790
- normal_(module.k_proj.weight.data)
791
- normal_(module.v_proj.weight.data)
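The _relative_positions_bucket / compute_bias pair above implements a T5-style relative position bias: offsets are split into sign-dependent halves, small offsets get exact buckets, and larger offsets are binned logarithmically up to max_distance. A minimal standalone sketch of that bucketing; num_buckets=320 and max_distance=800 are assumed values, since the real ones come from the module's constructor, which is not shown in this hunk:

import math
import torch

def relative_position_bucket(relative_positions, num_buckets=320, max_distance=800):
    # bidirectional: half the buckets for positive offsets, half for non-positive ones
    num_buckets //= 2
    buckets = (relative_positions > 0).to(torch.long) * num_buckets
    n = torch.abs(relative_positions)
    max_exact = num_buckets // 2
    is_small = n < max_exact
    # distant offsets are mapped logarithmically and capped at the last bucket
    large = max_exact + (torch.log(n.float() / max_exact) /
                         math.log(max_distance / max_exact) *
                         (num_buckets - max_exact)).to(torch.long)
    large = torch.min(large, torch.full_like(large, num_buckets - 1))
    return buckets + torch.where(is_small, n, large)

context = torch.arange(4)[:, None]
memory = torch.arange(4)[None, :]
print(relative_position_bucket(memory - context))  # (query_len, key_len) bucket ids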
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/config.py DELETED
@@ -1,19 +0,0 @@
1
- import json
2
- import os
3
-
4
- # directory containing the current script
5
- script_dir = os.path.dirname(os.path.abspath(__file__))
6
-
7
- # filename of the JSON file
8
- json_filename = "ontology.json"
9
-
10
- # build the full path to the JSON file
11
- json_path = os.path.join(script_dir, json_filename)
12
-
13
- id_name_dict = {}
14
-
15
- with open(json_path, 'r') as f:
16
- json_items = json.load(f)
17
- # e.g. '/m/0dgw9r' -> 'Human sounds'
18
- for item in json_items:
19
- id_name_dict[item['id']] = item['name']
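A short consumption sketch for id_name_dict, as used by get_beats_librilight.py further down: it maps AudioSet ontology ids to human-readable names (the example id and name are taken from the comment above; the import assumes AR/exps/beats/config.py is importable):

from AR.exps.beats.config import id_name_dict

# the audio-tagging script below uses this mapping to turn BEATs label ids into tag names
print(id_name_dict.get('/m/0dgw9r', 'unknown id'))  # expected: 'Human sounds'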
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/modules.py DELETED
@@ -1,220 +0,0 @@
1
- # --------------------------------------------------------
2
- # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
- # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
- # Copyright (c) 2022 Microsoft
5
- # Licensed under The MIT License [see LICENSE for details]
6
- # Based on fairseq code bases
7
- # https://github.com/pytorch/fairseq
8
- # --------------------------------------------------------
9
- import math
10
- import warnings
11
-
12
- import torch
13
- import torch.nn.functional as F
14
- from torch import nn
15
-
16
-
17
- class GradMultiply(torch.autograd.Function):
18
- @staticmethod
19
- def forward(ctx, x, scale):
20
- ctx.scale = scale
21
- res = x.new(x)
22
- return res
23
-
24
- @staticmethod
25
- def backward(ctx, grad):
26
- return grad * ctx.scale, None
27
-
28
-
29
- class SamePad(nn.Module):
30
- def __init__(self, kernel_size, causal=False):
31
- super().__init__()
32
- if causal:
33
- self.remove = kernel_size - 1
34
- else:
35
- self.remove = 1 if kernel_size % 2 == 0 else 0
36
-
37
- def forward(self, x):
38
- if self.remove > 0:
39
- x = x[:, :, :-self.remove]
40
- return x
41
-
42
-
43
- class Swish(nn.Module):
44
- def __init__(self):
45
- super(Swish, self).__init__()
46
- self.act = torch.nn.Sigmoid()
47
-
48
- def forward(self, x):
49
- return x * self.act(x)
50
-
51
-
52
- class GLU_Linear(nn.Module):
53
- def __init__(self,
54
- input_dim,
55
- output_dim,
56
- glu_type="sigmoid",
57
- bias_in_glu=True):
58
- super(GLU_Linear, self).__init__()
59
-
60
- self.glu_type = glu_type
61
- self.output_dim = output_dim
62
-
63
- if glu_type == "sigmoid":
64
- self.glu_act = torch.nn.Sigmoid()
65
- elif glu_type == "swish":
66
- self.glu_act = Swish()
67
- elif glu_type == "relu":
68
- self.glu_act = torch.nn.ReLU()
69
- elif glu_type == "gelu":
70
- self.glu_act = torch.nn.GELU()
71
-
72
- if bias_in_glu:
73
- self.linear = nn.Linear(input_dim, output_dim * 2, True)
74
- else:
75
- self.linear = nn.Linear(input_dim, output_dim * 2, False)
76
-
77
- def forward(self, x):
78
- # to be consistent with GLU_Linear, we assume the input always has the #channel (#dim) in the last dimension of the tensor, so need to switch the dimension first for 1D-Conv case
79
- x = self.linear(x)
80
-
81
- if self.glu_type == "bilinear":
82
- x = (x[:, :, 0:self.output_dim] *
83
- x[:, :, self.output_dim:self.output_dim * 2])
84
- else:
85
- x = (x[:, :, 0:self.output_dim] *
86
- self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
87
-
88
- return x
89
-
90
-
91
- def gelu_accurate(x):
92
- if not hasattr(gelu_accurate, "_a"):
93
- gelu_accurate._a = math.sqrt(2 / math.pi)
94
- return (0.5 * x * (1 + torch.tanh(gelu_accurate._a *
95
- (x + 0.044715 * torch.pow(x, 3)))))
96
-
97
-
98
- def gelu(x: torch.Tensor) -> torch.Tensor:
99
- return torch.nn.functional.gelu(x.float()).type_as(x)
100
-
101
-
102
- def get_activation_fn(activation: str):
103
- """Returns the activation function corresponding to `activation`"""
104
-
105
- if activation == "relu":
106
- return F.relu
107
- elif activation == "gelu":
108
- return gelu
109
- elif activation == "gelu_fast":
110
- warnings.warn(
111
- "--activation-fn=gelu_fast has been renamed to gelu_accurate")
112
- return gelu_accurate
113
- elif activation == "gelu_accurate":
114
- return gelu_accurate
115
- elif activation == "tanh":
116
- return torch.tanh
117
- elif activation == "linear":
118
- return lambda x: x
119
- elif activation == "glu":
120
- return lambda x: x
121
- else:
122
- raise RuntimeError(
123
- "--activation-fn {} not supported".format(activation))
124
-
125
-
126
- def quant_noise(module, p, block_size):
127
- """
128
- Wraps modules and applies quantization noise to the weights for
129
- subsequent quantization with Iterative Product Quantization as
130
- described in "Training with Quantization Noise for Extreme Model Compression"
131
-
132
- Args:
133
- - module: nn.Module
134
- - p: amount of Quantization Noise
135
- - block_size: size of the blocks for subsequent quantization with iPQ
136
-
137
- Remarks:
138
- - Module weights must have the right sizes wrt the block size
139
- - Only Linear, Embedding and Conv2d modules are supported for the moment
140
- - For more detail on how to quantize by blocks with convolutional weights,
141
- see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
142
- - We implement the simplest form of noise here as stated in the paper
143
- which consists in randomly dropping blocks
144
- """
145
-
146
- # if no quantization noise, don't register hook
147
- if p <= 0:
148
- return module
149
-
150
- # supported modules
151
- assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
152
-
153
- # test whether module.weight has the right sizes wrt block_size
154
- is_conv = module.weight.ndim == 4
155
-
156
- # 2D matrix
157
- if not is_conv:
158
- assert (
159
- module.weight.size(1) %
160
- block_size == 0), "Input features must be a multiple of block sizes"
161
-
162
- # 4D matrix
163
- else:
164
- # 1x1 convolutions
165
- if module.kernel_size == (1, 1):
166
- assert (module.in_channels % block_size == 0
167
- ), "Input channels must be a multiple of block sizes"
168
- # regular convolutions
169
- else:
170
- k = module.kernel_size[0] * module.kernel_size[1]
171
- assert k % block_size == 0, "Kernel size must be a multiple of block size"
172
-
173
- def _forward_pre_hook(mod, input):
174
- # no noise for evaluation
175
- if mod.training:
176
- if not is_conv:
177
- # gather weight and sizes
178
- weight = mod.weight
179
- in_features = weight.size(1)
180
- out_features = weight.size(0)
181
-
182
- # split weight matrix into blocks and randomly drop selected blocks
183
- mask = torch.zeros(
184
- in_features // block_size * out_features,
185
- device=weight.device)
186
- mask.bernoulli_(p)
187
- mask = mask.repeat_interleave(block_size, -1).view(-1,
188
- in_features)
189
-
190
- else:
191
- # gather weight and sizes
192
- weight = mod.weight
193
- in_channels = mod.in_channels
194
- out_channels = mod.out_channels
195
-
196
- # split weight matrix into blocks and randomly drop selected blocks
197
- if mod.kernel_size == (1, 1):
198
- mask = torch.zeros(
199
- int(in_channels // block_size * out_channels),
200
- device=weight.device, )
201
- mask.bernoulli_(p)
202
- mask = mask.repeat_interleave(block_size, -1).view(
203
- -1, in_channels)
204
- else:
205
- mask = torch.zeros(
206
- weight.size(0), weight.size(1), device=weight.device)
207
- mask.bernoulli_(p)
208
- mask = (
209
- mask.unsqueeze(2).unsqueeze(3)
210
- .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]))
211
-
212
- # scale weights and apply mask
213
- mask = mask.to(
214
- torch.
215
- bool) # x.bool() is not currently supported in TorchScript
216
- s = 1 / (1 - p)
217
- mod.weight.data = s * weight.masked_fill(mask, 0)
218
-
219
- module.register_forward_pre_hook(_forward_pre_hook)
220
- return module
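quant_noise above registers a forward pre-hook that, in training mode, zeroes random weight blocks and rescales the remainder by 1 / (1 - p), as its docstring describes. A minimal usage sketch; layer sizes are illustrative and it assumes AR/exps/beats/modules.py is importable:

import torch
from torch import nn
from AR.exps.beats.modules import quant_noise

# in_features (64) must be a multiple of block_size (8) for a Linear layer
layer = quant_noise(nn.Linear(64, 64), p=0.1, block_size=8)
layer.train()
y_noisy = layer(torch.randn(2, 64))   # random weight blocks dropped on this forward pass
layer.eval()
y_clean = layer(torch.randn(2, 64))   # no noise is applied in evaluation mode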
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/ontology.json DELETED
The diff for this file is too large to render. See raw diff
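config.py above only reads the 'id' and 'name' fields of each ontology entry, so for the purposes of this repo one item of the (assumed AudioSet) ontology reduces to the following shape:

# assumed minimal shape of one ontology.json item, inferred from how config.py indexes it;
# the real ontology entries carry additional fields (description, child_ids, ...)
example_item = {"id": "/m/0dgw9r", "name": "Human sounds"}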
 
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/beats/quantizer.py DELETED
@@ -1,235 +0,0 @@
1
- # --------------------------------------------------------
2
- # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
- # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
- # Copyright (c) 2022 Microsoft
5
- # Licensed under The MIT License [see LICENSE for details]
6
- # Based on VQGAN code bases
7
- # https://github.com/CompVis/taming-transformers
8
- # --------------------------------------------------------'
9
- import torch
10
- import torch.distributed as distributed
11
- import torch.nn as nn
12
- import torch.nn.functional as F
13
-
14
- try:
15
- from einops import rearrange, repeat
16
- except ImportError:
17
- pass
18
-
19
-
20
- def l2norm(t):
21
- return F.normalize(t, p=2, dim=-1)
22
-
23
-
24
- def ema_inplace(moving_avg, new, decay):
25
- moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
26
-
27
-
28
- def sample_vectors(samples, num):
29
- num_samples, device = samples.shape[0], samples.device
30
-
31
- if num_samples >= num:
32
- indices = torch.randperm(num_samples, device=device)[:num]
33
- else:
34
- indices = torch.randint(0, num_samples, (num, ), device=device)
35
-
36
- return samples[indices]
37
-
38
-
39
- def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False):
40
- dim, dtype, device = samples.shape[-1], samples.dtype, samples.device
41
-
42
- means = sample_vectors(samples, num_clusters)
43
-
44
- for _ in range(num_iters):
45
- if use_cosine_sim:
46
- dists = samples @ means.t()
47
- else:
48
- diffs = rearrange(samples, 'n d -> n () d') \
49
- - rearrange(means, 'c d -> () c d')
50
- dists = -(diffs**2).sum(dim=-1)
51
-
52
- buckets = dists.max(dim=-1).indices
53
- bins = torch.bincount(buckets, minlength=num_clusters)
54
- zero_mask = bins == 0
55
- bins_min_clamped = bins.masked_fill(zero_mask, 1)
56
-
57
- new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
58
- new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d=dim), samples)
59
- new_means = new_means / bins_min_clamped[..., None]
60
-
61
- if use_cosine_sim:
62
- new_means = l2norm(new_means)
63
-
64
- means = torch.where(zero_mask[..., None], means, new_means)
65
-
66
- return means, bins
67
-
68
-
69
- class EmbeddingEMA(nn.Module):
70
- def __init__(self,
71
- num_tokens,
72
- codebook_dim,
73
- decay=0.99,
74
- eps=1e-5,
75
- kmeans_init=True,
76
- codebook_init_path=''):
77
- super().__init__()
78
- self.num_tokens = num_tokens
79
- self.codebook_dim = codebook_dim
80
- self.decay = decay
81
- self.eps = eps
82
- if codebook_init_path == '':
83
- if not kmeans_init:
84
- weight = torch.randn(num_tokens, codebook_dim)
85
- weight = l2norm(weight)
86
- else:
87
- weight = torch.zeros(num_tokens, codebook_dim)
88
- self.register_buffer('initted', torch.Tensor([not kmeans_init]))
89
- else:
90
- print(f"load init codebook weight from {codebook_init_path}")
91
- codebook_ckpt_weight = torch.load(
92
- codebook_init_path, map_location='cpu')
93
- weight = codebook_ckpt_weight.clone()
94
- self.register_buffer('initted', torch.Tensor([True]))
95
-
96
- self.weight = nn.Parameter(weight, requires_grad=False)
97
- self.cluster_size = nn.Parameter(
98
- torch.zeros(num_tokens), requires_grad=False)
99
- self.embed_avg = nn.Parameter(weight.clone(), requires_grad=False)
100
- # self.register_buffer('initted', torch.Tensor([not kmeans_init]))
101
- self.update = True
102
-
103
- @torch.jit.ignore
104
- def init_embed_(self, data):
105
- if self.initted:
106
- return
107
- print("Performing Kemans init for codebook")
108
- embed, cluster_size = kmeans(
109
- data, self.num_tokens, 10, use_cosine_sim=True)
110
- self.weight.data.copy_(embed)
111
- self.cluster_size.data.copy_(cluster_size)
112
- self.initted.data.copy_(torch.Tensor([True]))
113
-
114
- def forward(self, embed_id):
115
- return F.embedding(embed_id, self.weight)
116
-
117
- def cluster_size_ema_update(self, new_cluster_size):
118
- self.cluster_size.data.mul_(self.decay).add_(
119
- new_cluster_size, alpha=1 - self.decay)
120
-
121
- def embed_avg_ema_update(self, new_embed_avg):
122
- self.embed_avg.data.mul_(self.decay).add_(
123
- new_embed_avg, alpha=1 - self.decay)
124
-
125
- def weight_update(self, num_tokens):
126
- n = self.cluster_size.sum()
127
- smoothed_cluster_size = (
128
- (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n)
129
- # normalize embedding average with smoothed cluster size
130
- embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1)
131
- # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1))
132
- self.weight.data.copy_(embed_normalized)
133
-
134
-
135
- def norm_ema_inplace(moving_avg, new, decay):
136
- moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
137
- moving_avg.data.copy_(l2norm(moving_avg.data))
138
-
139
-
140
- class NormEMAVectorQuantizer(nn.Module):
141
- def __init__(self,
142
- n_embed,
143
- embedding_dim,
144
- beta,
145
- decay=0.99,
146
- eps=1e-5,
147
- statistic_code_usage=True,
148
- kmeans_init=False,
149
- codebook_init_path=''):
150
- super().__init__()
151
- self.codebook_dim = embedding_dim
152
- self.num_tokens = n_embed
153
- self.beta = beta
154
- self.decay = decay
155
-
156
- # learnable = True if orthogonal_reg_weight > 0 else False
157
- self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay,
158
- eps, kmeans_init, codebook_init_path)
159
-
160
- self.statistic_code_usage = statistic_code_usage
161
- if statistic_code_usage:
162
- self.register_buffer('cluster_size', torch.zeros(n_embed))
163
- if distributed.is_available() and distributed.is_initialized():
164
- print(
165
- "ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!"
166
- )
167
- self.all_reduce_fn = distributed.all_reduce
168
- else:
169
- self.all_reduce_fn = nn.Identity()
170
-
171
- def reset_cluster_size(self, device):
172
- if self.statistic_code_usage:
173
- self.register_buffer('cluster_size', torch.zeros(self.num_tokens))
174
- self.cluster_size = self.cluster_size.to(device)
175
-
176
- def forward(self, z):
177
- # reshape z -> (batch, height, width, channel) and flatten
178
- # z, 'b c h w -> b h w c'
179
- # z = rearrange(z, 'b c h w -> b h w c')
180
- # z = z.transpose(1, 2)
181
- z = l2norm(z)
182
- z_flattened = z.reshape(-1, self.codebook_dim)
183
-
184
- self.embedding.init_embed_(z_flattened)
185
-
186
- d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
187
- self.embedding.weight.pow(2).sum(dim=1) - 2 * \
188
- torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n'
189
-
190
- encoding_indices = torch.argmin(d, dim=1)
191
-
192
- z_q = self.embedding(encoding_indices).view(z.shape)
193
-
194
- encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
195
-
196
- if not self.training:
197
- with torch.no_grad():
198
- cluster_size = encodings.sum(0)
199
- self.all_reduce_fn(cluster_size)
200
- ema_inplace(self.cluster_size, cluster_size, self.decay)
201
-
202
- if self.training and self.embedding.update:
203
- # EMA cluster size
204
-
205
- bins = encodings.sum(0)
206
- self.all_reduce_fn(bins)
207
-
208
- # self.embedding.cluster_size_ema_update(bins)
209
- ema_inplace(self.cluster_size, bins, self.decay)
210
-
211
- zero_mask = (bins == 0)
212
- bins = bins.masked_fill(zero_mask, 1.)
213
-
214
- embed_sum = z_flattened.t() @ encodings
215
- self.all_reduce_fn(embed_sum)
216
-
217
- embed_normalized = (embed_sum / bins.unsqueeze(0)).t()
218
- embed_normalized = l2norm(embed_normalized)
219
-
220
- embed_normalized = torch.where(
221
- zero_mask[..., None], self.embedding.weight, embed_normalized)
222
- norm_ema_inplace(self.embedding.weight, embed_normalized,
223
- self.decay)
224
-
225
- # compute loss for embedding
226
- loss = self.beta * F.mse_loss(z_q.detach(), z)
227
-
228
- # preserve gradients
229
- z_q = z + (z_q - z).detach()
230
-
231
- # reshape back to match original input shape
232
- # z_q, 'b h w c -> b c h w'
233
- # z_q = rearrange(z_q, 'b h w c -> b c h w')
234
- # z_q = z_q.transpose(1, 2)
235
- return z_q, loss, encoding_indices
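NormEMAVectorQuantizer above snaps L2-normalised inputs to the nearest codebook entry and maintains the codebook with EMA updates rather than gradients; only the beta-weighted commitment loss backpropagates. A usage sketch; sizes are illustrative, kmeans_init=False keeps the optional einops dependency out of the picture, and it assumes AR/exps/beats/quantizer.py is importable:

import torch
from AR.exps.beats.quantizer import NormEMAVectorQuantizer

quantizer = NormEMAVectorQuantizer(n_embed=1024, embedding_dim=256, beta=1.0,
                                   decay=0.99, kmeans_init=False)
quantizer.train()
z = torch.randn(400, 256)           # flattened frame embeddings, (N, embedding_dim)
z_q, loss, codes = quantizer(z)     # quantised vectors, commitment loss, code indices
print(z_q.shape, loss.item(), codes.shape)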
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_beats_librilight.py DELETED
@@ -1,321 +0,0 @@
1
- # Use the AudioTag tool BEATs to filter out audios whose top1 tag is not 'speech'
2
- # non_speech.npy stores a python dict of tags for non-speech audio; it is smaller and therefore faster to load and search
3
- # the audio_tag directory stores {utt_id}.txt, whose first line is the lowercase top1 tag
4
- import argparse
5
- import os
6
- import time
7
- import traceback
8
- from concurrent.futures import ThreadPoolExecutor
9
- from pathlib import Path
10
-
11
- import librosa
12
- import numpy as np
13
- import torch
14
- import tqdm
15
- from AR.exps.beats.BEATs import BEATs
16
- from AR.exps.beats.BEATs import BEATsConfig
17
- from AR.exps.beats.config import id_name_dict
18
- from soundstorm.s2.exps.hubert.feature_utils import get_shard_range
19
- from soundstorm.utils import check_txt_file
20
-
21
-
22
- def get_BEATs_top1(wav,
23
- BEATs_model,
24
- BEATs_label_dict,
25
- device: str='cpu',
26
- topk: int=1):
27
- wav = torch.tensor(wav).unsqueeze(0).to(device)
28
- padding_mask = torch.zeros(wav.shape).bool().to(device)
29
- probs = BEATs_model.extract_features(wav, padding_mask=padding_mask)[0]
30
- # single-sample inference
31
- probs = probs[0]
32
- topk_label_prob, topk_label_idx = probs.topk(k=topk)
33
- topk_label = [
34
- BEATs_label_dict[label_idx.item()] for label_idx in topk_label_idx
35
- ]
36
- topk_label_name = [id_name_dict[label] for label in topk_label]
37
- top1_label = topk_label_name[0]
38
- return top1_label
39
-
40
-
41
- def process_sentence(args,
42
- fp: Path,
43
- train_dump_dir: Path,
44
- dev_dump_dir: Path,
45
- test_dump_dir: Path,
46
- VAD_dict,
47
- BEATs_model,
48
- BEATs_label_dict,
49
- device: str='cpu'):
50
- utt_id = fp.stem
51
- sr = args.sr
52
- record = []
53
- train_audio_tag_dir = train_dump_dir / "audio_tag"
54
- train_audio_tag_dir.mkdir(parents=True, exist_ok=True)
55
-
56
- dev_audio_tag_dir = dev_dump_dir / "audio_tag"
57
- dev_audio_tag_dir.mkdir(parents=True, exist_ok=True)
58
-
59
- test_audio_tag_dir = test_dump_dir / "audio_tag"
60
- test_audio_tag_dir.mkdir(parents=True, exist_ok=True)
61
-
62
- try:
63
- # get info for path
64
- wav_path_list = str(fp).strip().split('/')
65
- sub_dataset, spk_id, book_name = wav_path_list[-4], wav_path_list[
66
- -3], wav_path_list[-2]
67
- wav_name = wav_path_list[-1][:-5]
68
- assert wav_name == utt_id
69
- # key_name for big wav
70
- key_name = f'{wav_name}#{sub_dataset}#{spk_id}#{book_name}'
71
- # handle the case where this audio has no entry in the VAD dict
72
- if key_name not in VAD_dict.keys():
73
- print(key_name, 'not in VAD_dict !')
74
- return record
75
- wav = None
76
- sorted_split_VAD_dict = sorted(VAD_dict[key_name].items())
77
- len_dict = len(sorted_split_VAD_dict)
78
- for index, item in enumerate(sorted_split_VAD_dict):
79
- split_name, value = item
80
- start, end = value
81
- # train | dev | test
82
- if index == len_dict - 1:
83
- subset = 'test'
84
- audio_tag_path = test_audio_tag_dir / (split_name + ".txt")
85
- elif index == len_dict - 2:
86
- subset = 'dev'
87
- audio_tag_path = dev_audio_tag_dir / (split_name + ".txt")
88
- else:
89
- subset = 'train'
90
- audio_tag_path = train_audio_tag_dir / (split_name + ".txt")
91
-
92
- if os.path.exists(audio_tag_path) and check_txt_file(
93
- audio_tag_path):
94
- # print(audio_tag_path, 'exists!')
95
- pass
96
- else:
97
- # this check ensures the big wav is loaded only once inside the sub-wav loop
98
- if wav is None:
99
- # load big wav
100
- # loading at the outermost level would waste the load time when all sub-wav features already exist
101
- wav, _ = librosa.load(str(fp), sr=sr)
102
- sub_wav = wav[int(start * sr):int(end * sr)]
103
- audio_tag_top1 = get_BEATs_top1(
104
- wav=sub_wav,
105
- BEATs_model=BEATs_model,
106
- BEATs_label_dict=BEATs_label_dict,
107
- device=device)
108
-
109
- with open(audio_tag_path, 'w') as f:
110
- f.write(audio_tag_top1)
111
-
112
- sub_record = {
113
- "utt_id": split_name,
114
- "audio_tag_path": audio_tag_path,
115
- "subset": subset
116
- }
117
- # record becomes a List of Dict
118
- record.append(sub_record)
119
- except Exception:
120
- print("occur Exception")
121
- traceback.print_exc()
122
- # record may be an incomplete List
123
- return record
124
- return record
125
-
126
-
127
- def process_sentences(args,
128
- fps: Path,
129
- train_dump_dir: Path,
130
- dev_dump_dir: Path,
131
- test_dump_dir: Path,
132
- VAD_dict,
133
- BEATs_model,
134
- BEATs_label_dict,
135
- device: str='cpu',
136
- nprocs: int=1):
137
- print("nprocs:", nprocs)
138
- if nprocs == 1:
139
- results = []
140
- for fp in tqdm.tqdm(fps, total=len(fps)):
141
- record = process_sentence(
142
- args=args,
143
- fp=fp,
144
- train_dump_dir=train_dump_dir,
145
- dev_dump_dir=dev_dump_dir,
146
- test_dump_dir=test_dump_dir,
147
- VAD_dict=VAD_dict,
148
- BEATs_model=BEATs_model,
149
- BEATs_label_dict=BEATs_label_dict,
150
- device=device)
151
- if record:
152
- results.append(record)
153
- else:
154
- with ThreadPoolExecutor(nprocs) as pool:
155
- futures = []
156
- with tqdm.tqdm(total=len(fps)) as progress:
157
- for fp in fps:
158
- future = pool.submit(process_sentence, args, fp,
159
- train_dump_dir, dev_dump_dir,
160
- test_dump_dir, VAD_dict, BEATs_model,
161
- BEATs_label_dict, device)
162
- future.add_done_callback(lambda p: progress.update())
163
- futures.append(future)
164
-
165
- results = []
166
- for ft in futures:
167
- record = ft.result()
168
- if record:
169
- results.append(record)
170
-
171
- # gather the results and save them with np.save() as .npy dicts
172
- non_speech_dict = dict()
173
- non_speech_dict['train'] = {}
174
- non_speech_dict['dev'] = {}
175
- non_speech_dict['test'] = {}
176
- # record is a List of Dict: one record per big wav, one sub_record per sub wav
177
- print(f"start to save {args.rank}_{args.nshard}.npy ...")
178
- save_start_time = time.time()
179
- for record in tqdm.tqdm(results, total=len(results), colour='green'):
180
- for sub_record in record:
181
- # wrap in try, because the txt file may be corrupted
182
- try:
183
- utt_id = sub_record["utt_id"]
184
- subset = sub_record["subset"]
185
- audio_tag_top1 = check_txt_file(sub_record["audio_tag_path"])
186
- if audio_tag_top1 is not False:
187
- if 'speech' not in audio_tag_top1.lower():
188
- non_speech_dict[subset][utt_id] = audio_tag_top1
189
- else:
190
- # print(f'audio tag result of {utt_id} is speech')
191
- pass
192
- else:
193
- print(f'audio tag result of {utt_id} is False')
194
- except Exception:
195
- print(f"{utt_id} occur Exception")
196
- traceback.print_exc()
197
- continue
198
-
199
- train_filename = train_dump_dir / f'non_speech_{args.rank}_{args.nshard}.npy'
200
- dev_filename = dev_dump_dir / f'non_speech_{args.rank}_{args.nshard}.npy'
201
- test_filename = test_dump_dir / f'non_speech_{args.rank}_{args.nshard}.npy'
202
- np.save(train_filename, non_speech_dict['train'])
203
- print(f"npy file '{train_filename}' write down")
204
-
205
- np.save(dev_filename, non_speech_dict['dev'])
206
- print(f"npy file '{dev_filename}' write down")
207
-
208
- np.save(test_filename, non_speech_dict['test'])
209
- print(f"npy file '{test_filename}' write down")
210
- print('time of save stage:', time.time() - save_start_time)
211
-
212
-
213
- def main():
214
- # parse config and args
215
- parser = argparse.ArgumentParser(
216
- description="Use AudioTag tool BEATs to filter out audios who's top1 tag is not 'speech'."
217
- )
218
-
219
- parser.add_argument(
220
- "--data_dir", default=None, type=str, help="directory to dataset.")
221
-
222
- parser.add_argument(
223
- "--dump_dir",
224
- type=str,
225
- required=True,
226
- help="directory to dump feature files.")
227
-
228
- parser.add_argument(
229
- "--num-cpu", type=int, default=1, help="number of process.")
230
-
231
- parser.add_argument(
232
- '--sr', type=int, default=16000, help='sample rate of model')
233
-
234
- # For LibriLight dataset
235
- parser.add_argument(
236
- "--sub_dataset",
237
- default="small",
238
- type=str,
239
- help="name of sub dataset of LibriLight",
240
- choices=['small', 'medium', 'large', 'duplicate'], )
241
- parser.add_argument(
242
- "--VAD_path", type=str, default='./VAD/librilight_segment_dict.npy')
243
- parser.add_argument("--nshard", type=int, default=3)
244
- parser.add_argument("--rank", type=int, default=0)
245
-
246
- # for BEATs
247
- parser.add_argument(
248
- "--BEATs_ckpt_path",
249
- type=str,
250
- default='./pretrained_model/BEATs_iter1_finetuned_on_AS2M_cpt1.pt')
251
-
252
- args = parser.parse_args()
253
-
254
- data_dir = Path(args.data_dir).expanduser()
255
- dump_dir = Path(args.dump_dir).expanduser()
256
- # use absolute path
257
- dump_dir = dump_dir.resolve()
258
- dump_dir.mkdir(parents=True, exist_ok=True)
259
-
260
- assert data_dir.is_dir()
261
-
262
- # sub_dataset here
263
- sub_dataset_dir = data_dir / args.sub_dataset
264
- # only spk_id in list, sort by lexicographical order
265
- speaker_list = sorted(os.listdir(sub_dataset_dir))
266
- start, end = get_shard_range(len(speaker_list), args.nshard, args.rank)
267
- # speaker_list for this rank
268
- speaker_list = speaker_list[start:end]
269
-
270
- all_wav_files = []
271
-
272
- for speaker in speaker_list:
273
- wav_files = sorted(list((sub_dataset_dir / speaker).rglob("*/*.flac")))
274
- # filter out ._*.flac
275
- wav_files = [
276
- file for file in wav_files if not file.name.startswith('._')
277
- ]
278
- all_wav_files += wav_files
279
-
280
- print(f"num of wav files in rank {args.rank}:", len(all_wav_files))
281
- # get VAD info
282
- VAD_dict = np.load(args.VAD_path, allow_pickle=True).item()
283
-
284
- sub_dataset_dump_dir = dump_dir / args.sub_dataset
285
- sub_dataset_dump_dir.mkdir(parents=True, exist_ok=True)
286
- train_dump_dir = sub_dataset_dump_dir / "train"
287
- train_dump_dir.mkdir(parents=True, exist_ok=True)
288
- dev_dump_dir = sub_dataset_dump_dir / "dev"
289
- dev_dump_dir.mkdir(parents=True, exist_ok=True)
290
- test_dump_dir = sub_dataset_dump_dir / "test"
291
- test_dump_dir.mkdir(parents=True, exist_ok=True)
292
-
293
- BEATs_ckpt = torch.load(args.BEATs_ckpt_path)
294
-
295
- BEATs_cfg = BEATsConfig(BEATs_ckpt['cfg'])
296
- BEATs_model = BEATs(BEATs_cfg)
297
- BEATs_model.load_state_dict(BEATs_ckpt['model'])
298
- BEATs_model.eval()
299
- # cpu or cuda
300
- device = 'cpu'
301
- BEATs_model.to(device)
302
-
303
- BEATs_label_dict = BEATs_ckpt['label_dict']
304
-
305
- # each big wav contributes one dev and one test split, roughly a 96:2:2 ratio
306
- if all_wav_files:
307
- process_sentences(
308
- args=args,
309
- fps=all_wav_files,
310
- train_dump_dir=train_dump_dir,
311
- dev_dump_dir=dev_dump_dir,
312
- test_dump_dir=test_dump_dir,
313
- VAD_dict=VAD_dict,
314
- BEATs_model=BEATs_model,
315
- BEATs_label_dict=BEATs_label_dict,
316
- device=device,
317
- nprocs=args.num_cpu)
318
-
319
-
320
- if __name__ == "__main__":
321
- main()
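The script above writes one non_speech_{rank}_{nshard}.npy per split, a dict mapping utt_id to its BEATs top1 tag for clips that were not tagged as speech. A consumption sketch; the path assumes dump_dir=dump plus the default --sub_dataset/--rank/--nshard values:

import numpy as np

non_speech = np.load('dump/small/train/non_speech_0_3.npy', allow_pickle=True).item()
bad_utts = set(non_speech)                      # utterances to filter out of training
print(len(bad_utts), list(non_speech.items())[:3])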
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_phones.py DELETED
@@ -1,232 +0,0 @@
1
- """
2
- 1. read text of dataset
3
- 2. text -> IPA by GruutPhonemizer
4
- 3. save out a *.npy dict for all text
5
- my_dict = {"utt_id1": text1, "utt_id2": text2}
6
- np.save(output_filename, my_dict)
7
- my_dict = np.load(output_filename, allow_pickle=True).item()
8
- """
9
- import argparse
10
- import os
- import traceback
11
- from concurrent.futures import ThreadPoolExecutor
12
- from operator import itemgetter
13
- from pathlib import Path
14
- from typing import List
15
-
16
- import numpy as np
17
- import tqdm
18
- from AR.text_processing.phonemizer import GruutPhonemizer
19
-
20
-
21
- def read_txt(txt_file):
22
- utt_name = txt_file.stem
23
- utt_id = utt_name.split('.')[0]
24
- try:
25
- with open(txt_file, 'r') as file:
26
- txt = file.readline()
27
- record = {"utt_id": utt_id, "txt": txt}
28
- except Exception:
29
- print("occur Exception")
30
- traceback.print_exc()
31
- return None
32
- return record
33
-
34
-
35
- def read_txts(txt_files: List[Path], nprocs: int=1):
36
- if nprocs == 1:
37
- results = []
38
- for txt_file in tqdm.tqdm(txt_files, total=len(txt_files)):
39
- record = read_txt(txt_file=txt_file)
40
- if record:
41
- results.append(record)
42
- else:
43
- with ThreadPoolExecutor(nprocs) as pool:
44
- futures = []
45
- with tqdm.tqdm(total=len(txt_files)) as progress:
46
- for txt_file in txt_files:
47
- future = pool.submit(read_txt, txt_file)
48
- future.add_done_callback(lambda p: progress.update())
49
- futures.append(future)
50
-
51
- results = []
52
- for ft in futures:
53
- record = ft.result()
54
- if record:
55
- results.append(record)
56
-
57
- results.sort(key=itemgetter("utt_id"))
58
- return_list = []
59
- for item in results:
60
- return_list.append((item["utt_id"], item["txt"]))
61
- return return_list
62
-
63
-
64
- def process_sentence(item, phonemizer):
65
- utt_id, text = item
66
- try:
67
- phonemes = phonemizer.phonemize(text, espeak=False)
68
- record = {"utt_id": utt_id, "phonemes": phonemes}
69
- except Exception:
70
- print("occur Exception")
71
- traceback.print_exc()
72
- return None
73
- return record
74
-
75
-
76
- def process_sentences(items, phonemizer, output_dir, nprocs: int=1):
77
- if nprocs == 1:
78
- results = []
79
- for item in tqdm.tqdm(items, total=len(items)):
80
- record = process_sentence(item=item, phonemizer=phonemizer)
81
- if record:
82
- results.append(record)
83
- else:
84
- with ThreadPoolExecutor(nprocs) as pool:
85
- futures = []
86
- with tqdm.tqdm(total=len(items)) as progress:
87
- for item in items:
88
- future = pool.submit(process_sentence, item, phonemizer)
89
- future.add_done_callback(lambda p: progress.update())
90
- futures.append(future)
91
-
92
- results = []
93
- for ft in futures:
94
- record = ft.result()
95
- if record:
96
- results.append(record)
97
- results.sort(key=itemgetter("utt_id"))
98
- npy_dict = {}
99
- for item in results:
100
- utt_id = item["utt_id"]
101
- phonemes = item["phonemes"]
102
- npy_dict[utt_id] = phonemes
103
- filename = output_dir / 'phonemes.npy'
104
- np.save(filename, npy_dict)
105
- print(f"npy file '{filename}' write down")
106
-
107
-
108
- def main():
109
- # parse config and args
110
- parser = argparse.ArgumentParser(description="Get phones for datasets")
111
-
112
- parser.add_argument(
113
- "--dataset",
114
- default="ljspeech",
115
- type=str,
116
- help="name of dataset, should in {ljspeech, libritts} now")
117
-
118
- parser.add_argument(
119
- "--data_dir", default=None, type=str, help="directory to dataset.")
120
-
121
- parser.add_argument(
122
- "--dump_dir",
123
- type=str,
124
- required=True,
125
- help="directory to dump feature files.")
126
- parser.add_argument(
127
- "--num-cpu", type=int, default=1, help="number of process.")
128
-
129
- args = parser.parse_args()
130
-
131
- data_dir = Path(args.data_dir).expanduser()
132
- dump_dir = Path(args.dump_dir).expanduser()
133
- # use absolute path
134
- dump_dir = dump_dir.resolve()
135
- dump_dir.mkdir(parents=True, exist_ok=True)
136
-
137
- assert data_dir.is_dir()
138
-
139
- if args.dataset == "ljspeech":
140
- data_dict = {}
141
- text_path = data_dir / 'metadata.csv'
142
- with open(text_path, 'r') as rf:
143
- for line in rf:
144
- line_list = line.strip().split('|')
145
- utt_id = line_list[0]
146
- raw_text = line_list[-1]
147
- data_dict[utt_id] = raw_text
148
-
149
- sorted_dict = sorted(data_dict.items())
150
-
151
- num_train = 12900
152
- num_dev = 100
153
- # (utt_id, txt)
154
- train_txts = sorted_dict[:num_train]
155
- dev_txts = sorted_dict[num_train:num_train + num_dev]
156
- test_txts = sorted_dict[num_train + num_dev:]
157
-
158
- elif args.dataset == "libritts":
159
- '''
160
- we use train-clean-100、train-clean-360、train-other-500 here
161
- and split dev and test from them, don't use test-* and dev-* cause the speakers are disjoint
162
- the file structure is LibriTTS_R/train-clean-100/spkid/*/*.wav
163
- there are about 2311 in these subsets, we split 1 dev and 1 test wav out from each speaker
164
- '''
165
- txt_files = []
166
- train_txt_files = []
167
- dev_txt_files = []
168
- test_txt_files = []
169
- sub_num_dev = 1
170
- for sub_dataset_name in {
171
- "train-clean-100", "train-clean-360", "train-other-500"
172
- }:
173
- sub_dataset_dir = data_dir / sub_dataset_name
174
- # filter out hidden files
175
- speaker_list = [
176
- file for file in os.listdir(sub_dataset_dir)
177
- if not file.startswith('.')
178
- ]
179
- for speaker in speaker_list:
180
- txt_files = sorted(
181
- list((sub_dataset_dir / speaker).rglob(
182
- "*/*.normalized.txt")))
183
- # filter out ._*.wav
184
- txt_files = [
185
- file for file in txt_files if not file.name.startswith('._')
186
- ]
187
- train_txt_files += txt_files[:-sub_num_dev * 2]
188
- dev_txt_files += txt_files[-sub_num_dev * 2:-sub_num_dev]
189
- test_txt_files += txt_files[-sub_num_dev:]
190
- print("len(train_txt_files):", len(train_txt_files))
191
- print("len(dev_txt_files):", len(dev_txt_files))
192
- print("len(test_txt_files):", len(test_txt_files))
193
-
194
- train_txts = read_txts(train_txt_files)
195
- dev_txts = read_txts(dev_txt_files)
196
- test_txts = read_txts(test_txt_files)
197
-
198
- else:
199
- print("dataset should in {ljspeech, libritts} now!")
200
-
201
- train_dump_dir = dump_dir / "train"
202
- train_dump_dir.mkdir(parents=True, exist_ok=True)
203
- dev_dump_dir = dump_dir / "dev"
204
- dev_dump_dir.mkdir(parents=True, exist_ok=True)
205
- test_dump_dir = dump_dir / "test"
206
- test_dump_dir.mkdir(parents=True, exist_ok=True)
207
-
208
- phonemizer = GruutPhonemizer(language='en-us')
209
-
210
- # process for the 3 sections
211
- if train_txts:
212
- process_sentences(
213
- items=train_txts,
214
- output_dir=train_dump_dir,
215
- phonemizer=phonemizer,
216
- nprocs=args.num_cpu)
217
- if dev_txts:
218
- process_sentences(
219
- items=dev_txts,
220
- output_dir=dev_dump_dir,
221
- phonemizer=phonemizer,
222
- nprocs=args.num_cpu)
223
- if test_txts:
224
- process_sentences(
225
- items=test_txts,
226
- output_dir=test_dump_dir,
227
- phonemizer=phonemizer,
228
- nprocs=args.num_cpu)
229
-
230
-
231
- if __name__ == "__main__":
232
- main()
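The core of the script above is GruutPhonemizer.phonemize, and its output is one phonemes.npy dict per split. A short sketch of both ends; the sentence and the path are illustrative:

import numpy as np
from AR.text_processing.phonemizer import GruutPhonemizer

phonemizer = GruutPhonemizer(language='en-us')
ipa = phonemizer.phonemize("Printing, in the only sense with which we are concerned.",
                           espeak=False)
print(ipa)

phonemes = np.load('dump/train/phonemes.npy', allow_pickle=True).item()
print(next(iter(phonemes.items())))             # (utt_id, IPA phoneme string)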
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_phones_librilight.py DELETED
@@ -1,198 +0,0 @@
1
- """
2
- 1. read text of dataset; for LibriLight, read txt_*.npy and arrange it into a list of (utt_id, txt)
3
- 2. text -> IPA by GruutPhonemizer
4
- 3. save out a *.npy dict for all text
5
- 4. each LibriLight split is processed separately
6
- my_dict = {"utt_id1": text1, "utt_id2": text2}
7
- np.save(output_filename, my_dict)
8
- my_dict = np.load(output_filename, allow_pickle=True).item()
9
- """
10
- import argparse
11
- import os
12
- import time
13
- import traceback
14
- from concurrent.futures import ThreadPoolExecutor
15
- from operator import itemgetter
16
- from pathlib import Path
17
-
18
- import numpy as np
19
- import tqdm
20
- from AR.text_processing.phonemizer import GruutPhonemizer
21
- from soundstorm.utils import check_txt_file
22
-
23
-
24
- def read_txts(txt_file: Path, nprocs: int=1):
25
- '''
26
- txt_file: path of npy dict, {"utt_id1": text1, "utt_id2": text2}
27
- '''
28
- txt_dict = np.load(txt_file, allow_pickle=True).item()
29
- #[(utt_id, txt), ...]
30
- return_list = list(txt_dict.items())
31
- return return_list
32
-
33
-
34
- def process_sentence(item, phonemizer, output_dir):
35
- utt_id, text = item
36
- phonemes_dir = output_dir / "phonemes"
37
- phonemes_dir.mkdir(parents=True, exist_ok=True)
38
- phonemes_path = phonemes_dir / (utt_id + ".txt")
39
- try:
40
- if os.path.exists(phonemes_path) and check_txt_file(phonemes_path):
41
- # print(phonemes_path, 'exists!')
42
- pass
43
- else:
44
- phonemes = phonemizer.phonemize(text, espeak=False)
45
- with open(phonemes_path, 'w') as f:
46
- f.write(phonemes)
47
- record = {"utt_id": utt_id, "phonemes_path": phonemes_path}
48
- except Exception:
49
- print("occur Exception")
50
- traceback.print_exc()
51
- return None
52
- return record
53
-
54
-
55
- def process_sentences(args, items, phonemizer, output_dir, nprocs: int=1):
56
- print("nprocs:", nprocs)
57
- if nprocs == 1:
58
- results = []
59
- for item in tqdm.tqdm(items, total=len(items)):
60
- record = process_sentence(
61
- item=item, phonemizer=phonemizer, output_dir=output_dir)
62
- if record:
63
- results.append(record)
64
- else:
65
- with ThreadPoolExecutor(nprocs) as pool:
66
- futures = []
67
- with tqdm.tqdm(total=len(items)) as progress:
68
- for item in items:
69
- future = pool.submit(process_sentence, item, phonemizer,
70
- output_dir)
71
- future.add_done_callback(lambda p: progress.update())
72
- futures.append(future)
73
-
74
- results = []
75
- for ft in futures:
76
- record = ft.result()
77
- if record:
78
- results.append(record)
79
-
80
- results.sort(key=itemgetter("utt_id"))
81
-
82
- npy_dict = {}
83
- print(f"start to save {args.rank}_{args.nshard}.npy ...")
84
- save_start_time = time.time()
85
- for item in tqdm.tqdm(results, total=len(results), colour='green'):
86
- # 这里加 try, 因为 txt 文件可能损坏
87
- try:
88
- utt_id = item["utt_id"]
89
- phonemes = check_txt_file(item["phonemes_path"])
90
- if phonemes is not False:
91
- npy_dict[utt_id] = phonemes
92
- else:
93
- print(f'phonemes of {utt_id} is False')
94
- except Exception:
95
- print(f"{utt_id} occur Exception")
96
- traceback.print_exc()
97
- continue
98
-
99
- filename = output_dir / f'phonemes_{args.rank}_{args.nshard}.npy'
100
- np.save(filename, npy_dict)
101
- print(f"npy file '{filename}' write down")
102
- print('time of save stage:', time.time() - save_start_time)
103
-
104
-
105
- def main():
106
- # parse config and args
107
- parser = argparse.ArgumentParser(
108
- description="Get phones for LibriLight dataset from txt_*.npy")
109
-
110
- parser.add_argument(
111
- "--dump_dir",
112
- type=str,
113
- required=True,
114
- help="directory to dump feature files.")
115
- parser.add_argument(
116
- "--num-cpu", type=int, default=1, help="number of process.")
117
-
118
- parser.add_argument(
119
- '--train_txt_dir',
120
- type=str,
121
- default='dump/small/train/',
122
- help='dir of train txt files')
123
- parser.add_argument(
124
- '--dev_txt_dir',
125
- type=str,
126
- default='dump/small/dev/',
127
- help='dir of dev txt files')
128
- parser.add_argument(
129
- '--test_txt_dir',
130
- type=str,
131
- default='dump/small/test/',
132
- help='dir of test txt files')
133
-
134
- parser.add_argument(
135
- "--sub_dataset",
136
- default="small",
137
- type=str,
138
- help="name of sub dataset of LibriLight",
139
- choices=['small', 'medium', 'large', 'duplicate'], )
140
- parser.add_argument("--nshard", type=int, default=3)
141
- parser.add_argument("--rank", type=int, default=0)
142
-
143
- args = parser.parse_args()
144
- print(f"nshard: {args.nshard}, rank: {args.rank}")
145
-
146
- train_txt_dir = Path(args.train_txt_dir)
147
- dev_txt_dir = Path(args.dev_txt_dir)
148
- test_txt_dir = Path(args.test_txt_dir)
149
-
150
- dump_dir = Path(args.dump_dir).expanduser()
151
- # use absolute path
152
- dump_dir = dump_dir.resolve()
153
- dump_dir.mkdir(parents=True, exist_ok=True)
154
-
155
- train_txt_file = train_txt_dir / f'txt_{args.rank}_{args.nshard}.npy'
156
- dev_txt_file = dev_txt_dir / f'txt_{args.rank}_{args.nshard}.npy'
157
- test_txt_file = test_txt_dir / f'txt_{args.rank}_{args.nshard}.npy'
158
-
159
- train_txts = read_txts(train_txt_file)
160
- dev_txts = read_txts(dev_txt_file)
161
- test_txts = read_txts(test_txt_file)
162
-
163
- sub_dataset_dump_dir = dump_dir / args.sub_dataset
164
- sub_dataset_dump_dir.mkdir(parents=True, exist_ok=True)
165
- train_dump_dir = sub_dataset_dump_dir / "train"
166
- train_dump_dir.mkdir(parents=True, exist_ok=True)
167
- dev_dump_dir = sub_dataset_dump_dir / "dev"
168
- dev_dump_dir.mkdir(parents=True, exist_ok=True)
169
- test_dump_dir = sub_dataset_dump_dir / "test"
170
- test_dump_dir.mkdir(parents=True, exist_ok=True)
171
- phonemizer = GruutPhonemizer(language='en-us')
172
-
173
- # process for the 3 sections
174
- if train_txts:
175
- process_sentences(
176
- args=args,
177
- items=train_txts,
178
- output_dir=train_dump_dir,
179
- phonemizer=phonemizer,
180
- nprocs=args.num_cpu)
181
- if dev_txts:
182
- process_sentences(
183
- args=args,
184
- items=dev_txts,
185
- output_dir=dev_dump_dir,
186
- phonemizer=phonemizer,
187
- nprocs=args.num_cpu)
188
- if test_txts:
189
- process_sentences(
190
- args=args,
191
- items=test_txts,
192
- output_dir=test_dump_dir,
193
- phonemizer=phonemizer,
194
- nprocs=args.num_cpu)
195
-
196
-
197
- if __name__ == "__main__":
198
- main()
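Because the work is sharded (txt_{rank}_{nshard}.npy in, phonemes_{rank}_{nshard}.npy out), downstream code has to merge the per-rank dicts. A merge sketch; the directory layout is assumed to be <dump_dir>/<sub_dataset>/train:

from pathlib import Path
import numpy as np

nshard = 3
train_dir = Path('dump/small/train')
merged = {}
for rank in range(nshard):
    shard = np.load(train_dir / f'phonemes_{rank}_{nshard}.npy',
                    allow_pickle=True).item()
    merged.update(shard)                        # {utt_id: phoneme string}
print(len(merged), 'utterances merged')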
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/get_txt_librilight.py DELETED
@@ -1,255 +0,0 @@
1
- import argparse
2
- import os
3
- import time
4
- import traceback
5
- from concurrent.futures import ThreadPoolExecutor
6
- from pathlib import Path
7
-
8
- import librosa
9
- import numpy as np
10
- import tqdm
11
- import whisper
12
- from soundstorm.s2.exps.hubert.feature_utils import get_shard_range
13
- from soundstorm.utils import check_txt_file
14
-
15
-
16
- def process_sentence(args,
17
- fp: Path,
18
- train_dump_dir: Path,
19
- dev_dump_dir: Path,
20
- test_dump_dir: Path,
21
- VAD_dict):
22
- asr_model = whisper.load_model("tiny.en")
23
- utt_id = fp.stem
24
- sr = args.sr
25
- record = []
26
- train_txt_dir = train_dump_dir / "txt"
27
- train_txt_dir.mkdir(parents=True, exist_ok=True)
28
-
29
- dev_txt_dir = dev_dump_dir / "txt"
30
- dev_txt_dir.mkdir(parents=True, exist_ok=True)
31
-
32
- test_txt_dir = test_dump_dir / "txt"
33
- test_txt_dir.mkdir(parents=True, exist_ok=True)
34
-
35
- try:
36
- # get info for path
37
- wav_path_list = str(fp).strip().split('/')
38
- sub_dataset, spk_id, book_name = wav_path_list[-4], wav_path_list[
39
- -3], wav_path_list[-2]
40
- wav_name = wav_path_list[-1][:-5]
41
- assert wav_name == utt_id
42
- # key_name for big wav
43
- key_name = f'{wav_name}#{sub_dataset}#{spk_id}#{book_name}'
44
- # handle the case where this audio clip has no entry in the VAD dict
45
- if key_name not in VAD_dict.keys():
46
- print(key_name, 'not in VAD_dict !')
47
- return record
48
- wav = None
49
- sorted_split_VAD_dict = sorted(VAD_dict[key_name].items())
50
- len_dict = len(sorted_split_VAD_dict)
51
- for index, item in enumerate(sorted_split_VAD_dict):
52
- split_name, value = item
53
- start, end = value
54
- # train | dev | test
55
- if index == len_dict - 1:
56
- subset = 'test'
57
- txt_path = test_txt_dir / (split_name + ".txt")
58
- elif index == len_dict - 2:
59
- subset = 'dev'
60
- txt_path = dev_txt_dir / (split_name + ".txt")
61
- else:
62
- subset = 'train'
63
- txt_path = train_txt_dir / (split_name + ".txt")
64
-
65
- if os.path.exists(txt_path) and check_txt_file(txt_path):
66
- # print(txt_path, 'exists!')
67
- pass
68
- else:
69
- # guard so the big wav is loaded only once inside the sub-wav loop
70
- if wav is None:
71
- # load big wav
72
- # loading at the outermost level would waste time when every sub wav's features already exist
73
- wav, _ = librosa.load(str(fp), sr=sr)
74
- sub_wav = wav[int(start * sr):int(end * sr)]
75
- asr_result = asr_model.transcribe(sub_wav)["text"]
76
- with open(txt_path, 'w') as f:
77
- f.write(asr_result)
78
-
79
- sub_record = {
80
- "utt_id": split_name,
81
- "txt_path": txt_path,
82
- "subset": subset
83
- }
84
- # record becomes a List of Dict
85
- record.append(sub_record)
86
- except Exception:
87
- print("occur Exception")
88
- traceback.print_exc()
89
- # record may be an incomplete List
90
- return record
91
- return record
92
-
93
-
94
- def process_sentences(args,
95
- fps: Path,
96
- train_dump_dir: Path,
97
- dev_dump_dir: Path,
98
- test_dump_dir: Path,
99
- VAD_dict,
100
- nprocs: int=1):
101
- print("nprocs:", nprocs)
102
- if nprocs == 1:
103
- results = []
104
- for fp in tqdm.tqdm(fps, total=len(fps)):
105
- record = process_sentence(
106
- args=args,
107
- fp=fp,
108
- train_dump_dir=train_dump_dir,
109
- dev_dump_dir=dev_dump_dir,
110
- test_dump_dir=test_dump_dir,
111
- VAD_dict=VAD_dict)
112
- if record:
113
- results.append(record)
114
- else:
115
- with ThreadPoolExecutor(nprocs) as pool:
116
- futures = []
117
- with tqdm.tqdm(total=len(fps)) as progress:
118
- for fp in fps:
119
- future = pool.submit(process_sentence, args, fp,
120
- train_dump_dir, dev_dump_dir,
121
- test_dump_dir, VAD_dict)
122
- future.add_done_callback(lambda p: progress.update())
123
- futures.append(future)
124
-
125
- results = []
126
- for ft in futures:
127
- record = ft.result()
128
- if record:
129
- results.append(record)
130
-
131
- # torch.save() to a large `.pth` file
132
- txt_dict = dict()
133
- txt_dict['train'] = {}
134
- txt_dict['dev'] = {}
135
- txt_dict['test'] = {}
136
- # record is a List of Dict: one record per big wav, one sub_record per small wav
137
- print(f"start to save {args.rank}_{args.nshard}.npy ...")
138
- save_start_time = time.time()
139
- for record in tqdm.tqdm(results, total=len(results), colour='green'):
140
- for sub_record in record:
141
- # wrap in try, since the txt file may be corrupted
142
- try:
143
- utt_id = sub_record["utt_id"]
144
- subset = sub_record["subset"]
145
- asr_result = check_txt_file(sub_record["txt_path"])
146
- if asr_result is not False:
147
- txt_dict[subset][utt_id] = asr_result
148
- else:
149
- print(f'asr result of {utt_id} is False')
150
- except Exception:
151
- print(f"{utt_id} occur Exception")
152
- traceback.print_exc()
153
- continue
154
-
155
- train_filename = train_dump_dir / f'txt_{args.rank}_{args.nshard}.npy'
156
- dev_filename = dev_dump_dir / f'txt_{args.rank}_{args.nshard}.npy'
157
- test_filename = test_dump_dir / f'txt_{args.rank}_{args.nshard}.npy'
158
- np.save(train_filename, txt_dict['train'])
159
- print(f"npy file '{train_filename}' write down")
160
-
161
- np.save(dev_filename, txt_dict['dev'])
162
- print(f"npy file '{dev_filename}' write down")
163
-
164
- np.save(test_filename, txt_dict['test'])
165
- print(f"npy file '{test_filename}' write down")
166
- print('time of save stage:', time.time() - save_start_time)
167
-
168
-
169
- def main():
170
- # parse config and args
171
- parser = argparse.ArgumentParser(
172
- description="Preprocess audio and then extract features for LibriLight.")
173
-
174
- parser.add_argument(
175
- "--data_dir", default=None, type=str, help="directory to dataset.")
176
-
177
- parser.add_argument(
178
- "--dump_dir",
179
- type=str,
180
- required=True,
181
- help="directory to dump feature files.")
182
-
183
- parser.add_argument(
184
- "--num-cpu", type=int, default=1, help="number of process.")
185
-
186
- parser.add_argument(
187
- '--sr', type=int, default=16000, help='sample rate of model')
188
-
189
- # For LibriLight dataset
190
- parser.add_argument(
191
- "--sub_dataset",
192
- default="small",
193
- type=str,
194
- help="name of sub dataset of LibriLight",
195
- choices=['small', 'medium', 'large', 'duplicate'], )
196
- parser.add_argument(
197
- "--VAD_path", type=str, default='./VAD/librilight_segment_dict.npy')
198
- parser.add_argument("--nshard", type=int, default=3)
199
- parser.add_argument("--rank", type=int, default=0)
200
-
201
- args = parser.parse_args()
202
-
203
- data_dir = Path(args.data_dir).expanduser()
204
- dump_dir = Path(args.dump_dir).expanduser()
205
- # use absolute path
206
- dump_dir = dump_dir.resolve()
207
- dump_dir.mkdir(parents=True, exist_ok=True)
208
-
209
- assert data_dir.is_dir()
210
-
211
- # sub_dataset here
212
- sub_dataset_dir = data_dir / args.sub_dataset
213
- # only spk_ids in the list, sorted in lexicographical order
214
- speaker_list = sorted(os.listdir(sub_dataset_dir))
215
- start, end = get_shard_range(len(speaker_list), args.nshard, args.rank)
216
- # speaker_list for this rank
217
- speaker_list = speaker_list[start:end]
218
-
219
- all_wav_files = []
220
-
221
- for speaker in speaker_list:
222
- wav_files = sorted(list((sub_dataset_dir / speaker).rglob("*/*.flac")))
223
- # filter out ._*.flac
224
- wav_files = [
225
- file for file in wav_files if not file.name.startswith('._')
226
- ]
227
- all_wav_files += wav_files
228
-
229
- print(f"num of wav files in rank {args.rank}:", len(all_wav_files))
230
- # get VAD info
231
- VAD_dict = np.load(args.VAD_path, allow_pickle=True).item()
232
-
233
- sub_dataset_dump_dir = dump_dir / args.sub_dataset
234
- sub_dataset_dump_dir.mkdir(parents=True, exist_ok=True)
235
- train_dump_dir = sub_dataset_dump_dir / "train"
236
- train_dump_dir.mkdir(parents=True, exist_ok=True)
237
- dev_dump_dir = sub_dataset_dump_dir / "dev"
238
- dev_dump_dir.mkdir(parents=True, exist_ok=True)
239
- test_dump_dir = sub_dataset_dump_dir / "test"
240
- test_dump_dir.mkdir(parents=True, exist_ok=True)
241
-
242
- # each big wav contributes one dev and one test split; the ratio is roughly 96:2:2
243
- if all_wav_files:
244
- process_sentences(
245
- args=args,
246
- fps=all_wav_files,
247
- train_dump_dir=train_dump_dir,
248
- dev_dump_dir=dev_dump_dir,
249
- test_dump_dir=test_dump_dir,
250
- VAD_dict=VAD_dict,
251
- nprocs=args.num_cpu)
252
-
253
-
254
- if __name__ == "__main__":
255
- main()
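A sketch of the VAD dict layout this script expects, inferred from how VAD_dict is indexed above: keys follow the f'{wav_name}#{sub_dataset}#{spk_id}#{book_name}' pattern and map split names to (start, end) times in seconds. The concrete names and times below are invented:

import numpy as np

VAD_dict = {
    # '{wav_name}#{sub_dataset}#{spk_id}#{book_name}'
    'chapter_01#small#100#some_book': {
        # split_name -> (start_sec, end_sec) inside the big wav
        'chapter_01_0': (0.0, 11.2),
        'chapter_01_1': (11.2, 23.9),
        'chapter_01_2': (23.9, 35.0),
    },
}
np.save('librilight_segment_dict.npy', VAD_dict)
loaded = np.load('librilight_segment_dict.npy', allow_pickle=True).item()
assert loaded.keys() == VAD_dict.keys()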
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/split_train_val.py DELETED
@@ -1,35 +0,0 @@
1
- import numpy
2
- import pandas
3
-
4
- semantic_path = 'dump/semantic.tsv'
5
- phoneme_path = 'dump/phoneme.npy'
6
- train_semantic_path = 'dump/semantic_train.tsv'
7
- train_phoneme_path = 'dump/phoneme_train.npy'
8
- dev_semantic_path = 'dump/semantic_dev.tsv'
9
- dev_phoneme_path = 'dump/phoneme_dev.npy'
10
-
11
- # read dump/semantic.tsv
12
- semantic_df = pandas.read_csv(semantic_path, sep='\t')
13
- # pd.DataFrame(columns=["item_name", "semantic_audio"])
14
- # # read dump/phoneme.npy
15
- phoneme_dict = numpy.load(phoneme_path, allow_pickle=True).item()
16
-
17
- dev_num = 20
18
- # randomly sample dev_num rows from semantic_df
19
- dev_df = semantic_df.sample(n=dev_num)
20
- # the rest is train
21
- train_df = semantic_df.drop(dev_df.index)
22
- # save
23
- dev_df.to_csv(dev_semantic_path, sep='\t', index=False)
24
- train_df.to_csv(train_semantic_path, sep='\t', index=False)
25
-
26
- # take the item_name values in dev_df as the keys of dev_phoneme_dict
27
- dev_item_names = dev_df['item_name'].tolist()
28
- dev_phoneme_dict = {k: phoneme_dict[k] for k in dev_item_names if k in phoneme_dict}
29
- train_phoneme_dict = {k: phoneme_dict[k] for k in phoneme_dict.keys() if k not in dev_item_names}
30
-
31
- numpy.save(dev_phoneme_path, dev_phoneme_dict)
32
- numpy.save(train_phoneme_path, train_phoneme_dict)
33
-
34
-
35
-
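A quick consistency check one could run after the split above, reusing the script's own paths and column names (not part of the deleted file):

import numpy as np
import pandas as pd

train_df = pd.read_csv('dump/semantic_train.tsv', sep='\t')
dev_df = pd.read_csv('dump/semantic_dev.tsv', sep='\t')
dev_phoneme_dict = np.load('dump/phoneme_dev.npy', allow_pickle=True).item()

# no item should land in both splits, and every dev phoneme key should be a dev item
assert not set(train_df['item_name']) & set(dev_df['item_name'])
assert set(dev_phoneme_dict) <= set(dev_df['item_name'])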
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/t2s.py DELETED
@@ -1,197 +0,0 @@
1
- # text to semantic
2
- import argparse
3
- import os
4
- import re
5
- import time
6
- from pathlib import Path
7
-
8
- import librosa
9
- import numpy as np
10
- import torch
11
- import whisper
12
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
13
- from AR.text_processing.phonemizer import GruutPhonemizer
14
- from AR.utils.io import load_yaml_config
15
-
16
-
17
- def get_batch(text, phonemizer):
18
- # phoneme_ids and phoneme_ids_len are what we need
19
- phoneme = phonemizer.phonemize(text, espeak=False)
20
- phoneme_ids = phonemizer.transform(phoneme)
21
- phoneme_ids_len = len(phoneme_ids)
22
- phoneme_ids = np.array(phoneme_ids)
23
- # add batch axis here
24
- phoneme_ids = torch.tensor(phoneme_ids).unsqueeze(0)
25
- phoneme_ids_len = torch.tensor([phoneme_ids_len])
26
- print("phoneme:", phoneme)
27
- batch = {
28
- # torch.Tensor (B, max_phoneme_length)
29
- "phoneme_ids": phoneme_ids,
30
- # torch.Tensor (B)
31
- "phoneme_ids_len": phoneme_ids_len
32
- }
33
- return batch
34
-
35
-
36
- def get_prompt(prompt_wav_path, asr_model, phonemizer, semantic_tokenizer):
37
- sample_rate = 16000
38
- # to get prompt
39
- prompt_name = os.path.basename(prompt_wav_path).split('.')[0]
40
- wav, _ = librosa.load(prompt_wav_path, sr=sample_rate)
41
- # take the last 3 s, excluding the final 0.1 s, to keep AR S1 infer from stopping early
42
- wav = wav[-sample_rate * 3:-int(sample_rate * 0.1)]
43
- # trailing silence must be trimmed from the wav, otherwise inference may also stop early
44
- prompt_text = asr_model.transcribe(wav)["text"]
45
- # remove the trailing period to keep AR S1 infer from stopping early; keeping the period may introduce a pause
46
- prompt_text = prompt_text.replace(".", "")
47
- prompt_phoneme = phonemizer.phonemize(prompt_text, espeak=False)
48
- prompt_phoneme_ids = phonemizer.transform(prompt_phoneme)
49
- prompt_phoneme_ids_len = len(prompt_phoneme_ids)
50
- # get prompt_semantic
51
- # (T) -> (1, T)
52
- wav = torch.tensor(wav).unsqueeze(0)
53
- wav = wav.cuda()
54
- # (1, T)
55
- prompt_semantic_tokens = semantic_tokenizer.tokenize(wav).to(torch.int32)
56
- prompt_phoneme_ids = torch.tensor(prompt_phoneme_ids).unsqueeze(0)
57
- prompt_phoneme_ids_len = torch.tensor([prompt_phoneme_ids_len])
58
-
59
- result = {
60
- 'prompt_name': prompt_name,
61
- 'prompt_phoneme_ids': prompt_phoneme_ids,
62
- 'prompt_semantic_tokens': prompt_semantic_tokens,
63
- 'prompt_phoneme_ids_len': prompt_phoneme_ids_len
64
- }
65
-
66
- return result
67
-
68
-
69
- def parse_args():
70
- # parse args and config
71
- parser = argparse.ArgumentParser(
72
- description="Run SoundStorm AR S1 model for input text file")
73
-
74
- parser.add_argument(
75
- '--config_file',
76
- type=str,
77
- default='conf/default.yaml',
78
- help='path of config file')
79
-
80
- parser.add_argument(
81
- "--text_file",
82
- type=str,
83
- help="text file to be convert to semantic tokens, a 'utt_id sentence' pair per line."
84
- )
85
-
86
- parser.add_argument(
87
- '--ckpt_path',
88
- type=str,
89
- default='exp/default/ckpt/epoch=99-step=49000.ckpt',
90
- help='Checkpoint file of SoundStorm AR S1 model.')
91
-
92
- parser.add_argument(
93
- '--prompt_wav_path',
94
- type=str,
95
- default=None,
96
- help='extract prompt semantic and prompt phonemes from prompt wav')
97
-
98
- # to get semantic tokens from prompt_wav
99
- parser.add_argument("--hubert_path", type=str, default=None)
100
- parser.add_argument("--quantizer_path", type=str, default=None)
101
-
102
- parser.add_argument("--output_dir", type=str, help="output dir.")
103
-
104
- args = parser.parse_args()
105
- return args
106
-
107
-
108
- def main():
109
- args = parse_args()
110
- config = load_yaml_config(args.config_file)
111
-
112
- output_dir = Path(args.output_dir)
113
- output_dir.mkdir(parents=True, exist_ok=True)
114
-
115
- hz = 50
116
- max_sec = config['data']['max_sec']
117
-
118
- # get models
119
- t2s_model = Text2SemanticLightningModule.load_from_checkpoint(
120
- checkpoint_path=args.ckpt_path, config=config)
121
- t2s_model.cuda()
122
- t2s_model.eval()
123
-
124
- phonemizer: GruutPhonemizer = GruutPhonemizer(language='en-us')
125
-
126
- # models for prompt
127
- asr_model = whisper.load_model("tiny.en")
128
-
129
- semantic_tokenizer = SemanticTokenizer(
130
- hubert_path=args.hubert_path,
131
- quantizer_path=args.quantizer_path,
132
- duplicate=True)
133
-
134
- prompt_result = get_prompt(
135
- prompt_wav_path=args.prompt_wav_path,
136
- asr_model=asr_model,
137
- phonemizer=phonemizer,
138
- semantic_tokenizer=semantic_tokenizer)
139
-
140
- # zero prompt => the output semantic tokens have the right content, but the timbre is scrambled
141
- # (B, 1)
142
- # prompt = torch.ones(
143
- # batch['phoneme_ids'].size(0), 1, dtype=torch.int32) * 0
144
-
145
- prompt = prompt_result['prompt_semantic_tokens']
146
- prompt_phoneme_ids_len = prompt_result['prompt_phoneme_ids_len']
147
- prompt_phoneme_ids = prompt_result['prompt_phoneme_ids']
148
-
149
- sentences = []
150
- with open(args.text_file, 'rt', encoding='utf-8') as f:
151
- for line in f:
152
- if line.strip() != "":
153
- items = re.split(r"\s+", line.strip(), 1)
154
- utt_id = items[0]
155
- sentence = " ".join(items[1:])
156
- sentences.append((utt_id, sentence))
157
- semantic_data = [['item_name', 'semantic_audio']]
158
- for utt_id, sentence in sentences[1:]:
159
- # build a pseudo batch by hand to feed the model
160
- batch = get_batch(sentence, phonemizer)
161
- # concatenate the prompt with the real input
162
- all_phoneme_ids = torch.cat(
163
- [prompt_phoneme_ids, batch['phoneme_ids']], dim=1)
164
- # alternatively, just take shape[-1] of all_phoneme_ids
165
- all_phoneme_len = prompt_phoneme_ids_len + batch['phoneme_ids_len']
166
- st = time.time()
167
- with torch.no_grad():
168
- pred_semantic = t2s_model.model.infer(
169
- all_phoneme_ids.cuda(),
170
- all_phoneme_len.cuda(),
171
- prompt.cuda(),
172
- top_k=config['inference']['top_k'],
173
- early_stop_num=hz * max_sec)
174
- print(f'{time.time() - st} sec used in T2S')
175
-
176
- # drop the part corresponding to the prompt
177
- prompt_len = prompt.shape[-1]
178
- pred_semantic = pred_semantic[:, prompt_len:]
179
-
180
- # bs = 1
181
- pred_semantic = pred_semantic[0]
182
- semantic_token = pred_semantic.detach().cpu().numpy().tolist()
183
- semantic_token_str = ' '.join(str(x) for x in semantic_token)
184
- semantic_data.append([utt_id, semantic_token_str])
185
-
186
- delimiter = '\t'
187
- filename = output_dir / f'{utt_id}_p_{prompt_result["prompt_name"]}_semantic_token.tsv'
188
- with open(filename, 'w', encoding='utf-8') as writer:
189
- for row in semantic_data:
190
- line = delimiter.join(row)
191
- writer.write(line + '\n')
192
- # reset semantic_data for the next sentence
193
- semantic_data = [['item_name', 'semantic_audio']]
194
-
195
-
196
- if __name__ == "__main__":
197
- main()
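The --text_file argument above expects one 'utt_id sentence' pair per line, the same layout as the deleted AR/exps/text.txt further down; note that the loop iterates over sentences[1:], so the first line of the file is skipped. A minimal way to produce such a file (the file name and the placeholder first line are made up):

lines = [
    '000 placeholder, skipped because the loop starts at sentences[1:]',
    '001 Life was like a box of chocolates, you never know what you are gonna get.',
    '002 With great power there must come great responsibility.',
]
with open('sentences.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(lines) + '\n')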
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/test.py DELETED
@@ -1,139 +0,0 @@
1
- # test from dump file
2
- import argparse
3
- import time
4
- from pathlib import Path
5
-
6
- import numpy as np
7
- import torch
8
- from AR.data.dataset import Text2SemanticDataset
9
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
10
- from AR.utils.io import load_yaml_config
11
- from torch.utils.data import DataLoader
12
-
13
-
14
- def parse_args():
15
- # parse args and config
16
- parser = argparse.ArgumentParser(
17
- description="Run SoundStorm AR S1 model for test set.")
18
-
19
- parser.add_argument(
20
- '--config_file',
21
- type=str,
22
- default='conf/default.yaml',
23
- help='path of config file')
24
-
25
- # args for dataset
26
- parser.add_argument(
27
- '--test_semantic_path',
28
- type=str,
29
- default='dump/test/semantic_token.tsv')
30
- parser.add_argument(
31
- '--test_phoneme_path', type=str, default='dump/test/phonemes.npy')
32
-
33
- parser.add_argument(
34
- '--ckpt_path',
35
- type=str,
36
- default='exp/default/ckpt/epoch=99-step=49000.ckpt',
37
- help='Checkpoint file of SoundStorm AR S1 model.')
38
-
39
- parser.add_argument("--output_dir", type=str, help="output dir.")
40
-
41
- args = parser.parse_args()
42
- return args
43
-
44
-
45
- def main():
46
- args = parse_args()
47
-
48
- config = load_yaml_config(args.config_file)
49
-
50
- output_dir = Path(args.output_dir)
51
- output_dir.mkdir(parents=True, exist_ok=True)
52
-
53
- batch_size = 1
54
- hz = 50
55
- max_sec = config['data']['max_sec']
56
-
57
- # get dataset
58
- test_dataset = Text2SemanticDataset(
59
- phoneme_path=args.test_phoneme_path,
60
- semantic_path=args.test_semantic_path,
61
- # max_sec should match the training setting, otherwise quality may drop (repetitions, dropped words, etc.)
62
- # but setting it too short here filters out long samples; to avoid that, truncate at infer time instead
63
- max_sec=100,
64
- max_sample=8,
65
- pad_val=config['data']['pad_val'])
66
- # get model
67
- t2s_model = Text2SemanticLightningModule.load_from_checkpoint(
68
- checkpoint_path=args.ckpt_path, config=config)
69
- t2s_model.cuda()
70
- t2s_model.eval()
71
-
72
- # fetch batch_size items
73
- # build the DataLoader and pass the collate_fn
74
- dataloader = DataLoader(
75
- test_dataset,
76
- batch_size=batch_size,
77
- shuffle=False,
78
- collate_fn=test_dataset.collate)
79
-
80
- item_names = test_dataset.__get_item_names__()
81
-
82
- # read the data batch by batch; with bs=1 and shuffle=False the batches line up with __get_item_names__()
83
- semantic_data = [['item_name', 'semantic_audio']]
84
- for i, batch in enumerate(dataloader):
85
- # requires bs = 1
86
- utt_id = item_names[i]
87
- if i == 0:
88
- print("utt_id:", utt_id)
89
- # zero-padding would be added when bs > 1
90
- # keep consistent with validation_step()
91
- semantic_len = batch['semantic_ids'].size(1)
92
- # use the first 150 tokens of batch['semantic_ids'] as the prompt
93
- # across multiple syntheses, the first prompt_len tokens are identical and equal to the prompt
94
- prompt_len = min(int(semantic_len * 0.5), 150)
95
- # what should the prompt be for plain-text input? => see t2s.py
96
- prompt = batch['semantic_ids'][:, :prompt_len]
97
- # # zero prompt => still produces semantic tokens with the right text content, but the timbre is scrambled
98
- # which shows the semantic tokens still carry timbre information
99
- # prompt = torch.ones(
100
- # batch['semantic_ids'].size(0), 1, dtype=torch.int32) * 0
101
- # print("prompt:", prompt)
102
- # print("prompt.shape:", prompt.shape)
103
- np.save(output_dir / 'prompt.npy', prompt.detach().cpu().numpy())
104
-
105
- st = time.time()
106
- with torch.no_grad():
107
- # calculate acc for test
108
- loss, acc = t2s_model.model.forward(
109
- batch['phoneme_ids'].cuda(),
110
- batch['phoneme_ids_len'].cuda(),
111
- batch['semantic_ids'].cuda(),
112
- batch['semantic_ids_len'].cuda())
113
- print("top_3_acc of this batch:", acc)
114
- pred_semantic = t2s_model.model.infer(
115
- batch['phoneme_ids'].cuda(),
116
- batch['phoneme_ids_len'].cuda(),
117
- prompt.cuda(),
118
- top_k=config['inference']['top_k'],
119
- # hz * max_sec in train dataloader
120
- # the generated length is 1002, so there should be some padding
121
- early_stop_num=hz * max_sec)
122
- # bs = 1
123
- pred_semantic = pred_semantic[0]
124
- print(f'{time.time() - st} sec used in T2S')
125
- semantic_token = pred_semantic.detach().cpu().numpy().tolist()
126
- semantic_token_str = ' '.join(str(x) for x in semantic_token)
127
- semantic_data.append([utt_id, semantic_token_str])
128
- else:
129
- break
130
- delimiter = '\t'
131
- filename = output_dir / "semantic_token.tsv"
132
- with open(filename, 'w', encoding='utf-8') as writer:
133
- for row in semantic_data:
134
- line = delimiter.join(row)
135
- writer.write(line + '\n')
136
-
137
-
138
- if __name__ == "__main__":
139
- main()
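The semantic_token.tsv written above is tab separated, with an 'item_name'/'semantic_audio' header row and space-joined token ids. Reading it back (the output path is hypothetical):

import csv

with open('output/semantic_token.tsv', newline='', encoding='utf-8') as f:
    rows = list(csv.reader(f, delimiter='\t'))

header, data = rows[0], rows[1:]
semantic = {utt_id: [int(t) for t in tok_str.split()] for utt_id, tok_str in data}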
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/text.txt DELETED
@@ -1,10 +0,0 @@
1
- 001 Life was like a box of chocolates, you never know what you're gonna get.
2
- 002 With great power there must come great responsibility.
3
- 003 To be or not to be, that’s a question.
4
- 004 A man can be destroyed but not defeated
5
- 005 Do not, for one repulse, give up the purpose that you resolved to effort.
6
- 006 Death is just a part of life, something we're all destined to do.
7
- 007 I think it's hard winning a war with words.
8
- 008 Don’t argue with the people of strong determination, because they may change the fact!
9
- 009 Love you three thousand times.
10
- 010 tidy tiger tied a tie tighter to tidy her tiny tall.
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/train.py DELETED
@@ -1,103 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py
2
- import argparse
3
- import logging
4
- import os
5
- from pathlib import Path
6
-
7
- import torch
8
- from pytorch_lightning import seed_everything
9
- from pytorch_lightning import Trainer
10
- from pytorch_lightning.callbacks import ModelCheckpoint
11
- from pytorch_lightning.loggers import WandbLogger
12
- from pytorch_lightning.strategies import DDPStrategy
13
- from AR.data.data_module import Text2SemanticDataModule
14
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
15
- from soundstorm.utils.io import load_yaml_config
16
- logging.getLogger('numba').setLevel(logging.WARNING)
17
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
18
- torch.set_float32_matmul_precision('high')
19
- from soundstorm.utils import get_newest_ckpt
20
-
21
-
22
- def main(args):
23
- output_dir = Path(args.output_dir)
24
- output_dir.mkdir(parents=True, exist_ok=True)
25
-
26
- ckpt_dir = output_dir / 'ckpt'
27
- ckpt_dir.mkdir(parents=True, exist_ok=True)
28
-
29
- config = load_yaml_config(args.config_file)
30
-
31
- seed_everything(config["train"]["seed"], workers=True)
32
- ckpt_callback: ModelCheckpoint = ModelCheckpoint(
33
- save_top_k=-1,
34
- save_on_train_epoch_end=False,
35
- every_n_epochs=config["train"]["save_every_n_epoch"],
36
- dirpath=ckpt_dir)
37
- logger = WandbLogger(
38
- project="AR_S1",
39
- name=output_dir.stem,
40
- save_dir=output_dir,
41
- # resume the loss curve
42
- resume=True,
43
- # id='k19kvsq8'
44
- )
45
- trainer: Trainer = Trainer(
46
- max_epochs=config["train"]["epochs"],
47
- accelerator='gpu',
48
- devices=-1,
49
- benchmark=False,
50
- fast_dev_run=False,
51
- strategy=DDPStrategy(find_unused_parameters=True),
52
- precision=config["train"]["precision"],
53
- logger=logger,
54
- callbacks=[ckpt_callback])
55
-
56
- model: Text2SemanticLightningModule = Text2SemanticLightningModule(
57
- config, output_dir)
58
-
59
- data_module: Text2SemanticDataModule = Text2SemanticDataModule(
60
- config,
61
- train_semantic_path=args.train_semantic_path,
62
- train_phoneme_path=args.train_phoneme_path,
63
- dev_semantic_path=args.dev_semantic_path,
64
- dev_phoneme_path=args.dev_phoneme_path)
65
-
66
- try:
67
- # use a regex to match the numeric part of the filenames and sort by the number
68
- newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir))
69
- ckpt_path = ckpt_dir / newest_ckpt_name
70
- except Exception:
71
- ckpt_path = None
72
- print("ckpt_path:", ckpt_path)
73
- trainer.fit(model, data_module, ckpt_path=ckpt_path)
74
-
75
-
76
- # srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml
77
- if __name__ == '__main__':
78
- parser = argparse.ArgumentParser()
79
- parser.add_argument(
80
- '--config_file',
81
- type=str,
82
- default='conf/default.yaml',
83
- help='path of config file')
84
- # args for dataset
85
- parser.add_argument(
86
- '--train_semantic_path',
87
- type=str,
88
- default='dump/train/semantic_token.tsv')
89
- parser.add_argument(
90
- '--train_phoneme_path', type=str, default='dump/train/phonemes.npy')
91
- parser.add_argument(
92
- '--dev_semantic_path', type=str, default='dump/dev/semantic_token.tsv')
93
- parser.add_argument(
94
- '--dev_phoneme_path', type=str, default='dump/dev/phonemes.npy')
95
- parser.add_argument(
96
- '--output_dir',
97
- type=str,
98
- default='exp/default',
99
- help='directory to save the results')
100
-
101
- args = parser.parse_args()
102
- logging.info(str(args))
103
- main(args)
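get_newest_ckpt comes from soundstorm.utils and is not shown in this diff; per the comment above it matches the numbers in the checkpoint filenames and sorts by them. Also note the srun example comment still says --path-to-configuration, while the parser only defines --config_file. A hypothetical stand-in with the intended behaviour:

import re

def newest_ckpt(names):
    # pick the file whose last embedded number (e.g. the step count) is largest
    def last_number(name):
        nums = re.findall(r'\d+', name)
        return int(nums[-1]) if nums else -1
    return max(names, key=last_number)

assert newest_ckpt(['epoch=9-step=4900.ckpt', 'epoch=99-step=49000.ckpt']) == 'epoch=99-step=49000.ckpt'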
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/exps/train_librilight_6k.py DELETED
@@ -1,170 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py
2
- import argparse
3
- import logging
4
- import os
5
- from pathlib import Path
6
-
7
- import torch
8
- from pytorch_lightning import seed_everything
9
- from pytorch_lightning import Trainer
10
- from pytorch_lightning.callbacks import ModelCheckpoint
11
- from pytorch_lightning.loggers import WandbLogger
12
- from pytorch_lightning.strategies import DDPStrategy
13
- from AR.data.data_module_librilight_6k import Text2SemanticDataModule
14
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
15
- from soundstorm.utils import get_newest_ckpt
16
- from soundstorm.utils.io import load_yaml_config
17
-
18
- logging.getLogger('numba').setLevel(logging.WARNING)
19
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
- torch.set_float32_matmul_precision('high')
21
-
22
-
23
- def main(args):
24
- output_dir = Path(args.output_dir)
25
- output_dir.mkdir(parents=True, exist_ok=True)
26
-
27
- ckpt_dir = output_dir / 'ckpt'
28
- ckpt_dir.mkdir(parents=True, exist_ok=True)
29
-
30
- config = load_yaml_config(args.config_file)
31
-
32
- seed_everything(config["train"]["seed"], workers=True)
33
-
34
- ckpt_callback: ModelCheckpoint = ModelCheckpoint(
35
- save_top_k=-1,
36
- save_on_train_epoch_end=False,
37
- every_n_train_steps=config["train"]["every_n_train_steps"],
38
- dirpath=ckpt_dir)
39
- logger = WandbLogger(
40
- project="AR_S1_LibriLight",
41
- name=output_dir.stem,
42
- save_dir=output_dir,
43
- # resume the loss curve
44
- resume=True,
45
- # id='k19kvsq8'
46
- )
47
- trainer: Trainer = Trainer(
48
- max_epochs=config["train"]["epochs"],
49
- accelerator='gpu',
50
- devices=-1,
51
- benchmark=False,
52
- fast_dev_run=False,
53
- strategy=DDPStrategy(find_unused_parameters=True),
54
- precision=config["train"]["precision"],
55
- logger=logger,
56
- callbacks=[ckpt_callback])
57
-
58
- model: Text2SemanticLightningModule = Text2SemanticLightningModule(
59
- config, output_dir)
60
-
61
- data_module: Text2SemanticDataModule = Text2SemanticDataModule(
62
- config,
63
- train_semantic_dirs=args.train_semantic_dirs,
64
- train_phoneme_dirs=args.train_phoneme_dirs,
65
- dev_semantic_dirs=args.dev_semantic_dirs,
66
- dev_phoneme_dirs=args.dev_phoneme_dirs,
67
- train_non_speech_dirs=args.train_non_speech_dirs,
68
- dev_non_speech_dirs=args.dev_non_speech_dirs)
69
- try:
70
- newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir))
71
- ckpt_path = ckpt_dir / newest_ckpt_name
72
- except Exception:
73
- ckpt_path = None
74
-
75
- print("ckpt_path:", ckpt_path)
76
- trainer.fit(model, data_module, ckpt_path=ckpt_path)
77
-
78
-
79
- # srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml
80
- if __name__ == '__main__':
81
- parser = argparse.ArgumentParser()
82
- parser.add_argument(
83
- '--config_file',
84
- type=str,
85
- default='conf/default.yaml',
86
- help='path of config file')
87
- # args for dataset
88
- parser.add_argument(
89
- '--train_semantic_dirs',
90
- type=list,
91
- nargs='+',
92
- default=["dump/small/train/"],
93
- help='dirs of train semantic')
94
- parser.add_argument(
95
- '--train_phoneme_dirs',
96
- type=list,
97
- nargs='+',
98
- default=["dump/small/train/"],
99
- help='dirs of train phoneme')
100
- parser.add_argument(
101
- '--dev_semantic_dirs',
102
- type=list,
103
- nargs='+',
104
- default=["dump/small/dev/"],
105
- help='dirs of dev semantic')
106
- parser.add_argument(
107
- '--dev_phoneme_dirs',
108
- type=list,
109
- nargs='+',
110
- default=["dump/small/dev/"],
111
- help='dirs of dev phoneme')
112
- parser.add_argument(
113
- '--output_dir',
114
- type=str,
115
- default='exp/default',
116
- help='directory to save the results')
117
-
118
- parser.add_argument(
119
- '--train_non_speech_dirs',
120
- type=list,
121
- nargs='+',
122
- default=None,
123
- help='dirs of train non_speech data')
124
-
125
- parser.add_argument(
126
- '--dev_non_speech_dirs',
127
- type=list,
128
- nargs='+',
129
- default=None,
130
- help='dirs of dev non_speech data')
131
-
132
- args = parser.parse_args()
133
-
134
- new_train_semantic_dirs = []
135
- new_train_phoneme_dirs = []
136
- new_dev_semantic_dirs = []
137
- new_dev_phoneme_dirs = []
138
-
139
- new_train_non_speech_dirs = []
140
- new_dev_non_speech_dirs = []
141
-
142
- # format dataset dirs
143
- for item in args.train_semantic_dirs:
144
- new_train_semantic_dirs.append(''.join(item))
145
- args.train_semantic_dirs = new_train_semantic_dirs
146
-
147
- for item in args.train_phoneme_dirs:
148
- new_train_phoneme_dirs.append(''.join(item))
149
- args.train_phoneme_dirs = new_train_phoneme_dirs
150
-
151
- for item in args.dev_semantic_dirs:
152
- new_dev_semantic_dirs.append(''.join(item))
153
- args.dev_semantic_dirs = new_dev_semantic_dirs
154
-
155
- for item in args.dev_phoneme_dirs:
156
- new_dev_phoneme_dirs.append(''.join(item))
157
- args.dev_phoneme_dirs = new_dev_phoneme_dirs
158
-
159
- if args.train_non_speech_dirs is not None:
160
- for item in args.train_non_speech_dirs:
161
- new_train_non_speech_dirs.append(''.join(item))
162
- args.train_non_speech_dirs = new_train_non_speech_dirs
163
-
164
- if args.dev_non_speech_dirs is not None:
165
- for item in args.dev_non_speech_dirs:
166
- new_dev_non_speech_dirs.append(''.join(item))
167
- args.dev_non_speech_dirs = new_dev_non_speech_dirs
168
-
169
- logging.info(str(args))
170
- main(args)
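The ''.join(item) loops above are needed because the directory arguments are declared with type=list and nargs='+': argparse applies list() to every token, so each path arrives as a list of single characters. A small demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_semantic_dirs', type=list, nargs='+', default=['dump/small/train/'])
args = parser.parse_args(['--train_semantic_dirs', 'dump/small/train/', 'dump/medium/train/'])

print(args.train_semantic_dirs[0])            # ['d', 'u', 'm', 'p', '/', 's', ...]
print(''.join(args.train_semantic_dirs[0]))   # 'dump/small/train/'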
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/__init__.py DELETED
File without changes
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/t2s_lightning_module.py DELETED
@@ -1,128 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py
2
- import os,sys
3
- now_dir = os.getcwd()
4
- sys.path.append(now_dir)
5
- from typing import Dict
6
-
7
- import torch
8
- from pytorch_lightning import LightningModule
9
- from AR.models.t2s_model import Text2SemanticDecoder
10
- from AR.modules.lr_schedulers import WarmupCosineLRSchedule
11
- from AR.modules.optim import ScaledAdam
12
-
13
-
14
- class Text2SemanticLightningModule(LightningModule):
15
- def __init__(self, config, output_dir,is_train=True):
16
- super().__init__()
17
- self.config = config
18
- self.top_k = 3
19
- self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
20
- pretrained_s1=config.get("pretrained_s1")
21
- if(pretrained_s1 and is_train):
22
- # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
23
- print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["weight"]))
24
- if is_train:
25
- self.automatic_optimization = False
26
- self.save_hyperparameters()
27
- self.eval_dir = output_dir / 'eval'
28
- self.eval_dir.mkdir(parents=True, exist_ok=True)
29
-
30
- def training_step(self, batch: Dict, batch_idx: int):
31
-
32
- opt = self.optimizers()
33
- scheduler = self.lr_schedulers()
34
- loss, acc = self.model.forward(
35
- batch['phoneme_ids'], batch['phoneme_ids_len'],
36
- batch['semantic_ids'], batch['semantic_ids_len'],
37
- batch['bert_feature'])
38
- self.manual_backward(loss)
39
- if batch_idx > 0 and batch_idx % 4 == 0:
40
- opt.step()
41
- opt.zero_grad()
42
- scheduler.step()
43
-
44
- self.log(
45
- "total_loss",
46
- loss,
47
- on_step=True,
48
- on_epoch=True,
49
- prog_bar=True,
50
- sync_dist=True)
51
- self.log(
52
- "lr",
53
- scheduler.get_last_lr()[0],
54
- on_epoch=True,
55
- prog_bar=True,
56
- sync_dist=True)
57
- self.log(
58
- f"top_{self.top_k}_acc",
59
- acc,
60
- on_step=True,
61
- on_epoch=True,
62
- prog_bar=True,
63
- sync_dist=True)
64
-
65
- def validation_step(self, batch: Dict, batch_idx: int):return
66
- # # get loss
67
- # loss, acc = self.model.forward(
68
- # batch['phoneme_ids'], batch['phoneme_ids_len'],
69
- # batch['semantic_ids'], batch['semantic_ids_len'],
70
- # batch['bert_feature']
71
- # )
72
- #
73
- # self.log(
74
- # "val_total_loss",
75
- # loss,
76
- # on_step=True,
77
- # on_epoch=True,
78
- # prog_bar=True,
79
- # sync_dist=True)
80
- # self.log(
81
- # f"val_top_{self.top_k}_acc",
82
- # acc,
83
- # on_step=True,
84
- # on_epoch=True,
85
- # prog_bar=True,
86
- # sync_dist=True)
87
- #
88
- # # get infer output
89
- # semantic_len = batch['semantic_ids'].size(1)
90
- # prompt_len = min(int(semantic_len * 0.5), 150)
91
- # prompt = batch['semantic_ids'][:, :prompt_len]
92
- # pred_semantic = self.model.infer(batch['phoneme_ids'],
93
- # batch['phoneme_ids_len'], prompt,
94
- # batch['bert_feature']
95
- # )
96
- # save_name = f'semantic_toks_{batch_idx}.pt'
97
- # save_path = os.path.join(self.eval_dir, save_name)
98
- # torch.save(pred_semantic.detach().cpu(), save_path)
99
-
100
- def configure_optimizers(self):
101
- model_parameters = self.model.parameters()
102
- parameters_names = []
103
- parameters_names.append([
104
- name_param_pair[0]
105
- for name_param_pair in self.model.named_parameters()
106
- ])
107
- lm_opt = ScaledAdam(
108
- model_parameters,
109
- lr=0.01,
110
- betas=(0.9, 0.95),
111
- clipping_scale=2.0,
112
- parameters_names=parameters_names,
113
- show_dominant_parameters=False,
114
- clipping_update_period=1000, )
115
-
116
- return {
117
- "optimizer": lm_opt,
118
- "lr_scheduler": {
119
- "scheduler":
120
- WarmupCosineLRSchedule(
121
- lm_opt,
122
- init_lr=self.config['optimizer']['lr_init'],
123
- peak_lr=self.config['optimizer']['lr'],
124
- end_lr=self.config['optimizer']['lr_end'],
125
- warmup_steps=self.config['optimizer']['warmup_steps'],
126
- total_steps=self.config['optimizer']['decay_steps'])
127
- }
128
- }
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/t2s_model.py DELETED
@@ -1,298 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_model.py
2
- import torch
3
- from tqdm import tqdm
4
-
5
- from AR.models.utils import make_pad_mask
6
- from AR.models.utils import topk_sampling,sample,logits_to_probs,multinomial_sample_one_no_sync
7
- from AR.modules.embedding import SinePositionalEmbedding
8
- from AR.modules.embedding import TokenEmbedding
9
- from AR.modules.transformer import LayerNorm
10
- from AR.modules.transformer import TransformerEncoder
11
- from AR.modules.transformer import TransformerEncoderLayer
12
- from torch import nn
13
- from torch.nn import functional as F
14
- from torchmetrics.classification import MulticlassAccuracy
15
-
16
- default_config = {
17
- "embedding_dim": 512,
18
- "hidden_dim": 512,
19
- "num_head": 8,
20
- "num_layers": 12,
21
- "num_codebook": 8,
22
- "p_dropout": 0.0,
23
- "vocab_size": 1024 + 1,
24
- "phoneme_vocab_size": 512,
25
- "EOS": 1024
26
- }
27
-
28
-
29
- class Text2SemanticDecoder(nn.Module):
30
- def __init__(self, config, norm_first=False, top_k=3):
31
- super(Text2SemanticDecoder, self).__init__()
32
- self.model_dim = config['model']["hidden_dim"]
33
- self.embedding_dim = config['model']["embedding_dim"]
34
- self.num_head = config['model']["head"]
35
- self.num_layers = config['model']["n_layer"]
36
- self.norm_first = norm_first
37
- self.vocab_size = config['model']["vocab_size"]
38
- self.phoneme_vocab_size = config['model']["phoneme_vocab_size"]
39
- self.p_dropout = config['model']["dropout"]
40
- self.EOS = config['model']["EOS"]
41
- self.norm_first = norm_first
42
- assert self.EOS == self.vocab_size - 1
43
- # should be same as num of kmeans bin
44
- # assert self.EOS == 1024
45
- self.bert_proj = nn.Linear(1024, self.embedding_dim)
46
- self.ar_text_embedding = TokenEmbedding(
47
- self.embedding_dim, self.phoneme_vocab_size, self.p_dropout)
48
- self.ar_text_position = SinePositionalEmbedding(
49
- self.embedding_dim, dropout=0.1, scale=False, alpha=True)
50
- self.ar_audio_embedding = TokenEmbedding(
51
- self.embedding_dim, self.vocab_size, self.p_dropout)
52
- self.ar_audio_position = SinePositionalEmbedding(
53
- self.embedding_dim, dropout=0.1, scale=False, alpha=True)
54
-
55
- self.h = TransformerEncoder(
56
- TransformerEncoderLayer(
57
- d_model=self.model_dim,
58
- nhead=self.num_head,
59
- dim_feedforward=self.model_dim * 4,
60
- dropout=0.1,
61
- batch_first=True,
62
- norm_first=norm_first, ),
63
- num_layers=self.num_layers,
64
- norm=LayerNorm(self.model_dim) if norm_first else None, )
65
-
66
- self.ar_predict_layer = nn.Linear(
67
- self.model_dim, self.vocab_size, bias=False)
68
- self.loss_fct = nn.CrossEntropyLoss(reduction='sum')
69
-
70
- self.ar_accuracy_metric = MulticlassAccuracy(
71
- self.vocab_size,
72
- top_k=top_k,
73
- average="micro",
74
- multidim_average="global",
75
- ignore_index=self.EOS, )
76
-
77
- def forward(self, x, x_lens, y, y_lens, bert_feature):
78
- '''
79
- x: phoneme_ids
80
- y: semantic_ids
81
- '''
82
- x = self.ar_text_embedding(x)
83
- x = x + self.bert_proj(bert_feature.transpose(1,2))
84
- x = self.ar_text_position(x)
85
- x_mask = make_pad_mask(x_lens)
86
-
87
- y_mask = make_pad_mask(y_lens)
88
- y_mask_int = y_mask.type(torch.int64)
89
- codes = y.type(torch.int64) * (1 - y_mask_int)
90
-
91
- # Training
92
- # AR Decoder
93
- y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS)
94
- x_len = x_lens.max()
95
- y_len = y_lens.max()
96
- y_emb = self.ar_audio_embedding(y)
97
- y_pos = self.ar_audio_position(y_emb)
98
-
99
- xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
100
- ar_xy_padding_mask = xy_padding_mask
101
-
102
- x_attn_mask = F.pad(
103
- torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device),
104
- (0, y_len),
105
- value=True, )
106
- y_attn_mask = F.pad(
107
- torch.triu(
108
- torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
109
- diagonal=1, ),
110
- (x_len, 0),
111
- value=False, )
112
- xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
113
- bsz, src_len = x.shape[0], x_len + y_len
114
- _xy_padding_mask = (ar_xy_padding_mask.view(bsz, 1, 1, src_len)
115
- .expand(-1, self.num_head, -1, -1)
116
- .reshape(bsz * self.num_head, 1, src_len))
117
- xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask)
118
- new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
119
- new_attn_mask.masked_fill_(xy_attn_mask, float("-inf"))
120
- xy_attn_mask = new_attn_mask
121
- # x and the full y are fed to the model in a single pass
122
- xy_pos = torch.concat([x, y_pos], dim=1)
123
- xy_dec, _ = self.h(
124
- (xy_pos, None),
125
- mask=xy_attn_mask, )
126
- logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1)
127
- # loss
128
- # from feiteng: the more frames (duration) per step, the larger the gradient update should be, hence reduction='sum'
129
- loss = F.cross_entropy(logits, targets, reduction='sum')
130
- acc = self.ar_accuracy_metric(logits.detach(), targets).item()
131
- return loss, acc
132
-
133
- # TODO: check how this differs from forward, and what prompts should be when no semantic tokens are available
134
- def infer(self,
135
- x,
136
- x_lens,
137
- prompts,
138
- bert_feature,
139
- top_k: int=-100,
140
- early_stop_num: int=-1,
141
- temperature: float=1.0):
142
-
143
- x = self.ar_text_embedding(x)
144
- x = x + self.bert_proj(bert_feature.transpose(1,2))
145
- x = self.ar_text_position(x)
146
-
147
- # AR Decoder
148
- y = prompts
149
- prefix_len = y.shape[1]
150
- x_len = x.shape[1]
151
- x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
152
- stop = False
153
- for _ in tqdm(range(1500)):
154
- y_emb = self.ar_audio_embedding(y)
155
- y_pos = self.ar_audio_position(y_emb)
156
- # feed x together with the growing y into the model
157
- xy_pos = torch.concat([x, y_pos], dim=1)
158
- y_len = y.shape[1]
159
- x_attn_mask_pad = F.pad(
160
- x_attn_mask,
161
- (0, y_len),
162
- value=True, )
163
- y_attn_mask = F.pad(
164
- torch.triu(
165
- torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
166
- (x_len, 0),
167
- value=False, )
168
- xy_attn_mask = torch.concat(
169
- [x_attn_mask_pad, y_attn_mask], dim=0).to(y.device)
170
-
171
- xy_dec, _ = self.h(
172
- (xy_pos, None),
173
- mask=xy_attn_mask, )
174
- logits = self.ar_predict_layer(xy_dec[:, -1])
175
- samples = topk_sampling(
176
- logits, top_k=top_k, top_p=1.0, temperature=temperature)
177
-
178
- if early_stop_num != -1 and (y.shape[1] - prefix_len
179
- ) > early_stop_num:
180
- print("use early stop num:", early_stop_num)
181
- stop = True
182
-
183
- if torch.argmax(
184
- logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
185
- # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
186
- stop = True
187
- if stop:
188
- if prompts.shape[1] == y.shape[1]:
189
- y = torch.concat([y, torch.zeros_like(samples)], dim=1)
190
- print('bad zero prediction')
191
- print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
192
- break
193
- # the newly generated semantic_ids are appended to the previous y to form the new y
194
- # print(samples.shape)  # [1, 1]; the first 1 is the batch size
195
- # import os
196
- # os._exit(2333)
197
- y = torch.concat([y, samples], dim=1)
198
- return y
199
-
200
- def pad_y_eos(self, y, y_mask_int, eos_id):
201
- targets = F.pad(
202
- y, (0, 1), value=0) + eos_id * F.pad(
203
- y_mask_int, (0, 1), value=1)
204
- # shift by one position
205
- return targets[:, :-1], targets[:, 1:]
206
-
207
- def infer_panel(self,
208
- x,  ##### all text tokens
209
- x_lens,
210
- prompts,  #### reference audio tokens
211
- bert_feature,
212
- top_k: int=-100,
213
- early_stop_num: int=-1,
214
- temperature: float=1.0):
215
-
216
- x = self.ar_text_embedding(x)
217
- x = x + self.bert_proj(bert_feature.transpose(1,2))
218
- x = self.ar_text_position(x)
219
-
220
- # AR Decoder
221
- y = prompts
222
- prefix_len = y.shape[1]
223
- x_len = x.shape[1]
224
- x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
225
- stop = False
226
- # print(1111111,self.num_layers)
227
- cache={
228
- "all_stage":self.num_layers,
229
- "k":[None]*self.num_layers,###根据配置自己手写
230
- "v":[None]*self.num_layers,
231
- # "xy_pos":None,##y_pos位置编码每次都不一样的没法缓存,每次都要重新拼xy_pos.主要还是写法原因,其实是可以历史统一一样的,但也没啥计算量就不管了
232
- "y_emb":None,##只需要对最新的samples求emb,再拼历史的就行
233
- # "logits":None,###原版就已经只对结尾求再拼接了,不用管
234
- # "xy_dec":None,###不需要,本来只需要最后一个做logits
235
- "first_infer":1,
236
- "stage":0
237
- }
238
- for idx in tqdm(range(1500)):
239
- if(cache["first_infer"]==1):
240
- y_emb = self.ar_audio_embedding(y)
241
- else:
242
- y_emb = torch.cat([cache["y_emb"],self.ar_audio_embedding(y[:,-1:])],1)
243
- cache["y_emb"]=y_emb
244
- y_pos = self.ar_audio_position(y_emb)
245
- # feed x together with the growing y into the model
246
- if(cache["first_infer"]==1):
247
- xy_pos = torch.concat([x, y_pos], dim=1)
248
- else:
249
- xy_pos=y_pos[:,-1:]
250
- y_len = y_pos.shape[1]
251
- ### the following three are not cached
252
- if (cache["first_infer"] == 1):
253
- x_attn_mask_pad = F.pad(
254
- x_attn_mask,
255
- (0, y_len),  ### extend the all-zero xx block into all-zero xx plus all-one xy; shape (x, x+y)
256
- value=True, )
257
- y_attn_mask = F.pad(  ### extend yy's upper-right ones with zeros for xy on the left; shape (y, x+y)
258
- torch.triu(
259
- torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
260
- (x_len, 0),
261
- value=False, )
262
- xy_attn_mask = torch.concat(
263
- [x_attn_mask_pad, y_attn_mask], dim=0).to(y.device)
264
- else:
265
- ### only the rightmost column (this is wrong)
266
- # xy_attn_mask=torch.ones((1, x_len+y_len), dtype=torch.bool,device=xy_pos.device)
267
- # xy_attn_mask[:,-1]=False
268
- ### only the bottom row (this is correct)
269
- xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool, device=xy_pos.device)
270
- # pdb.set_trace()
271
- ### the key part of the caching
272
- # print(1111,xy_pos.shape,xy_attn_mask.shape,x_len,y_len)
273
- xy_dec, _ = self.h(
274
- (xy_pos, None),
275
- mask=xy_attn_mask,cache=cache )
276
- logits = self.ar_predict_layer(xy_dec[:, -1])  ## no change needed: with the cache there is only one frame by default, so taking the last frame gives the same result
277
- # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
278
- samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
279
- if early_stop_num != -1 and (y.shape[1] - prefix_len
280
- ) > early_stop_num:
281
- print("use early stop num:", early_stop_num)
282
- stop = True
283
-
284
- if torch.argmax(
285
- logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
286
- # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
287
- stop = True
288
- if stop:
289
- if prompts.shape[1] == y.shape[1]:
290
- y = torch.concat([y, torch.zeros_like(samples)], dim=1)
291
- print('bad zero prediction')
292
- print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
293
- break
294
- # the newly generated semantic_ids are appended to the previous y to form the new y
295
- # print(samples.shape)#[1,1]#第一个1是bs
296
- y = torch.concat([y, samples], dim=1)
297
- cache["first_infer"]=0
298
- return y,idx
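A standalone sketch of the attention mask assembled in forward() and infer() above: the text prefix x may only attend within x, while y attends to all of x plus a causal prefix of itself (True means a masked position, as in the code):

import torch
import torch.nn.functional as F

x_len, y_len = 3, 4
x_attn_mask = F.pad(
    torch.zeros((x_len, x_len), dtype=torch.bool), (0, y_len), value=True)
y_attn_mask = F.pad(
    torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
    (x_len, 0), value=False)
xy_attn_mask = torch.cat([x_attn_mask, y_attn_mask], dim=0)

print(xy_attn_mask.int())
# rows 0..2 (x) see only x; rows 3..6 (y) see all of x and a causal prefix of y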
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/models/utils.py DELETED
@@ -1,164 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/utils.py\
2
- import torch
3
- import torch.nn.functional as F
4
- import torchaudio
5
-
6
-
7
- def sequence_mask(length, max_length=None):
8
- if max_length is None:
9
- max_length = length.max()
10
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
11
- return x.unsqueeze(0) < length.unsqueeze(1)
12
-
13
-
14
- def make_pad_mask(lengths: torch.Tensor, max_len: int=0) -> torch.Tensor:
15
- """
16
- Args:
17
- lengths:
18
- A 1-D tensor containing sentence lengths.
19
- max_len:
20
- The length of masks.
21
- Returns:
22
- Return a 2-D bool tensor, where masked positions
23
- are filled with `True` and non-masked positions are
24
- filled with `False`.
25
-
26
- #>>> lengths = torch.tensor([1, 3, 2, 5])
27
- #>>> make_pad_mask(lengths)
28
- tensor([[False, True, True, True, True],
29
- [False, False, False, True, True],
30
- [False, False, True, True, True],
31
- [False, False, False, False, False]])
32
- """
33
- assert lengths.ndim == 1, lengths.ndim
34
- max_len = max(max_len, lengths.max())
35
- n = lengths.size(0)
36
- seq_range = torch.arange(0, max_len, device=lengths.device)
37
- expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len)
38
-
39
- return expaned_lengths >= lengths.unsqueeze(-1)
40
-
41
-
42
- # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
43
- def top_k_top_p_filtering(logits,
44
- top_k=0,
45
- top_p=1.0,
46
- filter_value=-float("Inf"),
47
- min_tokens_to_keep=1):
48
- """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
49
- Args:
50
- logits: logits distribution shape (batch size, vocabulary size)
51
- if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
52
- if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
53
- Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
54
- Make sure we keep at least min_tokens_to_keep per batch example in the output
55
- From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
56
- """
57
- if top_k > 0:
58
- top_k = min(max(top_k, min_tokens_to_keep),
59
- logits.size(-1)) # Safety check
60
- # Remove all tokens with a probability less than the last token of the top-k
61
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
62
- logits[indices_to_remove] = filter_value
63
-
64
- if top_p < 1.0:
65
- sorted_logits, sorted_indices = torch.sort(logits, descending=True)
66
- cumulative_probs = torch.cumsum(
67
- F.softmax(sorted_logits, dim=-1), dim=-1)
68
-
69
- # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
70
- sorted_indices_to_remove = cumulative_probs > top_p
71
- if min_tokens_to_keep > 1:
72
- # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
73
- sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
74
- # Shift the indices to the right to keep also the first token above the threshold
75
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
76
- ..., :-1].clone()
77
- sorted_indices_to_remove[..., 0] = 0
78
-
79
- # scatter sorted tensors to original indexing
80
- indices_to_remove = sorted_indices_to_remove.scatter(
81
- 1, sorted_indices, sorted_indices_to_remove)
82
- logits[indices_to_remove] = filter_value
83
- return logits
84
-
85
-
86
- def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0):
87
- # temperature: (`optional`) float
88
- # The value used to module the next token probabilities. Must be strictly positive. Default to 1.0.
89
- # top_k: (`optional`) int
90
- # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
91
- # top_p: (`optional`) float
92
- # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
93
-
94
- # Temperature (higher temperature => more likely to sample low probability tokens)
95
- if temperature != 1.0:
96
- logits = logits / temperature
97
- # Top-p/top-k filtering
98
- logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
99
- # Sample
100
- token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
101
- return token
102
-
103
-
104
- from typing import Optional, Tuple
105
- def multinomial_sample_one_no_sync(
106
- probs_sort,
107
- ): # Does multinomial sampling without a cuda synchronization
108
- q = torch.empty_like(probs_sort).exponential_(1)
109
- return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
110
-
111
-
112
- def logits_to_probs(
113
- logits,
114
- previous_tokens: Optional[torch.Tensor] = None,
115
- temperature: float = 1.0,
116
- top_k: Optional[int] = None,
117
- top_p: Optional[int] = None,
118
- repetition_penalty: float = 1.0,
119
- ):
120
- previous_tokens=previous_tokens.squeeze()
121
- # print(logits.shape,previous_tokens.shape)
122
- # pdb.set_trace()
123
- if previous_tokens is not None and repetition_penalty != 1.0:
124
- previous_tokens = previous_tokens.long()
125
- score = torch.gather(logits, dim=0, index=previous_tokens)
126
- score = torch.where(
127
- score < 0, score * repetition_penalty, score / repetition_penalty
128
- )
129
- logits.scatter_(dim=0, index=previous_tokens, src=score)
130
-
131
- if top_p is not None and top_p < 1.0:
132
- sorted_logits, sorted_indices = torch.sort(logits, descending=True)
133
- cum_probs = torch.cumsum(
134
- torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
135
- )
136
- sorted_indices_to_remove = cum_probs > top_p
137
- sorted_indices_to_remove[0] = False # keep at least one option
138
- indices_to_remove = sorted_indices_to_remove.scatter(
139
- dim=0, index=sorted_indices, src=sorted_indices_to_remove
140
- )
141
- logits = logits.masked_fill(indices_to_remove, -float("Inf"))
142
-
143
- logits = logits / max(temperature, 1e-5)
144
-
145
- if top_k is not None:
146
- v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
147
- pivot = v.select(-1, -1).unsqueeze(-1)
148
- logits = torch.where(logits < pivot, -float("Inf"), logits)
149
-
150
- probs = torch.nn.functional.softmax(logits, dim=-1)
151
- return probs
152
-
153
-
154
- def sample(
155
- logits,
156
- previous_tokens: Optional[torch.Tensor] = None,
157
- **sampling_kwargs,
158
- ) -> Tuple[torch.Tensor, torch.Tensor]:
159
- probs = logits_to_probs(
160
- logits=logits, previous_tokens=previous_tokens, **sampling_kwargs
161
- )
162
- idx_next = multinomial_sample_one_no_sync(probs)
163
- return idx_next, probs
164
-
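Usage sketch for the samplers above, mirroring how t2s_model.py calls them: topk_sampling works on (batch, vocab) logits, while sample() takes a 1-D logits vector plus the previously generated tokens, which drive the repetition penalty:

import torch
from AR.models.utils import topk_sampling, sample

vocab_size = 1025
tok = topk_sampling(torch.randn(1, vocab_size), top_k=10, temperature=1.0)  # shape (1, 1)

history = torch.randint(0, vocab_size, (1, 8))          # previously generated ids
next_tok, probs = sample(
    torch.randn(vocab_size), history,
    top_k=10, top_p=1.0, repetition_penalty=1.35)
print(tok.shape, next_tok.shape, probs.shape)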
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/__init__.py DELETED
File without changes
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/activation.py DELETED
@@ -1,397 +0,0 @@
1
- # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
2
- from typing import Optional
3
- from typing import Tuple
4
- import torch
5
- from torch import Tensor
6
- from torch.nn import Linear
7
- from torch.nn import Module
8
- from torch.nn.init import constant_
9
- from torch.nn.init import xavier_normal_
10
- from torch.nn.init import xavier_uniform_
11
- from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
12
- from torch.nn.parameter import Parameter
13
-
14
- from torch.nn import functional as F
15
- from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched
16
- F.multi_head_attention_forward=multi_head_attention_forward_patched
17
-
18
- class MultiheadAttention(Module):
19
- r"""Allows the model to jointly attend to information
20
- from different representation subspaces as described in the paper:
21
- `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
22
-
23
- Multi-Head Attention is defined as:
24
-
25
- .. math::
26
- \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
27
-
28
- where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
29
-
30
- ``forward()`` will use a special optimized implementation if all of the following
31
- conditions are met:
32
-
33
- - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
34
- restriction will be loosened in the future.)
35
- - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
36
- - training is disabled (using ``.eval()``)
37
- - dropout is 0
38
- - ``add_bias_kv`` is ``False``
39
- - ``add_zero_attn`` is ``False``
40
- - ``batch_first`` is ``True`` and the input is batched
41
- - ``kdim`` and ``vdim`` are equal to ``embed_dim``
42
- - at most one of ``key_padding_mask`` or ``attn_mask`` is passed
43
- - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
44
- nor ``attn_mask`` is passed
45
-
46
- If the optimized implementation is in use, a
47
- `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
48
- ``query``/``key``/``value`` to represent padding more efficiently than using a
49
- padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
50
- will be returned, and an additional speedup proportional to the fraction of the input
51
- that is padding can be expected.
52
-
53
- Args:
54
- embed_dim: Total dimension of the model.
55
- num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
56
- across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
57
- dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
58
- bias: If specified, adds bias to input / output projection layers. Default: ``True``.
59
- add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
60
- add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
61
- Default: ``False``.
62
- kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
63
- vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
64
- batch_first: If ``True``, then the input and output tensors are provided
65
- as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
66
-
67
- Examples::
68
-
69
- >>> # xdoctest: +SKIP
70
- >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
71
- >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
72
-
73
- """
74
- __constants__ = ["batch_first"]
75
- bias_k: Optional[torch.Tensor]
76
- bias_v: Optional[torch.Tensor]
77
-
78
- def __init__(
79
- self,
80
- embed_dim,
81
- num_heads,
82
- dropout=0.0,
83
- bias=True,
84
- add_bias_kv=False,
85
- add_zero_attn=False,
86
- kdim=None,
87
- vdim=None,
88
- batch_first=False,
89
- linear1_cls=Linear,
90
- linear2_cls=Linear,
91
- device=None,
92
- dtype=None, ) -> None:
93
- factory_kwargs = {"device": device, "dtype": dtype}
94
- super(MultiheadAttention, self).__init__()
95
- self.embed_dim = embed_dim
96
- self.kdim = kdim if kdim is not None else embed_dim
97
- self.vdim = vdim if vdim is not None else embed_dim
98
- self._qkv_same_embed_dim = (self.kdim == embed_dim and
99
- self.vdim == embed_dim)
100
-
101
- self.num_heads = num_heads
102
- self.dropout = dropout
103
- self.batch_first = batch_first
104
- self.head_dim = embed_dim // num_heads
105
- assert (self.head_dim * num_heads == self.embed_dim
106
- ), "embed_dim must be divisible by num_heads"
107
-
108
- if add_bias_kv:
109
- self.bias_k = Parameter(
110
- torch.empty((1, 1, embed_dim), **factory_kwargs))
111
- self.bias_v = Parameter(
112
- torch.empty((1, 1, embed_dim), **factory_kwargs))
113
- else:
114
- self.bias_k = self.bias_v = None
115
-
116
- if linear1_cls == Linear:
117
- if not self._qkv_same_embed_dim:
118
- self.q_proj_weight = Parameter(
119
- torch.empty((embed_dim, embed_dim), **factory_kwargs))
120
- self.k_proj_weight = Parameter(
121
- torch.empty((embed_dim, self.kdim), **factory_kwargs))
122
- self.v_proj_weight = Parameter(
123
- torch.empty((embed_dim, self.vdim), **factory_kwargs))
124
- self.register_parameter("in_proj_weight", None)
125
- else:
126
- self.in_proj_weight = Parameter(
127
- torch.empty((3 * embed_dim, embed_dim), **factory_kwargs))
128
- self.register_parameter("q_proj_weight", None)
129
- self.register_parameter("k_proj_weight", None)
130
- self.register_parameter("v_proj_weight", None)
131
-
132
- if bias:
133
- self.in_proj_bias = Parameter(
134
- torch.empty(3 * embed_dim, **factory_kwargs))
135
- else:
136
- self.register_parameter("in_proj_bias", None)
137
- self.out_proj = NonDynamicallyQuantizableLinear(
138
- embed_dim, embed_dim, bias=bias, **factory_kwargs)
139
-
140
- self._reset_parameters()
141
- else:
142
- if not self._qkv_same_embed_dim:
143
- raise NotImplementedError
144
- else:
145
- self.in_proj_linear = linear1_cls(
146
- embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
147
- self.in_proj_weight = self.in_proj_linear.weight
148
-
149
- self.register_parameter("q_proj_weight", None)
150
- self.register_parameter("k_proj_weight", None)
151
- self.register_parameter("v_proj_weight", None)
152
-
153
- if bias:
154
- self.in_proj_bias = self.in_proj_linear.bias
155
- else:
156
- self.register_parameter("in_proj_bias", None)
157
-
158
- self.out_proj = linear2_cls(
159
- embed_dim, embed_dim, bias=bias, **factory_kwargs)
160
-
161
- if self.bias_k is not None:
162
- xavier_normal_(self.bias_k)
163
- if self.bias_v is not None:
164
- xavier_normal_(self.bias_v)
165
-
166
- self.add_zero_attn = add_zero_attn
167
-
168
- def _reset_parameters(self):
169
- if self._qkv_same_embed_dim:
170
- xavier_uniform_(self.in_proj_weight)
171
- else:
172
- xavier_uniform_(self.q_proj_weight)
173
- xavier_uniform_(self.k_proj_weight)
174
- xavier_uniform_(self.v_proj_weight)
175
-
176
- if self.in_proj_bias is not None:
177
- constant_(self.in_proj_bias, 0.0)
178
- constant_(self.out_proj.bias, 0.0)
179
-
180
- if self.bias_k is not None:
181
- xavier_normal_(self.bias_k)
182
- if self.bias_v is not None:
183
- xavier_normal_(self.bias_v)
184
-
185
- def __setstate__(self, state):
186
- # Support loading old MultiheadAttention checkpoints generated by v1.1.0
187
- if "_qkv_same_embed_dim" not in state:
188
- state["_qkv_same_embed_dim"] = True
189
-
190
- super(MultiheadAttention, self).__setstate__(state)
191
-
192
- def forward(
193
- self,
194
- query: Tensor,
195
- key: Tensor,
196
- value: Tensor,
197
- key_padding_mask: Optional[Tensor]=None,
198
- need_weights: bool=True,
199
- attn_mask: Optional[Tensor]=None,
200
- average_attn_weights: bool=True,cache=None
201
- ) -> Tuple[Tensor, Optional[Tensor]]:
202
- r"""
203
- Args:
204
- query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
205
- or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
206
- :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
207
- Queries are compared against key-value pairs to produce the output.
208
- See "Attention Is All You Need" for more details.
209
- key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
210
- or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
211
- :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
212
- See "Attention Is All You Need" for more details.
213
- value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
214
- ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
215
- sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
216
- See "Attention Is All You Need" for more details.
217
- key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
218
- to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
219
- Binary and byte masks are supported.
220
- For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
221
- the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
222
- need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
223
- Default: ``True``.
224
- attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
225
- :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
226
- :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
227
- broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
228
- Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
229
- corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
230
- corresponding position is not allowed to attend. For a float mask, the mask values will be added to
231
- the attention weight.
232
- average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
233
- heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
234
- effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
235
-
236
- Outputs:
237
- - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
238
- :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
239
- where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
240
- embedding dimension ``embed_dim``.
241
- - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
242
- returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
243
- :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
244
- :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
245
- head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
246
-
247
- .. note::
248
- `batch_first` argument is ignored for unbatched inputs.
249
- """
250
- is_batched = query.dim() == 3
251
- if key_padding_mask is not None:
252
- _kpm_dtype = key_padding_mask.dtype
253
- if _kpm_dtype != torch.bool and not torch.is_floating_point(
254
- key_padding_mask):
255
- raise AssertionError(
256
- "only bool and floating types of key_padding_mask are supported"
257
- )
258
- why_not_fast_path = ""
259
- if not is_batched:
260
- why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
261
- elif query is not key or key is not value:
262
- # When lifting this restriction, don't forget to either
263
- # enforce that the dtypes all match or test cases where
264
- # they don't!
265
- why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
266
- elif (self.in_proj_bias is not None and
267
- query.dtype != self.in_proj_bias.dtype):
268
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
269
- elif (self.in_proj_weight is not None and
270
- query.dtype != self.in_proj_weight.dtype):
271
- # this case will fail anyway, but at least they'll get a useful error message.
272
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
273
- elif self.training:
274
- why_not_fast_path = "training is enabled"
275
- elif not self.batch_first:
276
- why_not_fast_path = "batch_first was not True"
277
- elif self.bias_k is not None:
278
- why_not_fast_path = "self.bias_k was not None"
279
- elif self.bias_v is not None:
280
- why_not_fast_path = "self.bias_v was not None"
281
- elif self.dropout:
282
- why_not_fast_path = f"dropout was {self.dropout}, required zero"
283
- elif self.add_zero_attn:
284
- why_not_fast_path = "add_zero_attn was enabled"
285
- elif not self._qkv_same_embed_dim:
286
- why_not_fast_path = "_qkv_same_embed_dim was not True"
287
- elif attn_mask is not None:
288
- why_not_fast_path = "attn_mask was not None"
289
- elif query.is_nested and key_padding_mask is not None:
290
- why_not_fast_path = (
291
- "key_padding_mask is not supported with NestedTensor input")
292
- elif self.num_heads % 2 == 1:
293
- why_not_fast_path = "num_heads is odd"
294
- elif torch.is_autocast_enabled():
295
- why_not_fast_path = "autocast is enabled"
296
-
297
- if not why_not_fast_path:
298
- tensor_args = (query, key, value, self.in_proj_weight,
299
- self.in_proj_bias, self.out_proj.weight,
300
- self.out_proj.bias, )
301
- # We have to use list comprehensions below because TorchScript does not support
302
- # generator expressions.
303
- if torch.overrides.has_torch_function(tensor_args):
304
- why_not_fast_path = "some Tensor argument has_torch_function"
305
- elif not all([(x is None or x.is_cuda or "cpu" in str(x.device))
306
- for x in tensor_args]):
307
- why_not_fast_path = (
308
- "some Tensor argument is neither CUDA nor CPU")
309
- elif torch.is_grad_enabled() and any(
310
- [x is not None and x.requires_grad for x in tensor_args]):
311
- why_not_fast_path = (
312
- "grad is enabled and at least one of query or the "
313
- "input/output projection weights or biases requires_grad")
314
- if not why_not_fast_path:
315
- return torch._native_multi_head_attention(
316
- query,
317
- key,
318
- value,
319
- self.embed_dim,
320
- self.num_heads,
321
- self.in_proj_weight,
322
- self.in_proj_bias,
323
- self.out_proj.weight,
324
- self.out_proj.bias,
325
- key_padding_mask
326
- if key_padding_mask is not None else attn_mask,
327
- need_weights,
328
- average_attn_weights,
329
- 1 if key_padding_mask is not None else 0
330
- if attn_mask is not None else None, )
331
-
332
- any_nested = query.is_nested or key.is_nested or value.is_nested
333
- assert not any_nested, (
334
- "MultiheadAttention does not support NestedTensor outside of its fast path. "
335
- + f"The fast path was not hit because {why_not_fast_path}")
336
-
337
- if self.batch_first and is_batched:
338
- # make sure that the transpose op does not affect the "is" property
339
- if key is value:
340
- if query is key:
341
- query = key = value = query.transpose(1, 0)
342
- else:
343
- query, key = [x.transpose(1, 0) for x in (query, key)]
344
- value = key
345
- else:
346
- query, key, value = [
347
- x.transpose(1, 0) for x in (query, key, value)
348
- ]
349
-
350
- if not self._qkv_same_embed_dim:
351
- attn_output, attn_output_weights = F.multi_head_attention_forward(
352
- query,
353
- key,
354
- value,
355
- self.embed_dim,
356
- self.num_heads,
357
- self.in_proj_weight,
358
- self.in_proj_bias,
359
- self.bias_k,
360
- self.bias_v,
361
- self.add_zero_attn,
362
- self.dropout,
363
- self.out_proj.weight,
364
- self.out_proj.bias,
365
- training=self.training,
366
- key_padding_mask=key_padding_mask,
367
- need_weights=need_weights,
368
- attn_mask=attn_mask,
369
- use_separate_proj_weight=True,
370
- q_proj_weight=self.q_proj_weight,
371
- k_proj_weight=self.k_proj_weight,
372
- v_proj_weight=self.v_proj_weight,
373
- average_attn_weights=average_attn_weights,cache=cache )
374
- else:
375
- attn_output, attn_output_weights = F.multi_head_attention_forward(
376
- query,
377
- key,
378
- value,
379
- self.embed_dim,
380
- self.num_heads,
381
- self.in_proj_weight,
382
- self.in_proj_bias,
383
- self.bias_k,
384
- self.bias_v,
385
- self.add_zero_attn,
386
- self.dropout,
387
- self.out_proj.weight,
388
- self.out_proj.bias,
389
- training=self.training,
390
- key_padding_mask=key_padding_mask,
391
- need_weights=need_weights,
392
- attn_mask=attn_mask,
393
- average_attn_weights=average_attn_weights,cache=cache )
394
- if self.batch_first and is_batched:
395
- return attn_output.transpose(1, 0), attn_output_weights
396
- else:
397
- return attn_output, attn_output_weights
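
A minimal sketch of how this cache-aware attention might be driven, assuming the deleted AR package were still on the import path; the cache layout is inferred from multi_head_attention_forward_patched further below and is illustrative, not an official API:

import torch
from AR.modules.activation import MultiheadAttention  # assumes the repo is importable

attn = MultiheadAttention(embed_dim=512, num_heads=8)  # seq-first layout by default
x = torch.randn(50, 1, 512)                            # (time, batch, embed_dim)

# One K/V slot per attention layer; "stage" cycles through the slots on each call.
cache = {"all_stage": 1, "stage": 0, "first_infer": 1, "k": [None], "v": [None]}

with torch.no_grad():
    out, _ = attn(x, x, x, cache=cache)
# cache["k"][0] / cache["v"][0] now hold this layer's keys and values; on the next
# decoding step set cache["first_infer"] = 0 so new K/V get concatenated instead.
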
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/embedding.py DELETED
@@ -1,78 +0,0 @@
1
- # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2
- import math
3
-
4
- import torch
5
- from torch import nn
6
-
7
-
8
- class TokenEmbedding(nn.Module):
9
- def __init__(
10
- self,
11
- embedding_dim: int,
12
- vocab_size: int,
13
- dropout: float=0.0, ):
14
- super().__init__()
15
-
16
- self.vocab_size = vocab_size
17
- self.embedding_dim = embedding_dim
18
-
19
- self.dropout = torch.nn.Dropout(p=dropout)
20
- self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
21
-
22
- @property
23
- def weight(self) -> torch.Tensor:
24
- return self.word_embeddings.weight
25
-
26
- def embedding(self, index: int) -> torch.Tensor:
27
- return self.word_embeddings.weight[index:index + 1]
28
-
29
- def forward(self, x: torch.Tensor):
30
- x = self.word_embeddings(x)
31
- x = self.dropout(x)
32
- return x
33
-
34
-
35
- class SinePositionalEmbedding(nn.Module):
36
- def __init__(
37
- self,
38
- embedding_dim: int,
39
- dropout: float=0.0,
40
- scale: bool=False,
41
- alpha: bool=False, ):
42
- super().__init__()
43
- self.embedding_dim = embedding_dim
44
- self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
45
- self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
46
- self.dropout = torch.nn.Dropout(p=dropout)
47
-
48
- self.reverse = False
49
- self.pe = None
50
- self.extend_pe(torch.tensor(0.0).expand(1, 4000))
51
-
52
- def extend_pe(self, x):
53
- """Reset the positional encodings."""
54
- if self.pe is not None:
55
- if self.pe.size(1) >= x.size(1):
56
- if self.pe.dtype != x.dtype or self.pe.device != x.device:
57
- self.pe = self.pe.to(dtype=x.dtype, device=x.device)
58
- return
59
- pe = torch.zeros(x.size(1), self.embedding_dim)
60
- if self.reverse:
61
- position = torch.arange(
62
- x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
63
- else:
64
- position = torch.arange(
65
- 0, x.size(1), dtype=torch.float32).unsqueeze(1)
66
- div_term = torch.exp(
67
- torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) *
68
- -(math.log(10000.0) / self.embedding_dim))
69
- pe[:, 0::2] = torch.sin(position * div_term)
70
- pe[:, 1::2] = torch.cos(position * div_term)
71
- pe = pe.unsqueeze(0)
72
- self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
73
-
74
- def forward(self, x: torch.Tensor) -> torch.Tensor:
75
- self.extend_pe(x)
76
- output = x.unsqueeze(-1) if x.ndim == 2 else x
77
- output = output * self.x_scale + self.alpha * self.pe[:, :x.size(1)]
78
- return self.dropout(output)
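
For reference, a standalone sketch of the sin/cos table that extend_pe() builds (sine_positional_table is an illustrative name); with scale=False, dropout=0 and the default alpha this is roughly what the module adds to its input:

import math
import torch

def sine_positional_table(seq_len: int, dim: int) -> torch.Tensor:
    # Even channels get sin, odd channels get cos, as in extend_pe() above.
    position = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32)
                         * -(math.log(10000.0) / dim))
    pe = torch.zeros(seq_len, dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)                    # (1, seq_len, dim), broadcasts over batch

x = torch.randn(2, 100, 512)                  # (batch, time, dim) token embeddings
x = x + sine_positional_table(100, 512)
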
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/lr_schedulers.py DELETED
@@ -1,85 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/lr_schedulers.py
2
- import math
3
-
4
- import torch
5
- from matplotlib import pyplot as plt
6
- from torch import nn
7
- from torch.optim import Adam
8
-
9
-
10
- class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
11
- """
12
- Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers.
13
- """
14
-
15
- def __init__(self,
16
- optimizer,
17
- init_lr,
18
- peak_lr,
19
- end_lr,
20
- warmup_steps=10000,
21
- total_steps=400000,
22
- current_step=0):
23
- self.init_lr = init_lr
24
- self.peak_lr = peak_lr
25
- self.end_lr = end_lr
26
- self.optimizer = optimizer
27
- self._warmup_rate = (peak_lr - init_lr) / warmup_steps
28
- self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
29
- self._current_step = current_step
30
- self.lr = init_lr
31
- self.warmup_steps = warmup_steps
32
- self.total_steps = total_steps
33
- self._last_lr = [self.lr]
34
-
35
- def set_lr(self, lr):
36
- self._last_lr = [g['lr'] for g in self.optimizer.param_groups]
37
- for g in self.optimizer.param_groups:
38
- # g['lr'] = lr
39
- g['lr'] = self.end_lr###锁定用线性
40
-
41
- def step(self):
42
- if self._current_step < self.warmup_steps:
43
- lr = self.init_lr + self._warmup_rate * self._current_step
44
-
45
- elif self._current_step > self.total_steps:
46
- lr = self.end_lr
47
-
48
- else:
49
- decay_ratio = (self._current_step - self.warmup_steps) / (
50
- self.total_steps - self.warmup_steps)
51
- if decay_ratio < 0.0 or decay_ratio > 1.0:
52
- raise RuntimeError(
53
- "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings."
54
- )
55
- coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
56
- lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
57
-
58
- self.lr=lr=self.end_lr=0.002###锁定用线性###不听话,直接锁定!
59
- self.set_lr(lr)
60
- self.lr = lr
61
- self._current_step += 1
62
- return self.lr
63
-
64
-
65
-
66
- if __name__ == '__main__':
67
- m = nn.Linear(10, 10)
68
- opt = Adam(m.parameters(), lr=1e-4)
69
- s = WarmupCosineLRSchedule(
70
- opt,
71
- 1e-6,
72
- 2e-4,
73
- 1e-6,
74
- warmup_steps=2000,
75
- total_steps=20000,
76
- current_step=0)
77
- lrs = []
78
- for i in range(25000):
79
- s.step()
80
- lrs.append(s.lr)
81
- print(s.lr)
82
-
83
- plt.plot(lrs)
84
- plt.plot(range(0, 25000), lrs)
85
- plt.show()
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/optim.py DELETED
@@ -1,622 +0,0 @@
1
- # Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
2
- #
3
- # See ../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- import contextlib
17
- import logging
18
- from collections import defaultdict
19
- from typing import List
20
- from typing import Tuple
21
-
22
- import torch
23
- from torch import Tensor
24
- from torch.optim import Optimizer
25
-
26
-
27
- class BatchedOptimizer(Optimizer):
28
- """
29
- This class adds to class Optimizer the capability to optimize parameters in batches:
30
- it will stack the parameters and their grads for you so the optimizer can work
31
- on tensors with an extra leading dimension. This is intended for speed with GPUs,
32
- as it reduces the number of kernels launched in the optimizer.
33
-
34
- Args:
35
- params:
36
- """
37
-
38
- def __init__(self, params, defaults):
39
- super(BatchedOptimizer, self).__init__(params, defaults)
40
-
41
- @contextlib.contextmanager
42
- def batched_params(self, param_group, group_params_names):
43
- """
44
- This function returns (technically, yields) a list of
45
- of tuples (p, state), where
46
- p is a `fake` parameter that is stacked (over axis 0) from real parameters
47
- that share the same shape, and its gradient is also stacked;
48
- `state` is the state corresponding to this batch of parameters
49
- (it will be physically located in the "state" for one of the real
50
- parameters, the last one that has any particular shape and dtype).
51
-
52
- This function is decorated as a context manager so that it can
53
- write parameters back to their "real" locations.
54
-
55
- The idea is, instead of doing:
56
- <code>
57
- for p in group["params"]:
58
- state = self.state[p]
59
- ...
60
- </code>
61
- you can do:
62
- <code>
63
- with self.batched_params(group["params"]) as batches:
64
- for p, state, p_names in batches:
65
- ...
66
- </code>
67
-
68
- Args:
69
- group: a parameter group, which is a list of parameters; should be
70
- one of self.param_groups.
71
- group_params_names: name for each parameter in group,
72
- which is List[str].
73
- """
74
- batches = defaultdict(
75
- list
76
- ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
77
- batches_names = defaultdict(
78
- list
79
- ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str
80
-
81
- assert len(param_group) == len(group_params_names)
82
- for p, named_p in zip(param_group, group_params_names):
83
- key = (str(p.dtype), *p.shape)
84
- batches[key].append(p)
85
- batches_names[key].append(named_p)
86
-
87
- batches_names_keys = list(batches_names.keys())
88
- sorted_idx = sorted(
89
- range(len(batches_names)), key=lambda i: batches_names_keys[i])
90
- batches_names = [
91
- batches_names[batches_names_keys[idx]] for idx in sorted_idx
92
- ]
93
- batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
94
-
95
- stacked_params_dict = dict()
96
-
97
- # turn batches into a list, in deterministic order.
98
- # tuples will contain tuples of (stacked_param, state, stacked_params_names),
99
- # one for each batch in `batches`.
100
- tuples = []
101
-
102
- for batch, batch_names in zip(batches, batches_names):
103
- p = batch[0]
104
- # we arbitrarily store the state in the
105
- # state corresponding to the 1st parameter in the
106
- # group. class Optimizer will take care of saving/loading state.
107
- state = self.state[p]
108
- p_stacked = torch.stack(batch)
109
- grad = torch.stack([
110
- torch.zeros_like(p) if p.grad is None else p.grad for p in batch
111
- ])
112
- p_stacked.grad = grad
113
- stacked_params_dict[key] = p_stacked
114
- tuples.append((p_stacked, state, batch_names))
115
-
116
- yield tuples # <-- calling code will do the actual optimization here!
117
-
118
- for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
119
- for i, p in enumerate(batch): # batch is list of Parameter
120
- p.copy_(stacked_params[i])
121
-
122
-
123
- class ScaledAdam(BatchedOptimizer):
124
- """
125
- Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
126
- proportional to the norm of that parameter; and also learn the scale of the parameter,
127
- in log space, subject to upper and lower limits (as if we had factored each parameter as
128
- param = underlying_param * log_scale.exp())
129
-
130
-
131
- Args:
132
- params: The parameters or param_groups to optimize (like other Optimizer subclasses)
133
- lr: The learning rate. We will typically use a learning rate schedule that starts
134
- at 0.03 and decreases over time, i.e. much higher than other common
135
- optimizers.
136
- clipping_scale: (e.g. 2.0)
137
- A scale for gradient-clipping: if specified, the normalized gradients
138
- over the whole model will be clipped to have 2-norm equal to
139
- `clipping_scale` times the median 2-norm over the most recent period
140
- of `clipping_update_period` minibatches. By "normalized gradients",
141
- we mean after multiplying by the rms parameter value for this tensor
142
- [for non-scalars]; this is appropriate because our update is scaled
143
- by this quantity.
144
- betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad.
145
- Must satisfy 0 < beta <= beta2 < 1.
146
- scalar_lr_scale: A scaling factor on the learning rate, that we use to update the
147
- scale of each parameter tensor and scalar parameters of the mode..
148
- If each parameter were decomposed
149
- as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
150
- would be a the scaling factor on the learning rate of p_scale.
151
- eps: A general-purpose epsilon to prevent division by zero
152
- param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
153
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
154
- parameter tensor to be >= this value)
155
- param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
156
- learning the scale on the parameters (we'll constrain the rms of each non-scalar
157
- parameter tensor to be <= this value)
158
- scalar_max: Maximum absolute value for scalar parameters (applicable if your
159
- model has any parameters with numel() == 1).
160
- size_update_period: The periodicity, in steps, with which we update the size (scale)
161
- of the parameter tensor. This is provided to save a little time
162
- in the update.
163
- clipping_update_period: if clipping_scale is specified, this is the period
164
- """
165
-
166
- def __init__(
167
- self,
168
- params,
169
- lr=3e-02,
170
- clipping_scale=None,
171
- betas=(0.9, 0.98),
172
- scalar_lr_scale=0.1,
173
- eps=1.0e-08,
174
- param_min_rms=1.0e-05,
175
- param_max_rms=3.0,
176
- scalar_max=10.0,
177
- size_update_period=4,
178
- clipping_update_period=100,
179
- parameters_names=None,
180
- show_dominant_parameters=True, ):
181
-
182
- assert parameters_names is not None, (
183
- "Please prepare parameters_names,"
184
- "which is a List[List[str]]. Each List[str] is for a group"
185
- "and each str is for a parameter")
186
- defaults = dict(
187
- lr=lr,
188
- clipping_scale=clipping_scale,
189
- betas=betas,
190
- scalar_lr_scale=scalar_lr_scale,
191
- eps=eps,
192
- param_min_rms=param_min_rms,
193
- param_max_rms=param_max_rms,
194
- scalar_max=scalar_max,
195
- size_update_period=size_update_period,
196
- clipping_update_period=clipping_update_period, )
197
-
198
- super(ScaledAdam, self).__init__(params, defaults)
199
- assert len(self.param_groups) == len(parameters_names)
200
- self.parameters_names = parameters_names
201
- self.show_dominant_parameters = show_dominant_parameters
202
-
203
- def __setstate__(self, state):
204
- super(ScaledAdam, self).__setstate__(state)
205
-
206
- @torch.no_grad()
207
- def step(self, closure=None):
208
- """Performs a single optimization step.
209
-
210
- Arguments:
211
- closure (callable, optional): A closure that reevaluates the model
212
- and returns the loss.
213
- """
214
- loss = None
215
- if closure is not None:
216
- with torch.enable_grad():
217
- loss = closure()
218
-
219
- batch = True
220
-
221
- for group, group_params_names in zip(self.param_groups,
222
- self.parameters_names):
223
-
224
- with self.batched_params(group["params"],
225
- group_params_names) as batches:
226
-
227
- # batches is list of pairs (stacked_param, state). stacked_param is like
228
- # a regular parameter, and will have a .grad, but the 1st dim corresponds to
229
- # a stacking dim, it is not a real dim.
230
-
231
- if (len(batches[0][1]) ==
232
- 0): # if len(first state) == 0: not yet initialized
233
- clipping_scale = 1
234
- else:
235
- clipping_scale = self._get_clipping_scale(group, batches)
236
-
237
- for p, state, _ in batches:
238
- # Perform optimization step.
239
- # grad is not going to be None, we handled that when creating the batches.
240
- grad = p.grad
241
- if grad.is_sparse:
242
- raise RuntimeError(
243
- "ScaledAdam optimizer does not support sparse gradients"
244
- )
245
- # State initialization
246
- if len(state) == 0:
247
- self._init_state(group, p, state)
248
-
249
- self._step_one_batch(group, p, state, clipping_scale)
250
-
251
- return loss
252
-
253
- def _init_state(self, group: dict, p: Tensor, state: dict):
254
- """
255
- Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
256
- is actually the batch dimension, corresponding to batched-together
257
- parameters of a given shape.
258
-
259
-
260
- Args:
261
- group: Dict to look up configuration values.
262
- p: The parameter that we are initializing the state for
263
- state: Dict from string to whatever state we are initializing
264
- """
265
- size_update_period = group["size_update_period"]
266
-
267
- state["step"] = 0
268
-
269
- kwargs = {"device": p.device, "dtype": p.dtype}
270
-
271
- # 'delta' implements conventional momentum. There are
272
- # several different kinds of update going on, so rather than
273
- # compute "exp_avg" like in Adam, we store and decay a
274
- # parameter-change "delta", which combines all forms of
275
- # update. this is equivalent to how it's done in Adam,
276
- # except for the first few steps.
277
- state["delta"] = torch.zeros_like(
278
- p, memory_format=torch.preserve_format)
279
-
280
- batch_size = p.shape[0]
281
- numel = p.numel() // batch_size
282
- numel = p.numel()
283
-
284
- if numel > 1:
285
- # "param_rms" just periodically records the scalar root-mean-square value of
286
- # the parameter tensor.
287
- # it has a shape like (batch_size, 1, 1, 1, 1)
288
- param_rms = (
289
- (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt())
290
- state["param_rms"] = param_rms
291
-
292
- state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
293
- state["scale_grads"] = torch.zeros(size_update_period,
294
- *param_rms.shape, **kwargs)
295
-
296
- # exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
297
- state["exp_avg_sq"] = torch.zeros_like(
298
- p, memory_format=torch.preserve_format)
299
-
300
- def _get_clipping_scale(self,
301
- group: dict,
302
- tuples: List[Tuple[Tensor, dict, List[str]]]
303
- ) -> float:
304
- """
305
- Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
306
- by this amount before applying the rest of the update.
307
-
308
- Args:
309
- group: the parameter group, an item in self.param_groups
310
- tuples: a list of tuples of (param, state, param_names)
311
- where param is a batched set of parameters,
312
- with a .grad (1st dim is batch dim)
313
- and state is the state-dict where optimization parameters are kept.
314
- param_names is a List[str] while each str is name for a parameter
315
- in batched set of parameters "param".
316
- """
317
- assert len(tuples) >= 1
318
- clipping_scale = group["clipping_scale"]
319
- (first_p, first_state, _) = tuples[0]
320
- step = first_state["step"]
321
- if clipping_scale is None or step == 0:
322
- # no clipping. return early on step == 0 because the other
323
- # parameters' state won't have been initialized yet.
324
- return 1.0
325
- clipping_update_period = group["clipping_update_period"]
326
-
327
- tot_sumsq = torch.tensor(0.0, device=first_p.device)
328
- for (p, state, param_names) in tuples:
329
- grad = p.grad
330
- if grad.is_sparse:
331
- raise RuntimeError(
332
- "ScaledAdam optimizer does not support sparse gradients")
333
- if p.numel() == p.shape[0]: # a batch of scalars
334
- tot_sumsq += (grad**2).sum() # sum() to change shape [1] to []
335
- else:
336
- tot_sumsq += ((grad * state["param_rms"])**2).sum()
337
-
338
- tot_norm = tot_sumsq.sqrt()
339
- if "model_norms" not in first_state:
340
- first_state["model_norms"] = torch.zeros(
341
- clipping_update_period, device=p.device)
342
- first_state["model_norms"][step % clipping_update_period] = tot_norm
343
-
344
- if step % clipping_update_period == 0:
345
- # Print some stats.
346
- # We don't reach here if step == 0 because we would have returned
347
- # above.
348
- sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
349
- quartiles = []
350
- for n in range(0, 5):
351
- index = min(
352
- clipping_update_period - 1,
353
- (clipping_update_period // 4) * n, )
354
- quartiles.append(sorted_norms[index].item())
355
-
356
- median = quartiles[2]
357
- threshold = clipping_scale * median
358
- first_state["model_norm_threshold"] = threshold
359
- percent_clipped = (first_state["num_clipped"] * 100.0 /
360
- clipping_update_period
361
- if "num_clipped" in first_state else 0.0)
362
- first_state["num_clipped"] = 0
363
- quartiles = " ".join(["%.3e" % x for x in quartiles])
364
- logging.info(
365
- f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
366
- f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
367
- )
368
-
369
- if step < clipping_update_period:
370
- return 1.0 # We have not yet estimated a norm to clip to.
371
- else:
372
- try:
373
- model_norm_threshold = first_state["model_norm_threshold"]
374
- except KeyError:
375
- logging.info(
376
- "Warning: model_norm_threshold not in state: possibly "
377
- "you changed config when restarting, adding clipping_scale option?"
378
- )
379
- return 1.0
380
- ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
381
- if ans < 1.0:
382
- first_state["num_clipped"] += 1
383
- if ans < 0.1:
384
- logging.warn(
385
- f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
386
- )
387
- if self.show_dominant_parameters:
388
- assert p.shape[0] == len(param_names)
389
- self._show_gradient_dominating_parameter(tuples, tot_sumsq)
390
- return ans
391
-
392
- def _show_gradient_dominating_parameter(
393
- self, tuples: List[Tuple[Tensor, dict, List[str]]],
394
- tot_sumsq: Tensor):
395
- """
396
- Show information of parameter wihch dominanting tot_sumsq.
397
-
398
- Args:
399
- tuples: a list of tuples of (param, state, param_names)
400
- where param is a batched set of parameters,
401
- with a .grad (1st dim is batch dim)
402
- and state is the state-dict where optimization parameters are kept.
403
- param_names is a List[str] while each str is name for a parameter
404
- in batched set of parameters "param".
405
- tot_sumsq: sumsq of all parameters. Though it's could be calculated
406
- from tuples, we still pass it to save some time.
407
- """
408
- all_sumsq_orig = {}
409
- for (p, state, batch_param_names) in tuples:
410
- # p is a stacked batch parameters.
411
- batch_grad = p.grad
412
- if p.numel() == p.shape[0]: # a batch of scalars
413
- batch_sumsq_orig = batch_grad**2
414
- # Dummpy values used by following `zip` statement.
415
- batch_rms_orig = torch.ones(p.shape[0])
416
- else:
417
- batch_rms_orig = state["param_rms"]
418
- batch_sumsq_orig = ((batch_grad * batch_rms_orig)**2).sum(
419
- dim=list(range(1, batch_grad.ndim)))
420
-
421
- for name, sumsq_orig, rms, grad in zip(batch_param_names,
422
- batch_sumsq_orig,
423
- batch_rms_orig, batch_grad):
424
-
425
- proportion_orig = sumsq_orig / tot_sumsq
426
- all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
427
-
428
- assert torch.isclose(
429
- sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
430
- torch.tensor(1.0), )
431
- sorted_by_proportion = {
432
- k: v
433
- for k, v in sorted(
434
- all_sumsq_orig.items(),
435
- key=lambda item: item[1][0],
436
- reverse=True, )
437
- }
438
- dominant_param_name = next(iter(sorted_by_proportion))
439
- (dominant_proportion, dominant_sumsq, dominant_rms,
440
- dominant_grad, ) = sorted_by_proportion[dominant_param_name]
441
- logging.info(f"Parameter Dominanting tot_sumsq {dominant_param_name}"
442
- f" with proportion {dominant_proportion:.2f},"
443
- f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
444
- f"={dominant_sumsq:.3e},"
445
- f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
446
- f" orig_rms_sq={(dominant_rms**2).item():.3e}")
447
-
448
- def _step_one_batch(self,
449
- group: dict,
450
- p: Tensor,
451
- state: dict,
452
- clipping_scale: float):
453
- """
454
- Do the step for one parameter, which is actually going to be a batch of
455
- `real` parameters, with dim 0 as the batch dim.
456
- Args:
457
- group: dict to look up configuration values
458
- p: parameter to update (actually multiple parameters stacked together
459
- as a batch)
460
- state: state-dict for p, to look up the optimizer state
461
- """
462
- lr = group["lr"]
463
- size_update_period = group["size_update_period"]
464
- beta1 = group["betas"][0]
465
-
466
- grad = p.grad
467
- if clipping_scale != 1.0:
468
- grad = grad * clipping_scale
469
- step = state["step"]
470
- delta = state["delta"]
471
-
472
- delta.mul_(beta1)
473
- batch_size = p.shape[0]
474
- numel = p.numel() // batch_size
475
- if numel > 1:
476
- # Update the size/scale of p, and set param_rms
477
- scale_grads = state["scale_grads"]
478
- scale_grads[step % size_update_period] = (p * grad).sum(
479
- dim=list(range(1, p.ndim)), keepdim=True)
480
- if step % size_update_period == size_update_period - 1:
481
- param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..)
482
- param_rms.copy_((p**2)
483
- .mean(dim=list(range(1, p.ndim)), keepdim=True)
484
- .sqrt())
485
- if step > 0:
486
- # self._size_update() learns the overall scale on the
487
- # parameter, by shrinking or expanding it.
488
- self._size_update(group, scale_grads, p, state)
489
-
490
- if numel == 1:
491
- # For parameters with 1 element we just use regular Adam.
492
- # Updates delta.
493
- self._step_scalar(group, p, state)
494
- else:
495
- self._step(group, p, state)
496
-
497
- state["step"] = step + 1
498
-
499
- def _size_update(self,
500
- group: dict,
501
- scale_grads: Tensor,
502
- p: Tensor,
503
- state: dict) -> None:
504
- """
505
- Called only where p.numel() > 1, this updates the scale of the parameter.
506
- If we imagine: p = underlying_param * scale.exp(), and we are doing
507
- gradient descent on underlying param and on scale, this function does the update
508
- on `scale`.
509
-
510
- Args:
511
- group: dict to look up configuration values
512
- scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
513
- grads w.r.t. the scales.
514
- p: The parameter to update
515
- state: The state-dict of p
516
- """
517
-
518
- param_rms = state["param_rms"]
519
- beta1, beta2 = group["betas"]
520
- size_lr = group["lr"] * group["scalar_lr_scale"]
521
- param_min_rms = group["param_min_rms"]
522
- param_max_rms = group["param_max_rms"]
523
- eps = group["eps"]
524
- step = state["step"]
525
- batch_size = p.shape[0]
526
-
527
- size_update_period = scale_grads.shape[0]
528
- # correct beta2 for the size update period: we will have
529
- # faster decay at this level.
530
- beta2_corr = beta2**size_update_period
531
-
532
- scale_exp_avg_sq = state[
533
- "scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..)
534
- scale_exp_avg_sq.mul_(beta2_corr).add_(
535
- (scale_grads**2).mean(dim=0), # mean over dim `size_update_period`
536
- alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...)
537
-
538
- # The 1st time we reach here is when size_step == 1.
539
- size_step = (step + 1) // size_update_period
540
- bias_correction2 = 1 - beta2_corr**size_step
541
- # we don't bother with bias_correction1; this will help prevent divergence
542
- # at the start of training.
543
-
544
- denom = scale_exp_avg_sq.sqrt() + eps
545
-
546
- scale_step = (-size_lr * (bias_correction2**0.5) *
547
- scale_grads.sum(dim=0) / denom)
548
-
549
- is_too_small = param_rms < param_min_rms
550
- is_too_large = param_rms > param_max_rms
551
-
552
- # when the param gets too small, just don't shrink it any further.
553
- scale_step.masked_fill_(is_too_small, 0.0)
554
- # when it gets too large, stop it from getting any larger.
555
- scale_step.masked_fill_(is_too_large, -size_lr * size_update_period)
556
- delta = state["delta"]
557
- # the factor of (1-beta1) relates to momentum.
558
- delta.add_(p * scale_step, alpha=(1 - beta1))
559
-
560
- def _step(self, group: dict, p: Tensor, state: dict):
561
- """
562
- This function does the core update of self.step(), in the case where the members of
563
- the batch have more than 1 element.
564
-
565
- Args:
566
- group: A dict which will be used to look up configuration values
567
- p: The parameter to be updated
568
- grad: The grad of p
569
- state: The state-dict corresponding to parameter p
570
-
571
- This function modifies p.
572
- """
573
- grad = p.grad
574
- lr = group["lr"]
575
- beta1, beta2 = group["betas"]
576
- eps = group["eps"]
577
- param_min_rms = group["param_min_rms"]
578
- step = state["step"]
579
-
580
- exp_avg_sq = state["exp_avg_sq"]
581
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
582
-
583
- this_step = state["step"] - (state["zero_step"]
584
- if "zero_step" in state else 0)
585
- bias_correction2 = 1 - beta2**(this_step + 1)
586
- if bias_correction2 < 0.99:
587
- # note: not in-place.
588
- exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
589
-
590
- denom = exp_avg_sq.sqrt()
591
- denom += eps
592
- grad = grad / denom
593
-
594
- alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
595
-
596
- delta = state["delta"]
597
- delta.add_(grad * alpha)
598
- p.add_(delta)
599
-
600
- def _step_scalar(self, group: dict, p: Tensor, state: dict):
601
- """
602
- A simplified form of the core update for scalar tensors, where we cannot get a good
603
- estimate of the parameter rms.
604
- """
605
- beta1, beta2 = group["betas"]
606
- scalar_max = group["scalar_max"]
607
- eps = group["eps"]
608
- lr = group["lr"] * group["scalar_lr_scale"]
609
- grad = p.grad
610
-
611
- exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,)
612
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
613
-
614
- # bias_correction2 is like in Adam. Don't bother with bias_correction1;
615
- # slower update at the start will help stability anyway.
616
- bias_correction2 = 1 - beta2**(state["step"] + 1)
617
- denom = (exp_avg_sq / bias_correction2).sqrt() + eps
618
-
619
- delta = state["delta"]
620
- delta.add_(grad / denom, alpha=-lr * (1 - beta1))
621
- p.clamp_(min=-scalar_max, max=scalar_max)
622
- p.add_(delta)
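
A minimal usage sketch, assuming the deleted AR package were importable; ScaledAdam requires parameters_names (one List[str] per parameter group), which the example derives from named_parameters():

import torch
from torch import nn
from AR.modules.optim import ScaledAdam    # assumes the repo is on the import path

model = nn.Linear(256, 256)
names = [[n for n, _ in model.named_parameters()]]   # one List[str] per param group
opt = ScaledAdam(model.parameters(), lr=0.01, clipping_scale=2.0, parameters_names=names)

loss = model(torch.randn(8, 256)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()
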
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/patched_mha_with_cache.py DELETED
@@ -1,388 +0,0 @@
1
- from torch.nn.functional import *
2
- from torch.nn.functional import _mha_shape_check,_canonical_mask,_none_or_dtype,_in_projection_packed
3
- # import torch
4
- # Tensor = torch.Tensor
5
- # from typing import Callable, List, Optional, Tuple, Union
6
-
7
- def multi_head_attention_forward_patched(
8
- query: Tensor,
9
- key: Tensor,
10
- value: Tensor,
11
- embed_dim_to_check: int,
12
- num_heads: int,
13
- in_proj_weight: Optional[Tensor],
14
- in_proj_bias: Optional[Tensor],
15
- bias_k: Optional[Tensor],
16
- bias_v: Optional[Tensor],
17
- add_zero_attn: bool,
18
- dropout_p: float,
19
- out_proj_weight: Tensor,
20
- out_proj_bias: Optional[Tensor],
21
- training: bool = True,
22
- key_padding_mask: Optional[Tensor] = None,
23
- need_weights: bool = True,
24
- attn_mask: Optional[Tensor] = None,
25
- use_separate_proj_weight: bool = False,
26
- q_proj_weight: Optional[Tensor] = None,
27
- k_proj_weight: Optional[Tensor] = None,
28
- v_proj_weight: Optional[Tensor] = None,
29
- static_k: Optional[Tensor] = None,
30
- static_v: Optional[Tensor] = None,
31
- average_attn_weights: bool = True,
32
- is_causal: bool = False,cache=None
33
- ) -> Tuple[Tensor, Optional[Tensor]]:
34
- r"""
35
- Args:
36
- query, key, value: map a query and a set of key-value pairs to an output.
37
- See "Attention Is All You Need" for more details.
38
- embed_dim_to_check: total dimension of the model.
39
- num_heads: parallel attention heads.
40
- in_proj_weight, in_proj_bias: input projection weight and bias.
41
- bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
42
- add_zero_attn: add a new batch of zeros to the key and
43
- value sequences at dim=1.
44
- dropout_p: probability of an element to be zeroed.
45
- out_proj_weight, out_proj_bias: the output projection weight and bias.
46
- training: apply dropout if is ``True``.
47
- key_padding_mask: if provided, specified padding elements in the key will
48
- be ignored by the attention. This is an binary mask. When the value is True,
49
- the corresponding value on the attention layer will be filled with -inf.
50
- need_weights: output attn_output_weights.
51
- Default: `True`
52
- Note: `needs_weight` defaults to `True`, but should be set to `False`
53
- For best performance when attention weights are not nedeeded.
54
- *Setting needs_weights to `True`
55
- leads to a significant performance degradation.*
56
- attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
57
- the batches while a 3D mask allows to specify a different mask for the entries of each batch.
58
- is_causal: If specified, applies a causal mask as attention mask, and ignores
59
- attn_mask for computing scaled dot product attention.
60
- Default: ``False``.
61
- .. warning::
62
- is_causal is provides a hint that the attn_mask is the
63
- causal mask.Providing incorrect hints can result in
64
- incorrect execution, including forward and backward
65
- compatibility.
66
- use_separate_proj_weight: the function accept the proj. weights for query, key,
67
- and value in different forms. If false, in_proj_weight will be used, which is
68
- a combination of q_proj_weight, k_proj_weight, v_proj_weight.
69
- q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
70
- static_k, static_v: static key and value used for attention operators.
71
- average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads.
72
- Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect
73
- when ``need_weights=True.``. Default: True
74
-
75
-
76
- Shape:
77
- Inputs:
78
- - query: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
79
- the embedding dimension.
80
- - key: :math:`(S, E)` or :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
81
- the embedding dimension.
82
- - value: :math:`(S, E)` or :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
83
- the embedding dimension.
84
- - key_padding_mask: :math:`(S)` or :math:`(N, S)` where N is the batch size, S is the source sequence length.
85
- If a FloatTensor is provided, it will be directly added to the value.
86
- If a BoolTensor is provided, the positions with the
87
- value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
88
- - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
89
- 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
90
- S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
91
- positions. If a BoolTensor is provided, positions with ``True``
92
- are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
93
- is provided, it will be added to the attention weight.
94
- - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
95
- N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
96
- - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
97
- N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
98
-
99
- Outputs:
100
- - attn_output: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
101
- E is the embedding dimension.
102
- - attn_output_weights: Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns
103
- attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
104
- :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
105
- :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
106
- head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`.
107
- """
108
- tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
109
- if has_torch_function(tens_ops):
110
- return handle_torch_function(
111
- multi_head_attention_forward,
112
- tens_ops,
113
- query,
114
- key,
115
- value,
116
- embed_dim_to_check,
117
- num_heads,
118
- in_proj_weight,
119
- in_proj_bias,
120
- bias_k,
121
- bias_v,
122
- add_zero_attn,
123
- dropout_p,
124
- out_proj_weight,
125
- out_proj_bias,
126
- training=training,
127
- key_padding_mask=key_padding_mask,
128
- need_weights=need_weights,
129
- attn_mask=attn_mask,
130
- is_causal=is_causal,
131
- use_separate_proj_weight=use_separate_proj_weight,
132
- q_proj_weight=q_proj_weight,
133
- k_proj_weight=k_proj_weight,
134
- v_proj_weight=v_proj_weight,
135
- static_k=static_k,
136
- static_v=static_v,
137
- average_attn_weights=average_attn_weights,cache=cache
138
- )
139
-
140
- is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
141
-
142
- # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
143
- # is batched, run the computation and before returning squeeze the
144
- # batch dimension so that the output doesn't carry this temporary batch dimension.
145
- if not is_batched:
146
- # unsqueeze if the input is unbatched
147
- query = query.unsqueeze(1)
148
- key = key.unsqueeze(1)
149
- value = value.unsqueeze(1)
150
- if key_padding_mask is not None:
151
- key_padding_mask = key_padding_mask.unsqueeze(0)
152
-
153
- # set up shape vars
154
- tgt_len, bsz, embed_dim = query.shape
155
- src_len, _, _ = key.shape
156
-
157
- key_padding_mask = _canonical_mask(
158
- mask=key_padding_mask,
159
- mask_name="key_padding_mask",
160
- other_type=_none_or_dtype(attn_mask),
161
- other_name="attn_mask",
162
- target_type=query.dtype
163
- )
164
-
165
- if is_causal and attn_mask is None:
166
- raise RuntimeError(
167
- "Need attn_mask if specifying the is_causal hint. "
168
- "You may use the Transformer module method "
169
- "`generate_square_subsequent_mask` to create this mask."
170
- )
171
-
172
- if is_causal and key_padding_mask is None and not need_weights:
173
- # when we have a kpm or need weights, we need attn_mask
174
- # Otherwise, we use the is_causal hint go as is_causal
175
- # indicator to SDPA.
176
- attn_mask = None
177
- else:
178
- attn_mask = _canonical_mask(
179
- mask=attn_mask,
180
- mask_name="attn_mask",
181
- other_type=None,
182
- other_name="",
183
- target_type=query.dtype,
184
- check_other=False,
185
- )
186
-
187
-
188
- if key_padding_mask is not None:
189
- # We have the attn_mask, and use that to merge kpm into it.
190
- # Turn off use of is_causal hint, as the merged mask is no
191
- # longer causal.
192
- is_causal = False
193
-
194
- assert embed_dim == embed_dim_to_check, \
195
- f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
196
- if isinstance(embed_dim, torch.Tensor):
197
- # embed_dim can be a tensor when JIT tracing
198
- head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
199
- else:
200
- head_dim = embed_dim // num_heads
201
- assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
202
- if use_separate_proj_weight:
203
- # allow MHA to have different embedding dimensions when separate projection weights are used
204
- assert key.shape[:2] == value.shape[:2], \
205
- f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
206
- else:
207
- assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
208
-
209
- #
210
- # compute in-projection
211
- #
212
- if not use_separate_proj_weight:
213
- assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
214
- q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
215
- else:
216
- assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
217
- assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
218
- assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
219
- if in_proj_bias is None:
220
- b_q = b_k = b_v = None
221
- else:
222
- b_q, b_k, b_v = in_proj_bias.chunk(3)
223
- q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
224
- if(cache!=None):
225
- if(cache["first_infer"]==1):
226
- cache["k"][cache["stage"]]=k
227
- # print(0,cache["k"].shape)
228
- cache["v"][cache["stage"]]=v
229
- else:  ### each of the 12 layers keeps its own cached k/v
230
- # print(1,cache["k"].shape)
231
- cache["k"][cache["stage"]]=torch.cat([cache["k"][cache["stage"]],k],0)  ## the time axis was originally dim 1, but the projection may have transposed it, so time now sits on dim 0
232
- cache["v"][cache["stage"]]=torch.cat([cache["v"][cache["stage"]],v],0)
233
- # print(2, cache["k"].shape)
234
- src_len = cache["k"][cache["stage"]].shape[0]
235
- k=cache["k"][cache["stage"]]
236
- v=cache["v"][cache["stage"]]
237
- # if attn_mask is not None:
238
- # attn_mask=attn_mask[-1:,]
239
- # print(attn_mask.shape,attn_mask)
240
- cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
241
- # print(2333,cache)
242
- # prep attention mask
243
-
244
- attn_mask = _canonical_mask(
245
- mask=attn_mask,
246
- mask_name="attn_mask",
247
- other_type=None,
248
- other_name="",
249
- target_type=q.dtype,
250
- check_other=False,
251
- )
252
-
253
- if attn_mask is not None:
254
- # ensure attn_mask's dim is 3
255
- if attn_mask.dim() == 2:
256
- correct_2d_size = (tgt_len, src_len)
257
- if attn_mask.shape != correct_2d_size:
258
- raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
259
- attn_mask = attn_mask.unsqueeze(0)
260
- elif attn_mask.dim() == 3:
261
- correct_3d_size = (bsz * num_heads, tgt_len, src_len)
262
- if attn_mask.shape != correct_3d_size:
263
- raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
264
- else:
265
- raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
266
-
267
- # add bias along batch dimension (currently second)
268
- if bias_k is not None and bias_v is not None:
269
- assert static_k is None, "bias cannot be added to static key."
270
- assert static_v is None, "bias cannot be added to static value."
271
- k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
272
- v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
273
- if attn_mask is not None:
274
- attn_mask = pad(attn_mask, (0, 1))
275
- if key_padding_mask is not None:
276
- key_padding_mask = pad(key_padding_mask, (0, 1))
277
- else:
278
- assert bias_k is None
279
- assert bias_v is None
280
-
281
- #
282
- # reshape q, k, v for multihead attention and make em batch first
283
- #
284
- q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
285
- if static_k is None:
286
- k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
287
- else:
288
- # TODO finish disentangling control flow so we don't do in-projections when statics are passed
289
- assert static_k.size(0) == bsz * num_heads, \
290
- f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
291
- assert static_k.size(2) == head_dim, \
292
- f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
293
- k = static_k
294
- if static_v is None:
295
- v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
296
- else:
297
- # TODO finish disentangling control flow so we don't do in-projections when statics are passed
298
- assert static_v.size(0) == bsz * num_heads, \
299
- f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
300
- assert static_v.size(2) == head_dim, \
301
- f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
302
- v = static_v
303
-
304
- # add zero attention along batch dimension (now first)
305
- if add_zero_attn:
306
- zero_attn_shape = (bsz * num_heads, 1, head_dim)
307
- k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
308
- v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
309
- if attn_mask is not None:
310
- attn_mask = pad(attn_mask, (0, 1))
311
- if key_padding_mask is not None:
312
- key_padding_mask = pad(key_padding_mask, (0, 1))
313
-
314
- # update source sequence length after adjustments
315
- src_len = k.size(1)
316
-
317
- # merge key padding and attention masks
318
- if key_padding_mask is not None:
319
- assert key_padding_mask.shape == (bsz, src_len), \
320
- f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
321
- key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \
322
- expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
323
- if attn_mask is None:
324
- attn_mask = key_padding_mask
325
- else:
326
- attn_mask = attn_mask + key_padding_mask
327
-
328
- # adjust dropout probability
329
- if not training:
330
- dropout_p = 0.0
331
-
332
- #
333
- # (deep breath) calculate attention and out projection
334
- #
335
-
336
- if need_weights:
337
- B, Nt, E = q.shape
338
- q_scaled = q / math.sqrt(E)
339
-
340
- assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
341
-
342
- if attn_mask is not None:
343
- attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
344
- else:
345
- attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
346
- attn_output_weights = softmax(attn_output_weights, dim=-1)
347
- if dropout_p > 0.0:
348
- attn_output_weights = dropout(attn_output_weights, p=dropout_p)
349
-
350
- attn_output = torch.bmm(attn_output_weights, v)
351
-
352
- attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
353
- attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
354
- attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
355
-
356
- # optionally average attention weights over heads
357
- attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
358
- if average_attn_weights:
359
- attn_output_weights = attn_output_weights.mean(dim=1)
360
-
361
- if not is_batched:
362
- # squeeze the output if input was unbatched
363
- attn_output = attn_output.squeeze(1)
364
- attn_output_weights = attn_output_weights.squeeze(0)
365
- return attn_output, attn_output_weights
366
- else:
367
- # attn_mask can be either (L,S) or (N*num_heads, L, S)
368
- # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
369
- # in order to match the input for SDPA of (N, num_heads, L, S)
370
- if attn_mask is not None:
371
- if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
372
- attn_mask = attn_mask.unsqueeze(0)
373
- else:
374
- attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
375
-
376
- q = q.view(bsz, num_heads, tgt_len, head_dim)
377
- k = k.view(bsz, num_heads, src_len, head_dim)
378
- v = v.view(bsz, num_heads, src_len, head_dim)
379
-
380
- attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
381
- attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
382
-
383
- attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
384
- attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
385
- if not is_batched:
386
- # squeeze the output if input was unbatched
387
- attn_output = attn_output.squeeze(1)
388
- return attn_output, None
 
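A minimal sketch, not part of the deleted file, of how the rolling per-layer k/v cache consumed by the patched attention above could be initialized and advanced during incremental decoding. The field names (first_infer, stage, all_stage, k, v) come from the code; the layer count and tensor shapes are illustrative assumptions.

import torch

def init_attn_cache(num_layers=12):
    # One k/v slot per transformer layer; "stage" cycles through the slots,
    # mirroring cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] above.
    return {
        "all_stage": num_layers,
        "stage": 0,
        "first_infer": 1,          # 1 on the first forward pass, 0 afterwards
        "k": [None] * num_layers,  # cached keys, time-major (dim 0)
        "v": [None] * num_layers,  # cached values, time-major (dim 0)
    }

cache = init_attn_cache()
bsz, d_model = 1, 512
# First pass: the whole prefix's projected k/v is stored for the layer.
k0 = torch.randn(8, bsz, d_model)
cache["k"][0], cache["v"][0] = k0, k0.clone()
cache["first_infer"] = 0
# Later passes: only the newest token's k/v is concatenated along the time dim.
k_new = torch.randn(1, bsz, d_model)
cache["k"][0] = torch.cat([cache["k"][0], k_new], dim=0)
print(cache["k"][0].shape)  # torch.Size([9, 1, 512])
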
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/scaling.py DELETED
@@ -1,319 +0,0 @@
1
- # Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
2
- #
3
- # See ../../../../LICENSE for clarification regarding multiple authors
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- import logging
17
- import math
18
- import random
19
- from typing import Optional
20
- from typing import Tuple
21
- from typing import Union
22
-
23
- import torch
24
- import torch.nn as nn
25
- from torch import Tensor
26
-
27
-
28
- class DoubleSwishFunction(torch.autograd.Function):
29
- """
30
- double_swish(x) = x * torch.sigmoid(x-1)
31
- This is a definition, originally motivated by its close numerical
32
- similarity to swish(swish(x)), where swish(x) = x * sigmoid(x).
33
-
34
- Memory-efficient derivative computation:
35
- double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
36
- double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x).
37
- Now, s'(x) = s(x) * (1-s(x)).
38
- double_swish'(x) = x * s'(x) + s(x).
39
- = x * s(x) * (1-s(x)) + s(x).
40
- = double_swish(x) * (1-s(x)) + s(x)
41
- ... so we just need to remember s(x) but not x itself.
42
- """
43
-
44
- @staticmethod
45
- def forward(ctx, x: Tensor) -> Tensor:
46
- requires_grad = x.requires_grad
47
- x_dtype = x.dtype
48
- if x.dtype == torch.float16:
49
- x = x.to(torch.float32)
50
-
51
- s = torch.sigmoid(x - 1.0)
52
- y = x * s
53
-
54
- if requires_grad:
55
- deriv = y * (1 - s) + s
56
- # notes on derivative of x * sigmoid(x - 1):
57
- # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29
58
- # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bound
59
- # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound.
60
- # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which
61
- # floors), should be expectation-preserving.
62
- floor = -0.043637
63
- ceil = 1.2
64
- d_scaled = (deriv - floor) * (255.0 / (ceil - floor)
65
- ) + torch.rand_like(deriv)
66
- if __name__ == "__main__":
67
- # for self-testing only.
68
- assert d_scaled.min() >= 0.0
69
- assert d_scaled.max() < 256.0
70
- d_int = d_scaled.to(torch.uint8)
71
- ctx.save_for_backward(d_int)
72
- if x.dtype == torch.float16 or torch.is_autocast_enabled():
73
- y = y.to(torch.float16)
74
- return y
75
-
76
- @staticmethod
77
- def backward(ctx, y_grad: Tensor) -> Tensor:
78
- (d, ) = ctx.saved_tensors
79
- # the same constants as used in forward pass.
80
- floor = -0.043637
81
- ceil = 1.2
82
- d = d * ((ceil - floor) / 255.0) + floor
83
- return y_grad * d
84
-
85
-
86
- class DoubleSwish(torch.nn.Module):
87
- def forward(self, x: Tensor) -> Tensor:
88
- """Return double-swish activation function which is an approximation to Swish(Swish(x)),
89
- that we approximate closely with x * sigmoid(x-1).
90
- """
91
- if torch.jit.is_scripting() or torch.jit.is_tracing():
92
- return x * torch.sigmoid(x - 1.0)
93
- return DoubleSwishFunction.apply(x)
94
-
95
-
96
- class ActivationBalancerFunction(torch.autograd.Function):
97
- @staticmethod
98
- def forward(
99
- ctx,
100
- x: Tensor,
101
- scale_factor: Tensor,
102
- sign_factor: Optional[Tensor],
103
- channel_dim: int, ) -> Tensor:
104
- if channel_dim < 0:
105
- channel_dim += x.ndim
106
- ctx.channel_dim = channel_dim
107
- xgt0 = x > 0
108
- if sign_factor is None:
109
- ctx.save_for_backward(xgt0, scale_factor)
110
- else:
111
- ctx.save_for_backward(xgt0, scale_factor, sign_factor)
112
- return x
113
-
114
- @staticmethod
115
- def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
116
- if len(ctx.saved_tensors) == 3:
117
- xgt0, scale_factor, sign_factor = ctx.saved_tensors
118
- for _ in range(ctx.channel_dim, x_grad.ndim - 1):
119
- scale_factor = scale_factor.unsqueeze(-1)
120
- sign_factor = sign_factor.unsqueeze(-1)
121
- factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
122
- else:
123
- xgt0, scale_factor = ctx.saved_tensors
124
- for _ in range(ctx.channel_dim, x_grad.ndim - 1):
125
- scale_factor = scale_factor.unsqueeze(-1)
126
- factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5)
127
- neg_delta_grad = x_grad.abs() * factor
128
- return (x_grad - neg_delta_grad, None, None, None, )
129
-
130
-
131
- def _compute_scale_factor(
132
- x: Tensor,
133
- channel_dim: int,
134
- min_abs: float,
135
- max_abs: float,
136
- gain_factor: float,
137
- max_factor: float, ) -> Tensor:
138
- if channel_dim < 0:
139
- channel_dim += x.ndim
140
- sum_dims = [d for d in range(x.ndim) if d != channel_dim]
141
- x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32)
142
-
143
- if min_abs == 0.0:
144
- below_threshold = 0.0
145
- else:
146
- # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if
147
- # x_abs_mean << min_abs.
148
- below_threshold = (
149
- (min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(
150
- min=0, max=max_factor)
151
-
152
- above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(
153
- min=0, max=max_factor)
154
-
155
- return below_threshold - above_threshold
156
-
157
-
158
- def _compute_sign_factor(
159
- x: Tensor,
160
- channel_dim: int,
161
- min_positive: float,
162
- max_positive: float,
163
- gain_factor: float,
164
- max_factor: float, ) -> Tensor:
165
- if channel_dim < 0:
166
- channel_dim += x.ndim
167
- sum_dims = [d for d in range(x.ndim) if d != channel_dim]
168
- proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims)
169
- if min_positive == 0.0:
170
- factor1 = 0.0
171
- else:
172
- # 0 if proportion_positive >= min_positive, else can be
173
- # as large as max_factor.
174
- factor1 = ((min_positive - proportion_positive) *
175
- (gain_factor / min_positive)).clamp_(
176
- min=0, max=max_factor)
177
-
178
- if max_positive == 1.0:
179
- factor2 = 0.0
180
- else:
181
- # 0 if self.proportion_positive <= max_positive, else can be
182
- # as large as -max_factor.
183
- factor2 = ((proportion_positive - max_positive) *
184
- (gain_factor / (1.0 - max_positive))).clamp_(
185
- min=0, max=max_factor)
186
- sign_factor = factor1 - factor2
187
- # require min_positive != 0 or max_positive != 1:
188
- assert not isinstance(sign_factor, float)
189
- return sign_factor
190
-
191
-
192
- class ActivationBalancer(torch.nn.Module):
193
- """
194
- Modifies the backpropped derivatives of a function to try to encourage, for
195
- each channel, that it is positive at least a proportion `threshold` of the
196
- time. It does this by multiplying negative derivative values by up to
197
- (1+max_factor), and positive derivative values by up to (1-max_factor),
198
- interpolated from 1 at the threshold to those extremal values when none
199
- of the inputs are positive.
200
-
201
- Args:
202
- num_channels: the number of channels
203
- channel_dim: the dimension/axis corresponding to the channel, e.g.
204
- -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
205
- min_positive: the minimum, per channel, of the proportion of the time
206
- that (x > 0), below which we start to modify the derivatives.
207
- max_positive: the maximum, per channel, of the proportion of the time
208
- that (x > 0), above which we start to modify the derivatives.
209
- max_factor: the maximum factor by which we modify the derivatives for
210
- either the sign constraint or the magnitude constraint;
211
- e.g. with max_factor=0.02, the derivatives would be multiplied by
212
- values in the range [0.98..1.02].
213
- sign_gain_factor: determines the 'gain' with which we increase the
214
- change in gradient once the constraints on min_positive and max_positive
215
- are violated.
216
- scale_gain_factor: determines the 'gain' with which we increase the
217
- change in gradient once the constraints on min_abs and max_abs
218
- are violated.
219
- min_abs: the minimum average-absolute-value difference from the mean
220
- value per channel, which we allow, before we start to modify
221
- the derivatives to prevent this.
222
- max_abs: the maximum average-absolute-value difference from the mean
223
- value per channel, which we allow, before we start to modify
224
- the derivatives to prevent this.
225
- min_prob: determines the minimum probability with which we modify the
226
- gradients for the {min,max}_positive and {min,max}_abs constraints,
227
- on each forward(). This is done randomly to prevent all layers
228
- from doing it at the same time. Early in training we may use
229
- higher probabilities than this; it will decay to this value.
230
- """
231
-
232
- def __init__(
233
- self,
234
- num_channels: int,
235
- channel_dim: int,
236
- min_positive: float=0.05,
237
- max_positive: float=0.95,
238
- max_factor: float=0.04,
239
- sign_gain_factor: float=0.01,
240
- scale_gain_factor: float=0.02,
241
- min_abs: float=0.2,
242
- max_abs: float=100.0,
243
- min_prob: float=0.1, ):
244
- super(ActivationBalancer, self).__init__()
245
- self.num_channels = num_channels
246
- self.channel_dim = channel_dim
247
- self.min_positive = min_positive
248
- self.max_positive = max_positive
249
- self.max_factor = max_factor
250
- self.min_abs = min_abs
251
- self.max_abs = max_abs
252
- self.min_prob = min_prob
253
- self.sign_gain_factor = sign_gain_factor
254
- self.scale_gain_factor = scale_gain_factor
255
-
256
- # count measures how many times the forward() function has been called.
257
- # We occasionally sync this to a tensor called `count`, that exists to
258
- # make sure it is synced to disk when we load and save the model.
259
- self.cpu_count = 0
260
- self.register_buffer("count", torch.tensor(0, dtype=torch.int64))
261
-
262
- def forward(self, x: Tensor) -> Tensor:
263
- if (torch.jit.is_scripting() or not x.requires_grad or
264
- torch.jit.is_tracing()):
265
- return _no_op(x)
266
-
267
- count = self.cpu_count
268
- self.cpu_count += 1
269
-
270
- if random.random() < 0.01:
271
- # Occasionally sync self.cpu_count with self.count.
272
- # count affects the decay of 'prob'. don't do this on every iter,
273
- # because syncing with the GPU is slow.
274
- self.cpu_count = max(self.cpu_count, self.count.item())
275
- self.count.fill_(self.cpu_count)
276
-
277
- # the prob of doing some work exponentially decreases from 0.5 till it hits
278
- # a floor at min_prob (==0.1, by default)
279
- prob = max(self.min_prob, 0.5**(1 + (count / 4000.0)))
280
-
281
- if random.random() < prob:
282
- sign_gain_factor = 0.5
283
- if self.min_positive != 0.0 or self.max_positive != 1.0:
284
- sign_factor = _compute_sign_factor(
285
- x,
286
- self.channel_dim,
287
- self.min_positive,
288
- self.max_positive,
289
- gain_factor=self.sign_gain_factor / prob,
290
- max_factor=self.max_factor, )
291
- else:
292
- sign_factor = None
293
-
294
- scale_factor = _compute_scale_factor(
295
- x.detach(),
296
- self.channel_dim,
297
- min_abs=self.min_abs,
298
- max_abs=self.max_abs,
299
- gain_factor=self.scale_gain_factor / prob,
300
- max_factor=self.max_factor, )
301
- return ActivationBalancerFunction.apply(
302
- x,
303
- scale_factor,
304
- sign_factor,
305
- self.channel_dim, )
306
- else:
307
- return _no_op(x)
308
-
309
-
310
- def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0,
311
- min_prob=0.25) -> nn.Sequential:
312
- """
313
- ActivationBalancer -> DoubleSwish
314
- """
315
- balancer = ActivationBalancer(
316
- d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob)
317
- return nn.Sequential(
318
- balancer,
319
- DoubleSwish(), )
 
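A quick, self-contained sketch of the activation defined in scaling.py above: DoubleSwish computes x * sigmoid(x - 1), and BalancedDoubleSwish simply prepends an ActivationBalancer whose forward pass is the identity (it only reshapes gradients during training). The snippet reproduces the reference formula only and does not import the deleted module.

import torch

def double_swish(x):
    # Reference formula from DoubleSwishFunction / DoubleSwish.forward above.
    return x * torch.sigmoid(x - 1.0)

x = torch.randn(4, 10, 512, requires_grad=True)
y = double_swish(x)
y.sum().backward()
print(y.shape, x.grad.shape)  # both torch.Size([4, 10, 512])
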
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/modules/transformer.py DELETED
@@ -1,347 +0,0 @@
1
- # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py
2
- import copy
3
- import numbers
4
- from functools import partial
5
- from typing import Any
6
- from typing import Callable
7
- from typing import List
8
- from typing import Optional
9
- from typing import Tuple
10
- from typing import Union
11
-
12
- import torch
13
- from AR.modules.activation import MultiheadAttention
14
- from AR.modules.scaling import BalancedDoubleSwish
15
- from torch import nn
16
- from torch import Tensor
17
- from torch.nn import functional as F
18
-
19
- _shape_t = Union[int, List[int], torch.Size]
20
-
21
-
22
- class LayerNorm(nn.Module):
23
- __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
24
- normalized_shape: Tuple[int, ...]
25
- eps: float
26
- elementwise_affine: bool
27
-
28
- def __init__(
29
- self,
30
- normalized_shape: _shape_t,
31
- eps: float=1e-5,
32
- elementwise_affine: bool=True,
33
- device=None,
34
- dtype=None, ) -> None:
35
- factory_kwargs = {"device": device, "dtype": dtype}
36
- super(LayerNorm, self).__init__()
37
- if isinstance(normalized_shape, numbers.Integral):
38
- # mypy error: incompatible types in assignment
39
- normalized_shape = (normalized_shape, ) # type: ignore[assignment]
40
- self.normalized_shape = tuple(
41
- normalized_shape) # type: ignore[arg-type]
42
- self.eps = eps
43
- self.elementwise_affine = elementwise_affine
44
- if self.elementwise_affine:
45
- self.weight = nn.Parameter(
46
- torch.empty(self.normalized_shape, **factory_kwargs))
47
- self.bias = nn.Parameter(
48
- torch.empty(self.normalized_shape, **factory_kwargs))
49
- else:
50
- self.register_parameter("weight", None)
51
- self.register_parameter("bias", None)
52
-
53
- self.reset_parameters()
54
-
55
- def reset_parameters(self) -> None:
56
- if self.elementwise_affine:
57
- nn.init.ones_(self.weight)
58
- nn.init.zeros_(self.bias)
59
-
60
- def forward(self, input: Tensor, embedding: Any=None) -> Tensor:
61
- if isinstance(input, tuple):
62
- input, embedding = input
63
- return (F.layer_norm(
64
- input,
65
- self.normalized_shape,
66
- self.weight,
67
- self.bias,
68
- self.eps, ), embedding, )
69
-
70
- assert embedding is None
71
- return F.layer_norm(input, self.normalized_shape, self.weight,
72
- self.bias, self.eps)
73
-
74
- def extra_repr(self) -> str:
75
- return (
76
- "{normalized_shape}, eps={eps}, "
77
- "elementwise_affine={elementwise_affine}".format(**self.__dict__))
78
-
79
-
80
- class IdentityNorm(nn.Module):
81
- def __init__(
82
- self,
83
- d_model: int,
84
- eps: float=1e-5,
85
- device=None,
86
- dtype=None, ) -> None:
87
- super(IdentityNorm, self).__init__()
88
-
89
- def forward(self, input: Tensor, embedding: Any=None) -> Tensor:
90
- if isinstance(input, tuple):
91
- return input
92
-
93
- assert embedding is None
94
- return input
95
-
96
-
97
- class TransformerEncoder(nn.Module):
98
- r"""TransformerEncoder is a stack of N encoder layers. Users can build the
99
- BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.
100
-
101
- Args:
102
- encoder_layer: an instance of the TransformerEncoderLayer() class (required).
103
- num_layers: the number of sub-encoder-layers in the encoder (required).
104
- norm: the layer normalization component (optional).
105
- enable_nested_tensor: if True, input will automatically convert to nested tensor
106
- (and convert back on output). This will improve the overall performance of
107
- TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
108
-
109
- Examples::
110
- >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
111
- >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
112
- >>> src = torch.rand(10, 32, 512)
113
- >>> out = transformer_encoder(src)
114
- """
115
- __constants__ = ["norm"]
116
-
117
- def __init__(self, encoder_layer, num_layers, norm=None):
118
- super(TransformerEncoder, self).__init__()
119
- self.layers = _get_clones(encoder_layer, num_layers)
120
- self.num_layers = num_layers
121
- self.norm = norm
122
-
123
- def forward(
124
- self,
125
- src: Tensor,
126
- mask: Optional[Tensor]=None,
127
- src_key_padding_mask: Optional[Tensor]=None,
128
- return_layer_states: bool=False, cache=None) -> Tensor:
129
- r"""Pass the input through the encoder layers in turn.
130
-
131
- Args:
132
- src: the sequence to the encoder (required).
133
- mask: the mask for the src sequence (optional).
134
- src_key_padding_mask: the mask for the src keys per batch (optional).
135
- return_layer_states: return layers' state (optional).
136
-
137
- Shape:
138
- see the docs in Transformer class.
139
- """
140
- if return_layer_states:
141
- layer_states = [] # layers' output
142
- output = src
143
- for mod in self.layers:
144
- output = mod(
145
- output,
146
- src_mask=mask,
147
- src_key_padding_mask=src_key_padding_mask, cache=cache)
148
- layer_states.append(output[0])
149
-
150
- if self.norm is not None:
151
- output = self.norm(output)
152
-
153
- return layer_states, output
154
-
155
- output = src
156
- for mod in self.layers:
157
- output = mod(output,
158
- src_mask=mask,
159
- src_key_padding_mask=src_key_padding_mask, cache=cache)
160
-
161
- if self.norm is not None:
162
- output = self.norm(output)
163
-
164
- return output
165
-
166
-
167
- class TransformerEncoderLayer(nn.Module):
168
- __constants__ = ["batch_first", "norm_first"]
169
-
170
- def __init__(
171
- self,
172
- d_model: int,
173
- nhead: int,
174
- dim_feedforward: int=2048,
175
- dropout: float=0.1,
176
- activation: Union[str, Callable[[Tensor], Tensor]]=F.relu,
177
- batch_first: bool=False,
178
- norm_first: bool=False,
179
- device=None,
180
- dtype=None,
181
- linear1_self_attention_cls: nn.Module=nn.Linear,
182
- linear2_self_attention_cls: nn.Module=nn.Linear,
183
- linear1_feedforward_cls: nn.Module=nn.Linear,
184
- linear2_feedforward_cls: nn.Module=nn.Linear,
185
- layer_norm_cls: nn.Module=LayerNorm,
186
- layer_norm_eps: float=1e-5,
187
- adaptive_layer_norm=False, ) -> None:
188
- factory_kwargs = {"device": device, "dtype": dtype}
189
- super(TransformerEncoderLayer, self).__init__()
190
- # print(233333333333,d_model,nhead)
191
- # import os
192
- # os._exit(2333333)
193
- self.self_attn = MultiheadAttention(
194
- d_model,#512 16
195
- nhead,
196
- dropout=dropout,
197
- batch_first=batch_first,
198
- linear1_cls=linear1_self_attention_cls,
199
- linear2_cls=linear2_self_attention_cls,
200
- **factory_kwargs, )
201
-
202
- # Implementation of Feedforward model
203
- self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward,
204
- **factory_kwargs)
205
- self.dropout = nn.Dropout(dropout)
206
- self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model,
207
- **factory_kwargs)
208
-
209
- self.norm_first = norm_first
210
- self.dropout1 = nn.Dropout(dropout)
211
- self.dropout2 = nn.Dropout(dropout)
212
-
213
- # Legacy string support for activation function.
214
- if isinstance(activation, str):
215
- activation = _get_activation_fn(activation)
216
- elif isinstance(activation, partial):
217
- activation = activation(d_model)
218
- elif activation == BalancedDoubleSwish:
219
- activation = BalancedDoubleSwish(d_model)
220
-
221
- # # We can't test self.activation in forward() in TorchScript,
222
- # # so stash some information about it instead.
223
- # if activation is F.relu or isinstance(activation, torch.nn.ReLU):
224
- # self.activation_relu_or_gelu = 1
225
- # elif activation is F.gelu or isinstance(activation, torch.nn.GELU):
226
- # self.activation_relu_or_gelu = 2
227
- # else:
228
- # self.activation_relu_or_gelu = 0
229
- self.activation = activation
230
-
231
- norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
232
- if layer_norm_cls == IdentityNorm:
233
- norm2 = BalancedBasicNorm(
234
- d_model, eps=layer_norm_eps, **factory_kwargs)
235
- else:
236
- norm2 = layer_norm_cls(
237
- d_model, eps=layer_norm_eps, **factory_kwargs)
238
-
239
- if adaptive_layer_norm:
240
- self.norm1 = AdaptiveLayerNorm(d_model, norm1)
241
- self.norm2 = AdaptiveLayerNorm(d_model, norm2)
242
- else:
243
- self.norm1 = norm1
244
- self.norm2 = norm2
245
-
246
- def __setstate__(self, state):
247
- super(TransformerEncoderLayer, self).__setstate__(state)
248
- if not hasattr(self, "activation"):
249
- self.activation = F.relu
250
-
251
- def forward(
252
- self,
253
- src: Tensor,
254
- src_mask: Optional[Tensor]=None,
255
- src_key_padding_mask: Optional[Tensor]=None, cache=None) -> Tensor:
256
- r"""Pass the input through the encoder layer.
257
-
258
- Args:
259
- src: the sequence to the encoder layer (required).
260
- src_mask: the mask for the src sequence (optional).
261
- src_key_padding_mask: the mask for the src keys per batch (optional).
262
-
263
- Shape:
264
- see the docs in Transformer class.
265
- """
266
- x, stage_embedding = src, None
267
- is_src_tuple = False
268
- if isinstance(src, tuple):
269
- x, stage_embedding = src
270
- is_src_tuple = True
271
-
272
- if src_key_padding_mask is not None:
273
- _skpm_dtype = src_key_padding_mask.dtype
274
- if _skpm_dtype != torch.bool and not torch.is_floating_point(
275
- src_key_padding_mask):
276
- raise AssertionError(
277
- "only bool and floating types of key_padding_mask are supported"
278
- )
279
-
280
- if self.norm_first:
281
- x = x + self._sa_block(
282
- self.norm1(x, stage_embedding),
283
- src_mask,
284
- src_key_padding_mask, cache=cache)
285
- x = x + self._ff_block(self.norm2(x, stage_embedding))
286
- else:
287
- x = self.norm1(
288
- x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache),
289
- stage_embedding, )
290
- x = self.norm2(x + self._ff_block(x), stage_embedding)
291
-
292
- if is_src_tuple:
293
- return (x, stage_embedding)
294
- return x
295
-
296
- # self-attention block
297
- def _sa_block(
298
- self,
299
- x: Tensor,
300
- attn_mask: Optional[Tensor],
301
- key_padding_mask: Optional[Tensor], cache=None) -> Tensor:
302
- # print(x.shape,attn_mask.shape,key_padding_mask)
303
- #torch.Size([1, 188, 512]) torch.Size([188, 188]) None
304
- # import os
305
- # os._exit(23333)
306
- x = self.self_attn(
307
- x,
308
- x,
309
- x,
310
- attn_mask=attn_mask,
311
- key_padding_mask=key_padding_mask,
312
- need_weights=False, cache=cache)[0]
313
- return self.dropout1(x)
314
-
315
- # feed forward block
316
- def _ff_block(self, x: Tensor) -> Tensor:
317
- x = self.linear2(self.dropout(self.activation(self.linear1(x))))
318
- return self.dropout2(x)
319
-
320
-
321
- class AdaptiveLayerNorm(nn.Module):
322
- r"""Adaptive Layer Normalization"""
323
-
324
- def __init__(self, d_model, norm) -> None:
325
- super(AdaptiveLayerNorm, self).__init__()
326
- self.project_layer = nn.Linear(d_model, 2 * d_model)
327
- self.norm = norm
328
- self.d_model = d_model
329
- self.eps = self.norm.eps
330
-
331
- def forward(self, input: Tensor, embedding: Tensor=None) -> Tensor:
332
- if isinstance(input, tuple):
333
- input, embedding = input
334
- weight, bias = torch.split(
335
- self.project_layer(embedding),
336
- split_size_or_sections=self.d_model,
337
- dim=-1, )
338
- return (weight * self.norm(input) + bias, embedding)
339
-
340
- weight, bias = torch.split(
341
- self.project_layer(embedding),
342
- split_size_or_sections=self.d_model,
343
- dim=-1, )
344
- return weight * self.norm(input) + bias
345
-
346
- def _get_clones(module, N):
347
- return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
 
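The TransformerEncoder / TransformerEncoderLayer pair above mirrors the stock torch.nn API while adding a cache argument and pluggable norm/linear classes. A self-contained analogue with the sizes from configs/s1.yaml further down (hidden_dim=512, head=16, linear_units=2048, n_layer=12), using the standard PyTorch classes purely for illustration rather than the deleted ones:

import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=512, nhead=16, dim_feedforward=2048, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=12)

x = torch.randn(1, 188, 512)  # (batch, seq, d_model), matching the debug shape noted in _sa_block
out = encoder(x)
print(out.shape)              # torch.Size([1, 188, 512])
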
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/text_processing/__init__.py DELETED
File without changes
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/text_processing/phonemizer.py DELETED
@@ -1,80 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/phonemizer.py
2
- import itertools
3
- import re
4
- from typing import Dict
5
- from typing import List
6
-
7
- import regex
8
- from gruut import sentences
9
- from gruut.const import Sentence
10
- from gruut.const import Word
11
- from AR.text_processing.symbols import SYMBOL_TO_ID
12
-
13
-
14
- class GruutPhonemizer:
15
- def __init__(self, language: str):
16
- self._phonemizer = sentences
17
- self.lang = language
18
- self.symbol_to_id = SYMBOL_TO_ID
19
- self._special_cases_dict: Dict[str, str] = {
20
- r"\.\.\.": "... ",
21
- ";": "; ",
22
- ":": ": ",
23
- ",": ", ",
24
- r"\.": ". ",
25
- "!": "! ",
26
- r"\?": "? ",
27
- "—": "—",
28
- "…": "… ",
29
- "«": "«",
30
- "»": "»"
31
- }
32
- self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])"
33
-
34
- def _normalize_punctuation(self, text: str) -> str:
35
- text = regex.sub(fr"\pZ+{self._punctuation_regexp}", r"\1", text)
36
- text = regex.sub(fr"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
37
- text = regex.sub(r"\pZ+", r" ", text)
38
- return text.strip()
39
-
40
- def _convert_punctuation(self, word: Word) -> str:
41
- if not word.phonemes:
42
- return ''
43
- if word.phonemes[0] in ['‖', '|']:
44
- return word.text.strip()
45
-
46
- phonemes = ''.join(word.phonemes)
47
- # remove modifier characters ˈˌː with regex
48
- phonemes = re.sub(r'[ˈˌː͡]', '', phonemes)
49
- return phonemes.strip()
50
-
51
- def phonemize(self, text: str, espeak: bool=False) -> str:
52
- text_to_phonemize: str = self._normalize_punctuation(text)
53
- sents: List[Sentence] = [
54
- sent
55
- for sent in self._phonemizer(
56
- text_to_phonemize, lang="en-us", espeak=espeak)
57
- ]
58
- words: List[str] = [
59
- self._convert_punctuation(word) for word in itertools.chain(*sents)
60
- ]
61
- return ' '.join(words)
62
-
63
- def transform(self, phonemes):
64
- # convert phonemes to ids
65
- # dictionary is in symbols.py
66
- return [
67
- self.symbol_to_id[p] for p in phonemes
68
- if p in self.symbol_to_id.keys()
69
- ]
70
-
71
-
72
- if __name__ == "__main__":
73
- phonemizer = GruutPhonemizer("en-us")
74
- # text -> IPA
75
- phonemes = phonemizer.phonemize("Hello, wor-ld ?")
76
- print("phonemes:", phonemes)
77
- print("len(phonemes):", len(phonemes))
78
- phoneme_ids = phonemizer.transform(phonemes)
79
- print("phoneme_ids:", phoneme_ids)
80
- print("len(phoneme_ids):", len(phoneme_ids))
 
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/text_processing/symbols.py DELETED
@@ -1,9 +0,0 @@
1
- # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/symbols.py
2
- PAD = '_'
3
- PUNCTUATION = ';:,.!?¡¿—…"«»“” '
4
- LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
5
- IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
6
- SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
7
- SPACE_ID = SYMBOLS.index(" ")
8
- SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
9
- ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
 
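A short round-trip sketch of the lookup tables defined in symbols.py above; the symbol list is truncated here for brevity (the deleted file also includes the IPA letters).

PAD = '_'
PUNCTUATION = ';:,.!?¡¿—…"«»“” '
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS)
SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}

ids = [SYMBOL_TO_ID[c] for c in "Hello, world!" if c in SYMBOL_TO_ID]
print(ids)
print(''.join(ID_TO_SYMBOL[i] for i in ids))  # Hello, world!
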
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/utils/__init__.py DELETED
@@ -1,37 +0,0 @@
1
- import re
2
-
3
-
4
- def str2bool(str):
5
- return True if str.lower() == 'true' else False
6
-
7
-
8
- def get_newest_ckpt(string_list):
9
- # Regex pattern that captures the epoch and step numbers in a checkpoint filename
10
- pattern = r'epoch=(\d+)-step=(\d+)\.ckpt'
11
-
12
- # Extract the numbers from every filename with the regex and collect them as tuples
13
- extracted_info = []
14
- for string in string_list:
15
- match = re.match(pattern, string)
16
- if match:
17
- epoch = int(match.group(1))
18
- step = int(match.group(2))
19
- extracted_info.append((epoch, step, string))
20
- # Sort by the epoch number first, then by the step number
21
- sorted_info = sorted(
22
- extracted_info, key=lambda x: (x[0], x[1]), reverse=True)
23
- # Take the newest ckpt filename
24
- newest_ckpt = sorted_info[0][2]
25
- return newest_ckpt
26
-
27
-
28
- # Returns the text when the file exists and is not empty
29
- def check_txt_file(file_path):
30
- try:
31
- with open(file_path, 'r') as file:
32
- text = file.readline().strip()
33
- assert text.strip() != ''
34
- return text
35
- except Exception:
36
- return False
37
- return False
 
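Usage sketch for get_newest_ckpt above: it expects checkpoint names of the form epoch=E-step=S.ckpt and returns the one with the highest (epoch, step) pair. A stand-alone illustration of the same regex, with made-up filenames:

import re

pattern = r'epoch=(\d+)-step=(\d+)\.ckpt'  # same pattern as in get_newest_ckpt
names = ["epoch=3-step=1200.ckpt", "epoch=12-step=800.ckpt", "epoch=12-step=4000.ckpt"]
parsed = [(int(m.group(1)), int(m.group(2)), n)
          for n in names if (m := re.match(pattern, n))]
print(max(parsed)[2])  # epoch=12-step=4000.ckpt
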
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/utils/initialize.py DELETED
@@ -1,38 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Initialize modules for espnet2 neural networks."""
3
- import torch
4
- from typeguard import check_argument_types
5
-
6
-
7
- def initialize(model: torch.nn.Module, init: str):
8
- """Initialize weights of a neural network module.
9
-
10
- Parameters are initialized using the given method or distribution.
11
-
12
- Custom initialization routines can be implemented into submodules
13
- as function `espnet_initialization_fn` within the custom module.
14
-
15
- Args:
16
- model: Target.
17
- init: Method of initialization.
18
- """
19
- assert check_argument_types()
20
- print("init with", init)
21
-
22
- # weight init
23
- for p in model.parameters():
24
- if p.dim() > 1:
25
- if init == "xavier_uniform":
26
- torch.nn.init.xavier_uniform_(p.data)
27
- elif init == "xavier_normal":
28
- torch.nn.init.xavier_normal_(p.data)
29
- elif init == "kaiming_uniform":
30
- torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
31
- elif init == "kaiming_normal":
32
- torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
33
- else:
34
- raise ValueError("Unknown initialization: " + init)
35
- # bias init
36
- for name, p in model.named_parameters():
37
- if ".bias" in name and p.dim() == 1:
38
- p.data.zero_()
 
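A self-contained equivalent of calling initialize(model, "xavier_uniform") from the file above, applied to a toy model (the model itself is only an illustration):

import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))

# Weight init: every parameter with more than one dimension gets Xavier-uniform.
for p in model.parameters():
    if p.dim() > 1:
        torch.nn.init.xavier_uniform_(p.data)

# Bias init: 1-D parameters whose name contains ".bias" are zeroed.
for name, p in model.named_parameters():
    if ".bias" in name and p.dim() == 1:
        p.data.zero_()
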
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/AR/utils/io.py DELETED
@@ -1,32 +0,0 @@
1
- import sys
2
-
3
- import torch
4
- import yaml
5
-
6
-
7
- def load_yaml_config(path):
8
- with open(path) as f:
9
- config = yaml.full_load(f)
10
- return config
11
-
12
-
13
- def save_config_to_yaml(config, path):
14
- assert path.endswith('.yaml')
15
- with open(path, 'w') as f:
16
- f.write(yaml.dump(config))
17
- f.close()
18
-
19
-
20
- def write_args(args, path):
21
- args_dict = dict((name, getattr(args, name)) for name in dir(args)
22
- if not name.startswith('_'))
23
- with open(path, 'a') as args_file:
24
- args_file.write('==> torch version: {}\n'.format(torch.__version__))
25
- args_file.write(
26
- '==> cudnn version: {}\n'.format(torch.backends.cudnn.version()))
27
- args_file.write('==> Cmd:\n')
28
- args_file.write(str(sys.argv))
29
- args_file.write('\n==> args:\n')
30
- for k, v in sorted(args_dict.items()):
31
- args_file.write(' %s: %s\n' % (str(k), str(v)))
32
- args_file.close()
 
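A round-trip sketch of the two YAML helpers above (save_config_to_yaml / load_yaml_config); it assumes PyYAML is installed and writes to a temporary file rather than a real config path:

import os
import tempfile

import yaml

config = {"train": {"seed": 1234, "epochs": 300}}
path = os.path.join(tempfile.mkdtemp(), "s1.yaml")

with open(path, 'w') as f:    # mirrors save_config_to_yaml
    f.write(yaml.dump(config))

with open(path) as f:         # mirrors load_yaml_config
    print(yaml.full_load(f))  # {'train': {'seed': 1234, 'epochs': 300}}
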
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/configs/s1.yaml DELETED
@@ -1,31 +0,0 @@
1
- train:
2
- seed: 1234
3
- epochs: 300
4
- batch_size: 8
5
- gradient_accumulation: 4
6
- save_every_n_epoch: 1
7
- precision: 16
8
- gradient_clip: 1.0
9
- optimizer:
10
- lr: 0.01
11
- lr_init: 0.00001
12
- lr_end: 0.0001
13
- warmup_steps: 2000
14
- decay_steps: 40000
15
- data:
16
- max_eval_sample: 8
17
- max_sec: 54
18
- num_workers: 1
19
- pad_val: 1024 # same as the EOS id in the model
20
- model:
21
- vocab_size: 1025
22
- phoneme_vocab_size: 512
23
- embedding_dim: 512
24
- hidden_dim: 512
25
- head: 16
26
- linear_units: 2048
27
- n_layer: 12
28
- dropout: 0
29
- EOS: 1024
30
- inference:
31
- top_k: 5
 
GPT-SoVITS-models/GPT-SoVITS/GPT_SoVITS/configs/s1big.yaml DELETED
@@ -1,31 +0,0 @@
1
- train:
2
- seed: 1234
3
- epochs: 300
4
- batch_size: 8
5
- gradient_accumulation: 4
6
- save_every_n_epoch: 1
7
- precision: 16-mixed
8
- gradient_clip: 1.0
9
- optimizer:
10
- lr: 0.01
11
- lr_init: 0.00001
12
- lr_end: 0.0001
13
- warmup_steps: 2000
14
- decay_steps: 40000
15
- data:
16
- max_eval_sample: 8
17
- max_sec: 54
18
- num_workers: 1
19
- pad_val: 1024 # same as the EOS id in the model
20
- model:
21
- vocab_size: 1025
22
- phoneme_vocab_size: 512
23
- embedding_dim: 1024
24
- hidden_dim: 1024
25
- head: 16
26
- linear_units: 2048
27
- n_layer: 16
28
- dropout: 0
29
- EOS: 1024
30
- inference:
31
- top_k: 5
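
For quick reference, the only fields where s1big.yaml differs from s1.yaml above (all values taken directly from the two configs):

differences = {
    "train.precision":     ("16", "16-mixed"),
    "model.embedding_dim": (512, 1024),
    "model.hidden_dim":    (512, 1024),
    "model.n_layer":       (12, 16),
}
for key, (s1, s1big) in differences.items():
    print(f"{key}: s1={s1}  s1big={s1big}")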