Upload 44 files
- Dockerfile +19 -4
- LICENSE +1 -1
- LICENSE-MoeGoe +21 -0
- app.py +336 -71
- docker-compose.yaml +3 -2
- models.py +1 -1
- optimizer_removal.py +16 -0
- request.py +265 -0
- test.py +11 -0
- text/__pycache__/__init__.cpython-310.pyc +0 -0
- text/__pycache__/cantonese.cpython-310.pyc +0 -0
- text/__pycache__/cleaners.cpython-310.pyc +0 -0
- text/__pycache__/english.cpython-310.pyc +0 -0
- text/__pycache__/japanese.cpython-310.pyc +0 -0
- text/__pycache__/korean.cpython-310.pyc +0 -0
- text/__pycache__/mandarin.cpython-310.pyc +0 -0
- text/__pycache__/ngu_dialect.cpython-310.pyc +0 -0
- text/__pycache__/shanghainese.cpython-310.pyc +0 -0
- text/cantonese.py +15 -4
- text/cleaners.py +140 -36
- text/mandarin.py +15 -3
- text/shanghainese.py +16 -5
- utils/__pycache__/merge.cpython-310.pyc +0 -0
- utils/__pycache__/nlp.cpython-310.pyc +0 -0
- utils/__pycache__/utils.cpython-310.pyc +0 -0
- utils/merge.py +161 -0
- utils/nlp.py +82 -0
- utils/utils.py +112 -0
- vits-simple-api-installer-latest.sh +27 -0
- voice.py +408 -153
Dockerfile
CHANGED

@@ -1,4 +1,4 @@
-FROM python:3.
+FROM python:3.10.11-slim-bullseye
 
 RUN mkdir -p /app
 WORKDIR /app
@@ -7,16 +7,31 @@ ENV DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update && \
     apt install build-essential -yq && \
+    apt install espeak-ng -yq && \
+    apt install cmake -yq && \
+    apt install -y wget -yq && \
    apt-get clean && \
    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
    rm -rf /var/lib/apt/lists/*
 
+RUN pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0
+
+RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
+    tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
+    cd openjtalk-0.3.0.dev2 && \
+    rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
+    python setup.py install && \
+    cd ../ && \
+    rm -f openjtalk-0.3.0.dev2.tar.gz && \
+    rm -rf openjtalk-0.3.0.dev2
+
+RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
+
 COPY requirements.txt /app
 RUN pip install -r requirements.txt
 
 COPY . /app
 
-EXPOSE
+EXPOSE 23456
 
 CMD ["python", "/app/app.py"]

LICENSE
CHANGED

@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c)
+Copyright (c) 2023 Artrajz
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

LICENSE-MoeGoe
ADDED

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 CjangCjengh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

app.py
CHANGED

@@ -1,13 +1,15 @@
 import os
-import gradio as gr
 import logging
+import time
+import logzero
 import uuid
-
-from flask import Flask, request, send_file, jsonify
+from flask import Flask, request, send_file, jsonify, make_response
 from werkzeug.utils import secure_filename
 from flask_apscheduler import APScheduler
-
-from utils import clean_folder,
+from functools import wraps
+from utils.utils import clean_folder, check_is_none
+from utils.merge import merge_model
+from io import BytesIO
 
 app = Flask(__name__)
 app.config.from_pyfile("config.py")
@@ -16,104 +18,367 @@ scheduler = APScheduler()
 scheduler.init_app(app)
 scheduler.start()
 
+logzero.loglevel(logging.WARNING)
+logger = logging.getLogger("vits-simple-api")
+level = app.config.get("LOGGING_LEVEL", "DEBUG")
+level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
+              'CRITICAL': logging.CRITICAL}
+logging.basicConfig(level=level_dict[level])
 logging.getLogger('numba').setLevel(logging.WARNING)
 
-CUSTOM_PATH = "/gradio"
+tts = merge_model(app.config["MODEL_LIST"])
 
 if not os.path.exists(app.config['UPLOAD_FOLDER']):
+    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+if not os.path.exists(app.config['CACHE_PATH']):
+    os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
 
 
+def require_api_key(func):
+    @wraps(func)
+    def check_api_key(*args, **kwargs):
+        if not app.config.get('API_KEY_ENABLED', False):
+            return func(*args, **kwargs)
+        else:
+            api_key = request.args.get('api_key') or request.headers.get('X-API-KEY')
+            if api_key and api_key == app.config['API_KEY']:
+                return func(*args, **kwargs)
+            else:
+                return make_response(jsonify({"status": "error", "message": "Invalid API Key"}), 401)
+
+    return check_api_key
+
+
+@app.route('/', methods=["GET", "POST"])
 def index():
-    return "
+    return "vits-simple-api"
 
 
 @app.route('/voice/speakers', methods=["GET", "POST"])
 def voice_speakers_api():
-    return jsonify(speakers_list)
+    return jsonify(tts.voice_speakers)
 
 
 @app.route('/voice', methods=["GET", "POST"])
+@app.route('/voice/vits', methods=["GET", "POST"])
+@require_api_key
+def voice_vits_api():
+    try:
+        if request.method == "GET":
+            text = request.args.get("text", "")
+            id = int(request.args.get("id", app.config.get("ID", 0)))
+            format = request.args.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.args.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.args.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.args.get("max", app.config.get("MAX", 50)))
+        elif request.method == "POST":
+            text = request.form.get("text", "")
+            id = int(request.form.get("id", app.config.get("ID", 0)))
+            format = request.form.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.form.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.form.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.form.get("max", app.config.get("MAX", 50)))
+    except Exception as e:
+        logger.error(f"[VITS] {e}")
+        return make_response("parameter error", 400)
+
+    logger.info(f"[VITS] id:{id} format:{format} lang:{lang} length:{length} noise:{noise} noisew:{noisew}")
+    logger.info(f"[VITS] len:{len(text)} text:{text}")
+
+    if check_is_none(text):
+        logger.info(f"[VITS] text is empty")
+        return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
+
+    if check_is_none(id):
+        logger.info(f"[VITS] speaker id is empty")
+        return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+    if id < 0 or id >= tts.vits_speakers_count:
+        logger.info(f"[VITS] speaker id {id} does not exist")
+        return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+
+    speaker_lang = tts.voice_speakers["VITS"][id].get('lang')
+    if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
+        logger.info(f"[VITS] lang \"{lang}\" is not in {speaker_lang}")
+        return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
+
+    if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
+        speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
+
+    fname = f"{str(uuid.uuid1())}.{format}"
+    file_type = f"audio/{format}"
+
+    t1 = time.time()
+    output = tts.vits_infer({"text": text,
+                             "id": id,
+                             "format": format,
+                             "length": length,
+                             "noise": noise,
+                             "noisew": noisew,
+                             "max": max,
+                             "lang": lang,
+                             "speaker_lang": speaker_lang})
+    t2 = time.time()
+    logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
 
     return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
-@app.route('/voice/
-    return jsonify("method should be POST")
+@app.route('/voice/hubert-vits', methods=["POST"])
+@require_api_key
+def voice_hubert_api():
     if request.method == "POST":
+        try:
+            voice = request.files['upload']
+            id = int(request.form.get("id"))
+            format = request.form.get("format", app.config.get("LANG", "auto"))
+            length = float(request.form.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+        except Exception as e:
+            logger.error(f"[hubert] {e}")
+            return make_response("parameter error", 400)
 
+        logger.info(f"[hubert] id:{id} format:{format} length:{length} noise:{noise} noisew:{noisew}")
 
+        fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
+        voice.save(os.path.join(app.config['UPLOAD_FOLDER'], fname))
 
+        if check_is_none(id):
+            logger.info(f"[hubert] speaker id is empty")
+            return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+        if id < 0 or id >= tts.hubert_speakers_count:
+            logger.info(f"[hubert] speaker id {id} does not exist")
+            return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+
+        file_type = f"audio/{format}"
+
+        t1 = time.time()
+        output = tts.hubert_vits_infer({"id": id,
+                                        "format": format,
+                                        "length": length,
+                                        "noise": noise,
+                                        "noisew": noisew,
+                                        "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
+        t2 = time.time()
+        logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
+
+        return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+
+
+@app.route('/voice/w2v2-vits', methods=["GET", "POST"])
+@require_api_key
+def voice_w2v2_api():
+    try:
+        if request.method == "GET":
+            text = request.args.get("text", "")
+            id = int(request.args.get("id", app.config.get("ID", 0)))
+            format = request.args.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.args.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.args.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.args.get("max", app.config.get("MAX", 50)))
+            emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
+        elif request.method == "POST":
+            text = request.form.get("text", "")
+            id = int(request.form.get("id", app.config.get("ID", 0)))
+            format = request.form.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.form.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.form.get("length"))
+            noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.form.get("max", app.config.get("MAX", 50)))
+            emotion = int(request.form.get("emotion", app.config.get("EMOTION", 0)))
+    except Exception as e:
+        logger.error(f"[w2v2] {e}")
+        return make_response(f"parameter error", 400)
+
+    logger.info(f"[w2v2] id:{id} format:{format} lang:{lang} "
+                f"length:{length} noise:{noise} noisew:{noisew} emotion:{emotion}")
+    logger.info(f"[w2v2] len:{len(text)} text:{text}")
+
+    if check_is_none(text):
+        logger.info(f"[w2v2] text is empty")
+        return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
+
+    if check_is_none(id):
+        logger.info(f"[w2v2] speaker id is empty")
+        return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+    if id < 0 or id >= tts.w2v2_speakers_count:
+        logger.info(f"[w2v2] speaker id {id} does not exist")
+        return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+
+    speaker_lang = tts.voice_speakers["W2V2-VITS"][id].get('lang')
+    if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
+        logger.info(f"[w2v2] lang \"{lang}\" is not in {speaker_lang}")
+        return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
+
+    if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
+        speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
+
+    fname = f"{str(uuid.uuid1())}.{format}"
+    file_type = f"audio/{format}"
 
+    t1 = time.time()
+    output = tts.w2v2_vits_infer({"text": text,
+                                  "id": id,
+                                  "format": format,
+                                  "length": length,
+                                  "noise": noise,
+                                  "noisew": noisew,
+                                  "max": max,
+                                  "lang": lang,
+                                  "emotion": emotion,
+                                  "speaker_lang": speaker_lang})
+    t2 = time.time()
+    logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
+
+    return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
+@app.route('/voice/conversion', methods=["POST"])
+@app.route('/voice/vits/conversion', methods=["POST"])
+@require_api_key
+def vits_voice_conversion_api():
+    if request.method == "POST":
+        try:
+            voice = request.files['upload']
+            original_id = int(request.form["original_id"])
+            target_id = int(request.form["target_id"])
+            format = request.form.get("format", voice.filename.split(".")[1])
+        except Exception as e:
+            logger.error(f"[vits_voice_convertsion] {e}")
+            return make_response("parameter error", 400)
 
-        if voice_obj[original_id][2] != voice_obj[target_id][2]:
-            form["status"] = "error"
-            form["message"] = "speaker IDs are in diffrent Model!"
-            return form
+        fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
+        audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
+        voice.save(audio_path)
         file_type = f"audio/{format}"
 
+        logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
+        t1 = time.time()
+        try:
+            output = tts.vits_voice_conversion({"audio_path": audio_path,
+                                                "original_id": original_id,
+                                                "target_id": target_id,
+                                                "format": format})
+        except Exception as e:
+            logger.info(f"[vits_voice_convertsion] {e}")
+            return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+        t2 = time.time()
+        logger.info(f"finish in {(t2 - t1):.2f}s")
+
         return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
-        # return output
 
 
+@app.route('/voice/ssml', methods=["POST"])
+@require_api_key
+def ssml():
+    try:
+        ssml = request.form["ssml"]
+    except Exception as e:
+        logger.info(f"[ssml] {e}")
+        return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
+
+    logger.debug(ssml)
+
+    t1 = time.time()
+    try:
+        output, format = tts.create_ssml_infer_task(ssml)
+    except Exception as e:
+        logger.info(f"[ssml] {e}")
+        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+    t2 = time.time()
+
+    fname = f"{str(uuid.uuid1())}.{format}"
+    file_type = f"audio/{format}"
+
+    logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
+
+    return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+
+
+@app.route('/voice/dimension-emotion', methods=["POST"])
+def dimensional_emotion():
+    if request.method == "POST":
+        try:
+            audio = request.files['upload']
+        except Exception as e:
+            logger.error(f"[dimensional_emotion] {e}")
+            return make_response("parameter error", 400)
+
+        content = BytesIO(audio.read())
+
+        file_type = "application/octet-stream; charset=ascii"
+        fname = os.path.splitext(audio.filename)[0] + ".npy"
+        output = tts.get_dimensional_emotion_npy(content)
+
+        return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+
+
+@app.route('/voice/check', methods=["GET", "POST"])
+def check():
+    try:
+        if request.method == "GET":
+            model = request.args.get("model")
+            id = int(request.args.get("id"))
+        elif request.method == "POST":
+            model = request.form["model"]
+            id = int(request.form["id"])
+    except Exception as e:
+        logger.info(f"[check] {e}")
+        return make_response(jsonify({"status": "error", "message": "parameter error"}), 400)
+
+    if check_is_none(model):
+        logger.info(f"[check] model {model} is empty")
+        return make_response(jsonify({"status": "error", "message": "model is empty"}), 400)
+
+    if model.upper() not in ("VITS", "HUBERT", "W2V2"):
+        res = make_response(jsonify({"status": "error", "message": f"model {model} does not exist"}))
+        res.status = 404
+        logger.info(f"[check] speaker id {id} error")
+        return res
+
+    if check_is_none(id):
+        logger.info(f"[check] speaker id is empty")
+        return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+    if model.upper() == "VITS":
+        speaker_list = tts.voice_speakers["VITS"]
+    elif model.upper() == "HUBERT":
+        speaker_list = tts.voice_speakers["HUBERT-VITS"]
+    elif model.upper() == "W2V2":
+        speaker_list = tts.voice_speakers["W2V2-VITS"]
+
+    if len(speaker_list) == 0:
+        logger.info(f"[check] {model} not loaded")
+        return make_response(jsonify({"status": "error", "message": f"{model} not loaded"}), 400)
+
+    if id < 0 or id >= len(speaker_list):
+        logger.info(f"[check] speaker id {id} does not exist")
+        return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+    name = str(speaker_list[id]["name"])
+    lang = speaker_list[id]["lang"]
+    logger.info(f"[check] check id:{id} name:{name} lang:{lang}")
+
+    return make_response(jsonify({"status": "success", "id": id, "name": name, "lang": lang}), 200)
+
+
+# regular cleaning
+@scheduler.task('interval', id='clean_task', seconds=3600, misfire_grace_time=900)
 def clean_task():
     clean_folder(app.config["UPLOAD_FOLDER"])
-    clean_folder(app.config["
+    clean_folder(app.config["CACHE_PATH"])
 
 
 if __name__ == '__main__':
-    app =
-    # app.run(host='0.0.0.0', port=app.config["PORT"])  # use this when exposing the service; docker deployments use it too
-    # app.run(host='127.0.0.1', port=app.config["PORT"], debug=True)  # run locally / debug
+    app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False))  # external access
+    # app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True)  # run locally / debug

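Taken together, the rewritten app.py turns the service into a plain REST API with per-model endpoints and optional API-key protection. A minimal client call against the new /voice/vits route, assuming a local server with API_KEY_ENABLED turned on (the key value below is hypothetical):

import requests

# GET /voice/vits; require_api_key accepts the key either as an
# X-API-KEY header or as an ?api_key= query parameter
res = requests.get(
    "http://127.0.0.1:23456/voice/vits",
    params={"text": "你好", "id": 0, "format": "wav", "lang": "auto"},
    headers={"X-API-KEY": "my-secret-key"},  # hypothetical key value
)
with open("out.wav", "wb") as f:
    f.write(res.content)
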
docker-compose.yaml
CHANGED

@@ -1,12 +1,13 @@
 version: '3.4'
 services:
-
-    image: artrajz/
+  vits:
+    image: artrajz/vits-simple-api:latest
     restart: always
     ports:
       - 23456:23456
     environment:
       LANG: 'C.UTF-8'
+      TZ: Asia/Shanghai  # timezone
     volumes:
       - ./Model:/app/Model  # mount the model folder
       - ./config.py:/app/config.py  # mount the config file

models.py
CHANGED

@@ -363,7 +363,7 @@ class SynthesizerTrn(nn.Module):
         else:
             self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
 
-        if n_speakers
+        if n_speakers >= 1:
             self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
     def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):

optimizer_removal.py
ADDED

@@ -0,0 +1,16 @@
+from torch import load, save
+
+if __name__ == '__main__':
+    print("优化器通常不会被用于推理阶段,如果只用于推理可以去除优化器以减小模型体积\n")
+    input_path = input("请输入模型的路径:")
+    output_path = f"{input_path.split('.')[0]}_inference.pth"
+    checkpoint_dict = load(input_path, map_location='cpu')
+    checkpoint_dict_new = {}
+    for k, v in checkpoint_dict.items():
+        if k == "optimizer":
+            print(f"remove optimizer")
+            continue
+        checkpoint_dict_new[k] = v
+    save(checkpoint_dict_new, output_path)
+    print("finish")
+    print(output_path)

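The prompts printed by optimizer_removal.py are in Chinese; they explain that the optimizer state is only needed for training, so stripping the "optimizer" entry shrinks a checkpoint meant purely for inference, then ask for the model path and write <name>_inference.pth. The same idea non-interactively (a sketch; the paths are illustrative):

import torch

# Drop the training-only optimizer state to shrink a checkpoint.
ckpt = torch.load("Model/G_latest.pth", map_location="cpu")  # illustrative path
ckpt.pop("optimizer", None)
torch.save(ckpt, "Model/G_latest_inference.pth")
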
request.py
ADDED

@@ -0,0 +1,265 @@
+import re
+import requests
+import os
+import random
+import string
+from requests_toolbelt.multipart.encoder import MultipartEncoder
+
+abs_path = os.path.dirname(__file__)
+base = "http://127.0.0.1:23456"
+
+
+# speaker mapping table
+def voice_speakers():
+    url = f"{base}/voice/speakers"
+
+    res = requests.post(url=url)
+    json = res.json()
+    for i in json:
+        print(i)
+        for j in json[i]:
+            print(j)
+    return json
+
+
+# speech synthesis: voice vits
+def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
+    fields = {
+        "text": text,
+        "id": str(id),
+        "format": format,
+        "lang": lang,
+        "length": str(length),
+        "noise": str(noise),
+        "noisew": str(noisew),
+        "max": str(max)
+    }
+    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+    m = MultipartEncoder(fields=fields, boundary=boundary)
+    headers = {"Content-Type": m.content_type}
+    url = f"{base}/voice"
+
+    res = requests.post(url=url, data=m, headers=headers)
+    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+    path = f"{abs_path}/{fname}"
+
+    with open(path, "wb") as f:
+        f.write(res.content)
+    print(path)
+    return path
+
+
+# voice conversion: hubert-vits
+def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
+    upload_name = os.path.basename(upload_path)
+    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav, ogg
+
+    with open(upload_path, 'rb') as upload_file:
+        fields = {
+            "upload": (upload_name, upload_file, upload_type),
+            "id": str(id),
+            "format": format,
+            "length": str(length),
+            "noise": str(noise),
+            "noisew": str(noisew),
+        }
+        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+        m = MultipartEncoder(fields=fields, boundary=boundary)
+        headers = {"Content-Type": m.content_type}
+        url = f"{base}/voice/hubert-vits"
+
+        res = requests.post(url=url, data=m, headers=headers)
+        fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+        path = f"{abs_path}/{fname}"
+
+        with open(path, "wb") as f:
+            f.write(res.content)
+        print(path)
+        return path
+
+
+# dimensional emotion model: w2v2-vits
+def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
+    fields = {
+        "text": text,
+        "id": str(id),
+        "format": format,
+        "lang": lang,
+        "length": str(length),
+        "noise": str(noise),
+        "noisew": str(noisew),
+        "max": str(max),
+        "emotion": str(emotion)
+    }
+    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+    m = MultipartEncoder(fields=fields, boundary=boundary)
+    headers = {"Content-Type": m.content_type}
+    url = f"{base}/voice/w2v2-vits"
+
+    res = requests.post(url=url, data=m, headers=headers)
+    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+    path = f"{abs_path}/{fname}"
+
+    with open(path, "wb") as f:
+        f.write(res.content)
+    print(path)
+    return path
+
+
+# voice conversion between speakers within the same VITS model
+def voice_conversion(upload_path, original_id, target_id):
+    upload_name = os.path.basename(upload_path)
+    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav, ogg
+
+    with open(upload_path, 'rb') as upload_file:
+        fields = {
+            "upload": (upload_name, upload_file, upload_type),
+            "original_id": str(original_id),
+            "target_id": str(target_id),
+        }
+        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+        m = MultipartEncoder(fields=fields, boundary=boundary)
+
+        headers = {"Content-Type": m.content_type}
+        url = f"{base}/voice/conversion"
+
+        res = requests.post(url=url, data=m, headers=headers)
+
+        fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+        path = f"{abs_path}/{fname}"
+
+        with open(path, "wb") as f:
+            f.write(res.content)
+        print(path)
+        return path
+
+
+def voice_ssml(ssml):
+    fields = {
+        "ssml": ssml,
+    }
+    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+    m = MultipartEncoder(fields=fields, boundary=boundary)
+    headers = {"Content-Type": m.content_type}
+    url = f"{base}/voice/ssml"
+
+    res = requests.post(url=url, data=m, headers=headers)
+    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+    path = f"{abs_path}/{fname}"
+
+    with open(path, "wb") as f:
+        f.write(res.content)
+    print(path)
+    return path
+
+
+def voice_dimensional_emotion(upload_path):
+    upload_name = os.path.basename(upload_path)
+    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav, ogg
+
+    with open(upload_path, 'rb') as upload_file:
+        fields = {
+            "upload": (upload_name, upload_file, upload_type),
+        }
+        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+        m = MultipartEncoder(fields=fields, boundary=boundary)
+        headers = {"Content-Type": m.content_type}
+        url = f"{base}/voice/dimension-emotion"
+
+        res = requests.post(url=url, data=m, headers=headers)
+        fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+        path = f"{abs_path}/{fname}"
+
+        with open(path, "wb") as f:
+            f.write(res.content)
+        print(path)
+        return path
+
+
+import time
+
+# while 1:
+#     text = input()
+#     l = len(text)
+#     time1 = time.time()
+#     voice_vits(text)
+#     time2 = time.time()
+#     print(f"len:{l}耗时:{time2 - time1}")
+
+# text = "你好"
+
+
+# ssml = """
+# <speak lang="zh" format="mp3" length="1.2">
+#     <voice id="92" >这几天心里颇不宁静。</voice>
+#     <voice id="125">今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
+#     <voice id="142">月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
+#     <voice id="98">妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
+#     <voice id="120">我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
+#     <voice id="121">沿着荷塘,是一条曲折的小煤屑路。</voice>
+#     <voice id="122">这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
+#     <voice id="123">荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
+#     <voice id="124">路的一旁,是些杨柳,和一些不知道名字的树。</voice>
+#     <voice id="125">没有月光的晚上,这路上阴森森的,有些怕人。</voice>
+#     <voice id="126">今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
+#     <voice id="127">路上只我一个人,背着手踱着。</voice>
+#     <voice id="128">这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
+#     <voice id="129">我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
+#     <voice id="130">像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
+#     <voice id="131">白天里一定要做的事,一定要说的话,现在都可不理。</voice>
+#     <voice id="132">这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
+# </speak>
+# """
+# ssml = """
+# <speak lang="zh">
+#     <voice id="92" length="1.4">这几天心里颇不宁静。今晚<break/>在院子里坐着乘凉,忽然想起<break/>日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
+#     <voice id="142" length="1.4">月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice><break time="2s"/>
+#     <voice id="0" length="1.4" model="w2v2-vits" lang="ja">こんにちは</voice>
+# </speak>
+# """
+# ssml = """
+# <speak lang="ja">
+#     <voice id="142" length="1.4">こんにちは</voice>
+#     <voice id="0" length="1.4" model="w2v2-vits" emotion="177">こんにちは</voice>
+#     <voice id="0" length="1.4" model="w2v2-vits">こんにちは</voice>
+# </speak>
+# """
+ssml = """
+<speak lang="auto">
+    <voice>这几天心里颇不宁静。</voice>
+    <voice>今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
+    <voice>月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
+    <voice>妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
+    <voice>我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
+    <voice>沿着荷塘,是一条曲折的小煤屑路。</voice>
+    <voice>这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
+    <voice>荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
+    <voice>路的一旁,是些杨柳,和一些不知道名字的树。</voice>
+    <voice>没有月光的晚上,这路上阴森森的,有些怕人。</voice>
+    <voice>今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
+    <voice>路上只我一个人,背着手踱着。</voice>
+    <voice>这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
+    <voice>我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
+    <voice>像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
+    <voice>白天里一定要做的事,一定要说的话,现在都可不理。</voice>
+    <voice>这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
+</speak>
+"""
+
+text = """猫咪是爱撒娇、爱玩耍的小家伙,通常有着柔软的绒毛和温柔的眼神,是许多人都喜欢的宠物哦~它们特别喜欢舔自己的毛发,用柔顺的小脑袋搓人的脚丫子,还能给人带来很多欢乐和温馨。
+"""
+t1 = time.time()
+# voice_conversion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 91, 93)
+# voice_hubert_vits("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav",0)
+# voice_vits(text,format="wav",lang="zh")
+# voice_w2v2_vits(text,emotion=111)
+# os.system(voice_ssml(ssml))
+os.system(voice_vits(text,id=0, format="wav", max=0))
+# voice_dimensional_emotion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav")
+t2 = time.time()
+print(f"len:{len(text)}耗时:{t2 - t1}")

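request.py builds multipart bodies explicitly with requests_toolbelt's MultipartEncoder and a custom boundary, which also lets large uploads stream instead of being loaded into memory. For simple text-only calls, plain requests form encoding works against the same endpoint; an alternative sketch, not the repo's approach:

import requests

# The /voice endpoint reads request.form, so an ordinary urlencoded
# form body is also accepted for text-only parameters.
res = requests.post("http://127.0.0.1:23456/voice",
                    data={"text": "你好", "id": "0", "format": "wav"})
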
test.py
ADDED

@@ -0,0 +1,11 @@
+import numpy as np
+from io import BytesIO
+
+array = np.array([1, 2, 3])
+
+npy = BytesIO()
+np.save(npy,array)
+npy.seek(0)
+tmp = np.load("H:\git/vits-simple-api\Model/npy/25ecb3f6-f968-11ed-b094-e0d4e84af078.npy")
+print(tmp)
+

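test.py rehearses the in-memory .npy round-trip that /voice/dimension-emotion performs server-side (the hard-coded H:\ path is the author's local test file). Reading the array a client gets back from that endpoint might look like this, assuming a local server and a sample.wav on hand:

import numpy as np
import requests
from io import BytesIO

# POST an audio file and decode the returned .npy without touching disk
with open("sample.wav", "rb") as f:  # assumed sample file
    res = requests.post("http://127.0.0.1:23456/voice/dimension-emotion",
                        files={"upload": f})
emotion = np.load(BytesIO(res.content))
print(emotion.shape)
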
text/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-310.pyc and b/text/__pycache__/__init__.cpython-310.pyc differ

text/__pycache__/cantonese.cpython-310.pyc
ADDED
Binary file (2.34 kB).

text/__pycache__/cleaners.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/cleaners.cpython-310.pyc and b/text/__pycache__/cleaners.cpython-310.pyc differ

text/__pycache__/english.cpython-310.pyc
ADDED
Binary file (4.69 kB).

text/__pycache__/japanese.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-310.pyc and b/text/__pycache__/japanese.cpython-310.pyc differ

text/__pycache__/korean.cpython-310.pyc
ADDED
Binary file (5.58 kB).

text/__pycache__/mandarin.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/mandarin.cpython-310.pyc and b/text/__pycache__/mandarin.cpython-310.pyc differ

text/__pycache__/ngu_dialect.cpython-310.pyc
ADDED
Binary file (1.17 kB).

text/__pycache__/shanghainese.cpython-310.pyc
ADDED
Binary file (2.51 kB).

text/cantonese.py
CHANGED

@@ -1,9 +1,9 @@
 import re
 import cn2an
 import opencc
+import config
 
-converter = opencc.OpenCC('jyutjyu')
+converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/jyutjyu_2')
 
 # List of (Latin alphabet, ipa) pairs:
 _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
@@ -35,6 +35,16 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
     ('Z', 'iː˨sɛːt̚˥')
 ]]
 
+_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
+    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+]]
+
+
+def symbols_to_chinese(text):
+    for regex, replacement in _symbols_to_chinese:
+        text = re.sub(regex, replacement, text)
+    return text
+
 
 def number_to_cantonese(text):
     return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
@@ -47,9 +57,10 @@ def latin_to_ipa(text):
 
 
 def cantonese_to_ipa(text):
+    text = symbols_to_chinese(text)
     text = number_to_cantonese(text.upper())
-    text = converter.convert(text).replace('-','').replace('$',' ')
-    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
+    text = converter.convert(text).replace('-', '').replace('$', ' ')
+    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
     text = re.sub(r'[、;:]', ',', text)
     text = re.sub(r'\s*,\s*', ', ', text)
     text = re.sub(r'\s*。\s*', '. ', text)
text/cleaners.py
CHANGED

@@ -1,10 +1,77 @@
 import re
+import config
+from unidecode import unidecode
+from phonemizer import phonemize
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+
+ESPEAK_LIBRARY = getattr(config, "ESPEAK_LIBRARY", "")
+if ESPEAK_LIBRARY != "":
+    EspeakWrapper.set_library(ESPEAK_LIBRARY)
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = unidecode(text)
+    text = text.lower()
+    text = re.sub(r'\s+', ' ', text)
+    text = expand_abbreviations(text)
+    return text
+
+
+# for English text
+def english_cleaners(text):
+    '''Pipeline for English text, including abbreviation expansion.'''
+    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
+    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
+    return phonemes
+
+
+# for non-English text that can be transliterated to ASCII
+def english_cleaners2(text):
+    '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
+    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
+    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True,
+                         with_stress=True)
+    return phonemes
 
 
 def japanese_cleaners(text):
     from text.japanese import japanese_to_romaji_with_accent
+
+    def clean(text):
+        text = japanese_to_romaji_with_accent(text)
+        text = re.sub(r'([A-Za-z])$', r'\1.', text)
+        return text
+
+    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
 
 
@@ -15,20 +82,31 @@ def japanese_cleaners2(text):
 def korean_cleaners(text):
     '''Pipeline for Korean text'''
     from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
+
+    def clean(text):
+        text = latin_to_hangul(text)
+        text = number_to_hangul(text)
+        text = divide_hangul(text)
+        text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
+        return text
+
+    text = re.sub(r'\[KO\](.*?)\[KO\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
 
 
 def chinese_cleaners(text):
     '''Pipeline for Chinese text'''
-    from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
+    from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, symbols_to_chinese
+
+    def clean(text):
+        text = symbols_to_chinese(text)
+        text = number_to_chinese(text)
+        text = chinese_to_bopomofo(text)
+        text = latin_to_bopomofo(text)
+        text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
+        return text
+
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
 
 
@@ -36,9 +114,9 @@ def zh_ja_mixture_cleaners(text):
     from text.mandarin import chinese_to_romaji
     from text.japanese import japanese_to_romaji_with_accent
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_romaji(x.group(1))+' ', text)
+                  lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
-        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
+        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -57,15 +135,15 @@ def cjks_cleaners(text):
     from text.sanskrit import devanagari_to_ipa
     from text.english import english_to_lazy_ipa
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
+                  lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[SA\](.*?)\[SA\]',
-                  lambda x: devanagari_to_ipa(x.group(1))+' ', text)
+                  lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -77,13 +155,13 @@ def cjke_cleaners(text):
     from text.korean import korean_to_ipa
     from text.english import english_to_ipa2
     text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
-        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
+        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
-        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
+        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
-        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
+        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -95,13 +173,28 @@ def cjke_cleaners2(text):
     from text.korean import korean_to_ipa
     from text.english import english_to_ipa2
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
+                  lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
+                  lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_ipa(x.group(1)) + ' ', text)
+    text = re.sub(r'\[EN\](.*?)\[EN\]',
+                  lambda x: english_to_ipa2(x.group(1)) + ' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
+    return text
+
+
+def cje_cleaners(text):
+    from text.mandarin import chinese_to_ipa
+    from text.japanese import japanese_to_ipa2
+    from text.english import english_to_ipa2
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
+                  lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]',
+                  lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_ipa2(x.group(1))+' ', text)
+                  lambda x: english_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -109,15 +202,25 @@ def cjke_cleaners2(text):
 
 def thai_cleaners(text):
     from text.thai import num_to_thai, latin_to_thai
+
+    def clean(text):
+        text = num_to_thai(text)
+        text = latin_to_thai(text)
+        return text
+
+    text = re.sub(r'\[TH\](.*?)\[TH\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
 
 
 def shanghainese_cleaners(text):
     from text.shanghainese import shanghainese_to_ipa
     return text
@@ -129,17 +232,18 @@ def chinese_dialect_cleaners(text):
     from text.english import english_to_lazy_ipa2
     from text.ngu_dialect import ngu_dialect_to_ipa
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
+                  lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
+                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
     text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
     text = re.sub(r'\[GD\](.*?)\[GD\]',
-                  lambda x: cantonese_to_ipa(x.group(1))+' ', text)
+                  lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
+                  lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
-        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
+        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
|
| 216 |
from text.shanghainese import shanghainese_to_ipa
|
| 217 |
+
|
| 218 |
+
def clean(text):
|
| 219 |
+
text = shanghainese_to_ipa(text)
|
| 220 |
+
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 221 |
+
return text
|
| 222 |
+
|
| 223 |
+
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: clean(x.group(1)) + ' ', text)
|
| 224 |
return text
|
| 225 |
|
| 226 |
|
|
|
|
| 232 |
from text.english import english_to_lazy_ipa2
|
| 233 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
| 234 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
| 235 |
+
lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
|
| 236 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
| 237 |
+
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
|
| 238 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
| 239 |
+
'˧˧˦').replace(
|
| 240 |
+
'6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
|
| 241 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
| 242 |
+
lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
|
| 243 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
| 244 |
+
lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
|
| 245 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
| 246 |
+
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
|
| 247 |
text = re.sub(r'\s+$', '', text)
|
| 248 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 249 |
return text
|
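All of these cleaners share one convention: the input text carries paired language markers such as [ZH]...[ZH] or [EN]...[EN], and each cleaner substitutes only the spans tagged for the languages it handles. A minimal, self-contained sketch of that substitution pattern; demo_clean and its upper-casing stand-in are illustrative only, not part of this commit:

import re

def demo_clean(text):
    # mirrors the re.sub(r'\[EN\](.*?)\[EN\]', ...) calls above: each tagged
    # span is matched non-greedily and replaced by its processed form plus a
    # trailing space, leaving spans in other languages untouched
    return re.sub(r'\[EN\](.*?)\[EN\]', lambda x: x.group(1).upper() + ' ', text)

print(demo_clean("[ZH]你好[ZH][EN]hello[EN]"))  # -> "[ZH]你好[ZH]HELLO "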
text/mandarin.py
CHANGED
@@ -7,10 +7,9 @@ import cn2an
 import logging
 
 logging.getLogger('jieba').setLevel(logging.WARNING)
-jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0]))+'/jieba/dict.txt')
+jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0])) + '/jieba/dict.txt')
 jieba.initialize()
 
-
 # List of (Latin alphabet, bopomofo) pairs:
 _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
     ('a', 'ㄟˉ'),
@@ -236,9 +235,19 @@ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
     ('—', '-')
 ]]
 
+_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
+    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+]]
+
+
+def symbols_to_chinese(text):
+    for regex, replacement in _symbols_to_chinese:
+        text = re.sub(regex, replacement, text)
+    return text
+
 
 def number_to_chinese(text):
-    numbers = re.findall(r'
+    numbers = re.findall(r'[0-9]+(?:\.?[0-9]+)?', text)
     for number in numbers:
         text = text.replace(number, cn2an.an2cn(number), 1)
     return text
@@ -286,6 +295,7 @@ def bopomofo_to_ipa2(text):
 
 
 def chinese_to_romaji(text):
+    text = symbols_to_chinese(text)
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
@@ -306,6 +316,7 @@ def chinese_to_lazy_ipa(text):
 
 
 def chinese_to_ipa(text):
+    text = symbols_to_chinese(text)
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
@@ -319,6 +330,7 @@ def chinese_to_ipa(text):
 
 
 def chinese_to_ipa2(text):
+    text = symbols_to_chinese(text)
    text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
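The new symbols_to_chinese pass runs before number_to_chinese, so a percentage is first rewritten as 百分之 plus the raw digits, and the digits are then spelled out. A standalone sketch of the same two-step rewrite, reusing the cn2an dependency the module already imports:

import re
import cn2an  # same dependency the module above uses

_symbols_to_chinese = [(re.compile(r'([0-9]+(?:\.?[0-9]+)?)%'), r'百分之\1')]

def demo(text):
    # step 1: "25%" -> "百分之25"
    for regex, replacement in _symbols_to_chinese:
        text = re.sub(regex, replacement, text)
    # step 2: "百分之25" -> "百分之二十五" (digits spelled out, as in number_to_chinese)
    for number in re.findall(r'[0-9]+(?:\.?[0-9]+)?', text):
        text = text.replace(number, cn2an.an2cn(number), 1)
    return text

print(demo("涨了25%"))  # -> 涨了百分之二十五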
text/shanghainese.py
CHANGED
@@ -1,9 +1,9 @@
 import re
 import cn2an
 import opencc
+import config
 
-
-converter = opencc.OpenCC('zaonhe')
+converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/zaonhe')
 
 # List of (Latin alphabet, ipa) pairs:
 _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
@@ -35,9 +35,19 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
     ('Z', 'zᴇ')
 ]]
 
+_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
+    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+]]
+
+
+def symbols_to_chinese(text):
+    for regex, replacement in _symbols_to_chinese:
+        text = re.sub(regex, replacement, text)
+    return text
+
 
 def _number_to_shanghainese(num):
-    num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
+    num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
     return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
 
 
@@ -52,9 +62,10 @@ def latin_to_ipa(text):
 
 
 def shanghainese_to_ipa(text):
+    text = symbols_to_chinese(text)
     text = number_to_shanghainese(text.upper())
-    text = converter.convert(text).replace('-','').replace('$',' ')
-    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
+    text = converter.convert(text).replace('-', '').replace('$', ' ')
+    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
     text = re.sub(r'[、;:]', ',', text)
     text = re.sub(r'\s*,\s*', ', ', text)
     text = re.sub(r'\s*。\s*', '. ', text)
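_number_to_shanghainese leans on cn2an and then patches the result for Shanghainese usage: 一十 becomes 十, 二十 becomes 廿, every remaining 二 becomes 两, and the final regex restores 二 directly after 十 or 廿 (twelve should read 十二, not 十两). A quick check of that behaviour, assuming cn2an is installed:

import re
import cn2an

def number_to_shanghainese_demo(num):
    # mirror of _number_to_shanghainese above
    num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
    return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)

print(number_to_shanghainese_demo('12'))  # -> 十二 (not 十两)
print(number_to_shanghainese_demo('22'))  # -> 廿二
print(number_to_shanghainese_demo('2'))   # -> 两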
utils/__pycache__/merge.cpython-310.pyc
ADDED
Binary file (3.95 kB)

utils/__pycache__/nlp.cpython-310.pyc
ADDED
Binary file (2.41 kB)

utils/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (4.02 kB)
utils/merge.py
ADDED
@@ -0,0 +1,161 @@
import os
import json
import logging
import config
import numpy as np
from utils.utils import check_is_none
from voice import vits, TTS

lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
}


def analysis(model_config_json):
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    emotion_embedding = model_config.get("data").get("emotion_embedding", False)
    if symbols != None:
        if not emotion_embedding:
            mode_type = "vits"
        else:
            mode_type = "w2v2"
    else:
        mode_type = "hubert-soft"
    return mode_type


def load_npy(model_):
    if isinstance(model_, list):
        # check if is .npy
        for i in model_:
            _model_extention = os.path.splitext(i)[1]
            if _model_extention != ".npy":
                raise ValueError(f"Unsupported model type: {_model_extention}")

        # merge npy files
        emotion_reference = np.empty((0, 1024))
        for i in model_:
            tmp = np.load(i).reshape(-1, 1024)
            emotion_reference = np.append(emotion_reference, tmp, axis=0)

    elif os.path.isdir(model_):
        emotion_reference = np.empty((0, 1024))
        for root, dirs, files in os.walk(model_):
            for file_name in files:
                # check if is .npy
                _model_extention = os.path.splitext(file_name)[1]
                if _model_extention != ".npy":
                    continue
                file_path = os.path.join(root, file_name)

                # merge npy files
                tmp = np.load(file_path).reshape(-1, 1024)
                emotion_reference = np.append(emotion_reference, tmp, axis=0)

    elif os.path.isfile(model_):
        # check if is .npy
        _model_extention = os.path.splitext(model_)[1]
        if _model_extention != ".npy":
            raise ValueError(f"Unsupported model type: {_model_extention}")

        emotion_reference = np.load(model_)
    logging.info(f"Loaded emotional dimension npy range:{len(emotion_reference)}")
    return emotion_reference


def merge_model(merging_model):
    vits_obj = []
    vits_speakers = []
    hubert_vits_obj = []
    hubert_vits_speakers = []
    w2v2_vits_obj = []
    w2v2_vits_speakers = []

    # model list
    vits_list = []
    hubert_vits_list = []
    w2v2_vits_list = []

    for l in merging_model:
        with open(l[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
            if model_type == "vits":
                vits_list.append(l)
            elif model_type == "hubert":
                hubert_vits_list.append(l)
            elif model_type == "w2v2":
                w2v2_vits_list.append(l)

    # merge vits
    new_id = 0
    for obj_id, i in enumerate(vits_list):
        obj = vits(model=i[0], config=i[1], model_type="vits")
        lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

        for id, name in enumerate(obj.return_speakers()):
            vits_obj.append([int(id), obj, obj_id])
            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1

    # merge hubert-vits
    if len(hubert_vits_list) != 0:
        if getattr(config, "HUBERT_SOFT_MODEL", None) == None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError(f"Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            from hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}")

        new_id = 0
        for obj_id, i in enumerate(hubert_vits_list):
            obj = vits(model=i[0], config=i[1], model_=hubert, model_type="hubert")
            lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

            for id, name in enumerate(obj.return_speakers()):
                hubert_vits_obj.append([int(id), obj, obj_id])
                hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    # merge w2v2-vits
    if len(w2v2_vits_list) != 0:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}")

        new_id = 0
        for obj_id, i in enumerate(w2v2_vits_list):
            obj = vits(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2")
            lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

            for id, name in enumerate(obj.return_speakers()):
                w2v2_vits_obj.append([int(id), obj, obj_id])
                w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
    voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}

    tts = TTS(voice_obj, voice_speakers)

    return tts
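merge_model takes a list of [model_path, config_path] pairs, classifies each config via analysis, and wraps everything in a single TTS object whose speaker ids are renumbered globally across models. A hedged sketch of the expected call site; the paths below are placeholders for illustration, not files shipped in this commit:

# Hypothetical call site; in this project the pairs normally come from the
# project configuration rather than being written out by hand.
from utils.merge import merge_model

models = [
    ["Model/vits/G_1000.pth", "Model/vits/config.json"],
    ["Model/w2v2/G_2000.pth", "Model/w2v2/config.json"],
]
tts = merge_model(models)
print(tts.speakers_count)          # total speakers across all loaded models
print(tts.voice_speakers["VITS"])  # [{"id": 0, "name": ..., "lang": [...]}, ...]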
utils/nlp.py
ADDED
@@ -0,0 +1,82 @@
import regex as re
import logging
import config
from fastlid import fastlid
from .utils import check_is_none

logger = logging.getLogger("vits-simple-api")
level = getattr(config, "LOGGING_LEVEL", "DEBUG")
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
              'CRITICAL': logging.CRITICAL}
logger.setLevel(level_dict[level])


def clasify_lang(text):
    pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
              r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
              r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
    words = re.split(pattern, text)

    pre = ""
    p = 0
    for word in words:

        if check_is_none(word): continue
        lang = fastlid(word)[0]
        if pre == "":
            text = text[:p] + text[p:].replace(word, f'[{lang.upper()}]' + word, 1)
            p += len(f'[{lang.upper()}]')
        elif pre != lang:
            text = text[:p] + text[p:].replace(word, f'[{pre.upper()}][{lang.upper()}]' + word, 1)
            p += len(f'[{pre.upper()}][{lang.upper()}]')
        pre = lang
        p += text[p:].index(word) + len(word)
    text += f"[{pre.upper()}]"

    return text


def cut(text, max):
    pattern = r'[\!\(\)\,\-\.\/\:\;\?\?\。\,\、\;\:]+'
    sentences = re.split(pattern, text)
    sentence_list = []
    count = 0
    p = 0
    for sentence in sentences:
        count += len(sentence) + 1
        if count >= max:
            sentence_list.append(text[p:p + count])
            p += count
            count = 0
    if p < len(text):
        sentence_list.append(text[p:])
    return sentence_list


def sentence_split(text, max=50, lang="auto", speaker_lang=None):
    # if this speaker only supports a single language
    if speaker_lang is not None and len(speaker_lang) == 1:
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logger.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
            lang = speaker_lang[0]
    else:
        fastlid.set_languages = speaker_lang

    sentence_list = []
    if lang.upper() != "MIX":
        if max <= 0:
            sentence_list.append(
                clasify_lang(text) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
        else:
            for i in cut(text, max):
                if check_is_none(i): continue
                sentence_list.append(
                    clasify_lang(i) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
    else:
        sentence_list.append(text)

    for i in sentence_list:
        logger.debug(i)

    return sentence_list
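sentence_split is the glue between raw request text and the tagged format the cleaners consume: cut splits on punctuation once a segment exceeds max characters, and clasify_lang wraps each run of one language in [XX]...[XX] markers using fastlid. A small sketch of the expected shapes; the tags produced for lang="auto" depend on fastlid's predictions:

from utils.nlp import sentence_split

# lang="auto": each segment comes back wrapped in detected-language tags,
# e.g. "[ZH]你好[ZH][EN]hello[EN]" (exact tags depend on fastlid)
segments = sentence_split("你好,hello world。", max=50, lang="auto")

# lang="zh": no detection; the whole segment is wrapped verbatim
segments = sentence_split("你好。", max=50, lang="zh")
# -> ["[ZH]你好。[ZH]"]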
utils/utils.py
ADDED
@@ -0,0 +1,112 @@
import logging
import os
from json import loads
import av
from torch import load, FloatTensor
from numpy import float32
import librosa


class HParams():
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()


def load_checkpoint(checkpoint_path, model):
    checkpoint_dict = load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except:
            logging.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logging.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return


def get_hparams_from_file(config_path):
    with open(config_path, 'r', encoding='utf-8') as f:
        data = f.read()
    config = loads(data)

    hparams = HParams(**config)
    return hparams


def load_audio_to_torch(full_path, target_sampling_rate):
    audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True)
    return FloatTensor(audio.astype(float32))


def wav2ogg(input, output):
    with av.open(input, 'rb') as i:
        with av.open(output, 'wb', format='ogg') as o:
            out_stream = o.add_stream('libvorbis')
            for frame in i.decode(audio=0):
                for p in out_stream.encode(frame):
                    o.mux(p)

            for p in out_stream.encode(None):
                o.mux(p)


def wav2mp3(input, output):
    with av.open(input, 'rb') as i:
        with av.open(output, 'wb', format='mp3') as o:
            out_stream = o.add_stream('mp3')
            for frame in i.decode(audio=0):
                for p in out_stream.encode(frame):
                    o.mux(p)

            for p in out_stream.encode(None):
                o.mux(p)


def clean_folder(folder_path):
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # delete it if it is a regular file
        if os.path.isfile(file_path):
            os.remove(file_path)


# is none -> True, is not none -> False
def check_is_none(s):
    return s is None or (isinstance(s, str) and str(s).isspace()) or str(s) == ""
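HParams is what makes hps.data.sampling_rate-style attribute access work on the model's JSON config: nested dicts are converted recursively into nested HParams. A quick illustration:

from utils.utils import HParams

hps = HParams(**{"data": {"sampling_rate": 22050}, "model": {"hidden_channels": 192}})
print(hps.data.sampling_rate)  # 22050; nested dicts become nested HParams
print("model" in hps)          # True; __contains__ checks the wrapped dict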
vits-simple-api-installer-latest.sh
ADDED
@@ -0,0 +1,27 @@
INSTALL_DIR=/usr/local/vits-simple-api

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
PLAIN='\033[0m'

mkdir -p $INSTALL_DIR
cd $INSTALL_DIR
if [ ! -f config.py ]; then
  echo -e "${YELLOW}download config.py\n${PLAIN}"
  wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
fi

wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml

echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"

docker compose pull
docker compose up -d

echo -e "\nThe upgrade or installation has been completed."
echo -e "The configuration file directory is $(realpath $INSTALL_DIR)"
echo -e "${YELLOW}If the vits model is not imported, it cannot be used. Import the model in the configuration file directory.${PLAIN}"
echo -e "After modifying the configuration file, restart the docker container for the modification to take effect."
echo -e "${YELLOW}If you have any questions, please put them in the issues.${PLAIN}"
echo -e "https://github.com/Artrajz/vits-simple-api"
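Nothing in the script is interactive: it writes under /usr/local (so it is meant to run with root privileges), downloads config.py only on first install, always refreshes docker-compose.yaml, and can therefore be re-run as an upgrade path. As the closing messages point out, models still have to be placed into the install directory by hand, and the container must be restarted after config changes.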
voice.py
CHANGED
@@ -1,32 +1,30 @@
 import os
-
 import librosa
-from scipy.io.wavfile import write
-from mel_processing import spectrogram_torch
-from text import text_to_sequence, _clean_text
-from models import SynthesizerTrn
-import utils
 import commons
 import sys
 import re
 import numpy as np
-
-
+import torch
+import xml.etree.ElementTree as ET
+import config
+import logging
 from torch import no_grad, LongTensor, inference_mode, FloatTensor
-import audonnx
-import uuid
 from io import BytesIO
+from graiax import silkcoder
+from utils.nlp import cut, sentence_split
+from scipy.io.wavfile import write
+from mel_processing import spectrogram_torch
+from text import text_to_sequence, _clean_text
+from models import SynthesizerTrn
+from utils import utils
 
+# torch.set_num_threads(1)  # set the number of torch threads to 1
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-class Voice:
-    def __init__(self, model, config, out_path=None):
-        self.out_path = out_path
-        if not os.path.exists(self.out_path):
-            try:
-                os.mkdir(self.out_path)
-            except:
-                pass
+
+class vits:
+    def __init__(self, model, config, model_=None, model_type=None):
+        self.model_type = model_type
         self.hps_ms = utils.get_hparams_from_file(config)
         self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
         self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
@@ -42,9 +40,19 @@ class Voice:
                                     emotion_embedding=self.emotion_embedding,
                                     **self.hps_ms.model)
         _ = self.net_g_ms.eval()
+
+        # load model
+        self.load_model(model, model_)
+
+    def load_model(self, model, model_=None):
         utils.load_checkpoint(model, self.net_g_ms)
+        self.net_g_ms.to(device)
+        if self.model_type == "hubert":
+            self.hubert = model_
+        elif self.model_type == "w2v2":
+            self.emotion_reference = model_
 
-    def get_text(self, text, hps, cleaned=False):
+    def get_cleaned_text(self, text, hps, cleaned=False):
         if cleaned:
             text_norm = text_to_sequence(text, hps.symbols, [])
         else:
@@ -54,7 +62,7 @@ class Voice:
         text_norm = LongTensor(text_norm)
         return text_norm
 
-    def get_label_value(self,
+    def get_label_value(self, label, default, warning_name='value', text=""):
         value = re.search(rf'\[{label}=(.+?)\]', text)
         if value:
             try:
@@ -65,16 +73,10 @@ class Voice:
                 sys.exit(1)
         else:
             value = default
-
-
-    def ex_return(self, text, escape=False):
-        if escape:
-            return text.encode('unicode_escape').decode()
-        else:
-            return text
-
-    def return_speakers(self, escape=False):
-        return self.speakers
+        if text == "":
+            return value
+        else:
+            return value, text
 
     def get_label(self, text, label):
         if f'[{label}]' in text:
@@ -82,132 +84,152 @@ class Voice:
         else:
             return False, text
 
-    def
...
-            x_tst_lengths = LongTensor([stn_tst.size(0)])
-            sid = LongTensor([speaker_id])
-            audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
-                                        noise_scale=noise_scale,
-                                        noise_scale_w=noise_scale_w,
-                                        length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
 
         # else:
...
-        #     text, 'NOISE', 0.667, 'noise scale')
-        # noise_scale_w, text = self.get_label_value(
-        #     text, 'NOISEW', 0.8, 'deviation of noise')
-        # cleaned, text = self.get_label(text, 'CLEANED')
-        #
-        # stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
-        #
-        # emotion_reference = input('Path of an emotion reference: ')
-        # if emotion_reference.endswith('.npy'):
-        #     emotion = np.load(emotion_reference)
-        #     emotion = FloatTensor(emotion).unsqueeze(0)
-        # else:
-        #     audio16000, sampling_rate = librosa.load(
-        #         emotion_reference, sr=16000, mono=True)
-        #     emotion = w2v2_model(audio16000, sampling_rate)[
-        #         'hidden_states']
-        #     emotion_reference = re.sub(
-        #         r'\..*$', '', emotion_reference)
-        #     np.save(emotion_reference, emotion.squeeze(0))
-        #     emotion = FloatTensor(emotion)
-        #
-        #
-        # with no_grad():
-        #     x_tst = stn_tst.unsqueeze(0)
-        #     x_tst_lengths = LongTensor([stn_tst.size(0)])
-        #     sid = LongTensor([speaker_id])
-        #     audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
-        #                                 noise_scale_w=noise_scale_w,
-        #                                 length_scale=length_scale, emotion_embedding=emotion)[0][
-        #         0, 0].data.cpu().float().numpy()
 
-        # else:
-        #     model = input('Path of a hubert-soft Model: ')
-        #     from hubert_model import hubert_soft
-        #     hubert = hubert_soft(model)
 
-        # if audio_path != '[VC]':
-        #     if self.use_f0:
-        #         audio, sampling_rate = librosa.load(
-        #             audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
-        #         audio16000 = librosa.resample(
-        #             audio, orig_sr=sampling_rate, target_sr=16000)
-        #     else:
-        #         audio16000, sampling_rate = librosa.load(
-        #             audio_path, sr=16000, mono=True)
-        #
-        #     out_path = "H:/git/MoeGoe-Simple-API/upload/hubert.wav"
-        #     length_scale, out_path = self.get_label_value(
-        #         out_path, 'LENGTH', 1, 'length scale')
-        #     noise_scale, out_path = self.get_label_value(
-        #         out_path, 'NOISE', 0.1, 'noise scale')
-        #     noise_scale_w, out_path = self.get_label_value(
-        #         out_path, 'NOISEW', 0.1, 'deviation of noise')
-        #
-        #     with inference_mode():
-        #         units = hubert.units(FloatTensor(audio16000).unsqueeze(
-        #             0).unsqueeze(0)).squeeze(0).numpy()
-        #         if self.use_f0:
-        #             f0_scale, out_path = self.get_label_value(
-        #                 out_path, 'F0', 1, 'f0 scale')
-        #             f0 = librosa.pyin(audio, sr=sampling_rate,
-        #                               fmin=librosa.note_to_hz('C0'),
-        #                               fmax=librosa.note_to_hz('C7'),
-        #                               frame_length=1780)[0]
-        #             target_length = len(units[:, 0])
-        #             f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
-        #                                          np.arange(0, len(f0)), f0)) * f0_scale
-        #             units[:, 0] = f0 / 10
-        #
-        #     stn_tst = FloatTensor(units)
-        #     with no_grad():
-        #         x_tst = stn_tst.unsqueeze(0)
-        #         x_tst_lengths = LongTensor([stn_tst.size(0)])
-        #         sid = LongTensor([target_id])
-        #         audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
-        #                                     noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
-        #             0, 0].data.float().numpy()
 
-        with BytesIO() as f:
-            fname = str(uuid.uuid1())
...
-            file_path = self.out_path + "/" + fname + ".wav"
-            write(file_path, 24000, audio)
-            silk_path = utils.convert_to_silk(file_path)
-            os.remove(file_path)
-            return silk_path, "audio/silk", fname + ".silk"
         else:
...
+    def get_cleaner(self):
+        return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
+
+    def return_speakers(self, escape=False):
+        return self.speakers
+
+    def infer(self, params):
+        emotion = params.get("emotion", None)
+
+        with no_grad():
+            x_tst = params.get("stn_tst").unsqueeze(0)
+            x_tst_lengths = LongTensor([params.get("stn_tst").size(0)])
+
+            audio = self.net_g_ms.infer(x_tst.to(device), x_tst_lengths.to(device), sid=params.get("sid").to(device),
+                                        noise_scale=params.get("noise_scale"),
+                                        noise_scale_w=params.get("noise_scale_w"),
+                                        length_scale=params.get("length_scale"),
+                                        emotion_embedding=emotion.to(device) if emotion != None else None)[0][
+                0, 0].data.float().cpu().numpy()
+
+        torch.cuda.empty_cache()
+        return audio
+
+    def get_infer_param(self, length, noise, noisew, text=None, speaker_id=None, audio_path=None,
+                        emotion=None):
+        emo = None
+        if self.model_type != "hubert":
+            length_scale, text = self.get_label_value('LENGTH', length, 'length scale', text)
+            noise_scale, text = self.get_label_value('NOISE', noise, 'noise scale', text)
+            noise_scale_w, text = self.get_label_value('NOISEW', noisew, 'deviation of noise', text)
+            cleaned, text = self.get_label(text, 'CLEANED')
+
+            stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
+            sid = LongTensor([speaker_id])
+
+        if self.model_type == "w2v2":
+            # if emotion_reference.endswith('.npy'):
+            #     emotion = np.load(emotion_reference)
+            #     emotion = FloatTensor(emotion).unsqueeze(0)
+            # else:
+            #     audio16000, sampling_rate = librosa.load(
+            #         emotion_reference, sr=16000, mono=True)
+            #     emotion = self.w2v2(audio16000, sampling_rate)[
+            #         'hidden_states']
+            #     emotion_reference = re.sub(
+            #         r'\..*$', '', emotion_reference)
+            #     np.save(emotion_reference, emotion.squeeze(0))
+            #     emotion = FloatTensor(emotion)
+            emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)
+
+        elif self.model_type == "hubert":
+            if self.use_f0:
+                audio, sampling_rate = librosa.load(
+                    audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
+                audio16000 = librosa.resample(
+                    audio, orig_sr=sampling_rate, target_sr=16000)
+            else:
+                audio16000, sampling_rate = librosa.load(
+                    audio_path, sr=16000, mono=True)
+
+            length_scale = self.get_label_value('LENGTH', length, 'length scale')
+            noise_scale = self.get_label_value('NOISE', noise, 'noise scale')
+            noise_scale_w = self.get_label_value('NOISEW', noisew, 'deviation of noise')
+
+            with inference_mode():
+                units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
+                if self.use_f0:
+                    f0_scale = self.get_label_value('F0', 1, 'f0 scale')
+                    f0 = librosa.pyin(audio,
+                                      sr=sampling_rate,
+                                      fmin=librosa.note_to_hz('C0'),
+                                      fmax=librosa.note_to_hz('C7'),
+                                      frame_length=1780)[0]
+                    target_length = len(units[:, 0])
+                    f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
+                                                 np.arange(0, len(f0)), f0)) * f0_scale
+                    units[:, 0] = f0 / 10
+
+            stn_tst = FloatTensor(units)
+            sid = LongTensor([speaker_id])
+
+        params = {"length_scale": length_scale, "noise_scale": noise_scale,
+                  "noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
+                  "sid": sid, "emotion": emo}
+        return params
+
+    def get_audio(self, voice, auto_break=False):
+        text = voice.get("text", None)
+        speaker_id = voice.get("id", 0)
+        length = voice.get("length", 1)
+        noise = voice.get("noise", 0.667)
+        noisew = voice.get("noisew", 0.8)
+        max = voice.get("max", 50)
+        lang = voice.get("lang", "auto")
+        speaker_lang = voice.get("speaker_lang", None)
+        audio_path = voice.get("audio_path", None)
+        emotion = voice.get("emotion", 0)
+
+        # remove all redundant whitespace
+        if text is not None: text = re.sub(r'\s+', ' ', text).strip()
+
+        # pause 0.75 s so that per-segment synthesis does not sound abrupt at the joins
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        tasks = []
+        if self.model_type == "vits":
+            sentence_list = sentence_split(text, max, lang, speaker_lang)
+            for sentence in sentence_list:
+                tasks.append(
+                    self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
+                                         noisew=noisew))
+            audios = []
+
+            for task in tasks:
+                audios.append(self.infer(task))
+                if auto_break:
+                    audios.append(brk)
+
+            audio = np.concatenate(audios, axis=0)
+
+        elif self.model_type == "hubert":
+            params = self.get_infer_param(speaker_id=speaker_id, length=length, noise=noise, noisew=noisew,
+                                          audio_path=audio_path)
+            audio = self.infer(params)
+
+        elif self.model_type == "w2v2":
+            sentence_list = sentence_split(text, max, lang, speaker_lang)
+            for sentence in sentence_list:
+                tasks.append(
+                    self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
+                                         noisew=noisew, emotion=emotion))
+
+            audios = []
+            for task in tasks:
+                audios.append(self.infer(task))
+                if auto_break:
+                    audios.append(brk)
+
+            audio = np.concatenate(audios, axis=0)
+
+        return audio
+
+    def voice_conversion(self, voice):
+        audio_path = voice.get("audio_path")
+        original_id = voice.get("original_id")
+        target_id = voice.get("target_id")
 
         audio = utils.load_audio_to_torch(
             audio_path, self.hps_ms.data.sampling_rate)
@@ -223,9 +245,242 @@ class Voice:
 
         with no_grad():
             sid_tgt = LongTensor([target_id])
-            audio = self.net_g_ms.voice_conversion(spec,
...
+            audio = self.net_g_ms.voice_conversion(spec.to(device),
+                                                   spec_lengths.to(device),
+                                                   sid_src=sid_src.to(device),
+                                                   sid_tgt=sid_tgt.to(device))[0][0, 0].data.cpu().float().numpy()
+
+        torch.cuda.empty_cache()
 
+        return audio
+
+
+class TTS:
+    def __init__(self, voice_obj, voice_speakers):
+        self._voice_obj = voice_obj
+        self._voice_speakers = voice_speakers
+        self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
+        self._speakers_count = sum([len(self._voice_speakers[i]) for i in self._voice_speakers])
+        self._vits_speakers_count = len(self._voice_speakers["VITS"])
+        self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
+        self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
+        self.dem = None
+        if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) != None:
+            try:
+                import audonnx
+                root = os.path.dirname(config.DIMENSIONAL_EMOTION_MODEL)
+                model_file = config.DIMENSIONAL_EMOTION_MODEL
+                self.dem = audonnx.load(root=root, model_file=model_file)
+            except Exception as e:
+                self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")
+
+        # Initialization information
+        self.logger = logging.getLogger("vits-simple-api")
+        self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
+        self.logger.info(f'device:{device} device.type:{device.type}')
+        if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
+        if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
+        if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
+        self.logger.info(f"{self._speakers_count} speakers in total")
+        if self._speakers_count == 0:
+            self.logger.warning(f"No model was loaded")
+
+    @property
+    def voice_speakers(self):
+        return self._voice_speakers
+
+    @property
+    def speakers_count(self):
+        return self._speakers_count
+
+    @property
+    def vits_speakers_count(self):
+        return self._vits_speakers_count
+
+    @property
+    def hubert_speakers_count(self):
+        return self._hubert_speakers_count
+
+    @property
+    def w2v2_speakers_count(self):
+        return self._w2v2_speakers_count
+
+    def encode(self, sampling_rate, audio, format):
         with BytesIO() as f:
-            write(f,
+            write(f, sampling_rate, audio)
+            if format.upper() == 'OGG':
+                with BytesIO() as o:
+                    utils.wav2ogg(f, o)
+                    return BytesIO(o.getvalue())
+            elif format.upper() == 'SILK':
+                return BytesIO(silkcoder.encode(f))
+            elif format.upper() == 'MP3':
+                with BytesIO() as o:
+                    utils.wav2mp3(f, o)
+                    return BytesIO(o.getvalue())
+            elif format.upper() == 'WAV':
+                return BytesIO(f.getvalue())
+
+    def convert_time_string(self, time_string):
+        time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
+        time_unit = re.findall(r'[a-zA-Z]+', time_string)[0].lower()
+
+        if time_unit.upper() == 'MS':
+            return time_value / 1000
+        elif time_unit.upper() == 'S':
+            return time_value
+        elif time_unit.upper() == 'MIN':
+            return time_value * 60
+        elif time_unit.upper() == 'H':
+            return time_value * 3600
+        elif time_unit.upper() == 'D':
+            return time_value * 24 * 3600  # surely nobody actually writes D?
+        else:
+            raise ValueError("Unsupported time unit: {}".format(time_unit))
+
+    def parse_ssml(self, ssml):
+        root = ET.fromstring(ssml)
+        format = root.attrib.get("format", "wav")
+        voice_tasks = []
+        brk_count = 0
+        strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
+
+        for element in root.iter():
+            if element.tag == "voice":
+                id = int(element.attrib.get("id", root.attrib.get("id", config.ID)))
+                lang = element.attrib.get("lang", root.attrib.get("lang", config.LANG))
+                length = float(element.attrib.get("length", root.attrib.get("length", config.LENGTH)))
+                noise = float(element.attrib.get("noise", root.attrib.get("noise", config.NOISE)))
+                noisew = float(element.attrib.get("noisew", root.attrib.get("noisew", config.NOISEW)))
+                max = int(element.attrib.get("max", root.attrib.get("max", "0")))
+                # defaults to vits when unspecified
+                model = element.attrib.get("model", root.attrib.get("model", "vits"))
+                # only w2v2-vits/emotion-vits use emotion
+                emotion = int(element.attrib.get("emotion", root.attrib.get("emotion", 0)))
+
+                voice_element = ET.tostring(element, encoding='unicode')
+
+                pattern_voice = r'<voice.*?>(.*?)</voice>'
+                pattern_break = r'<break\s*?(.*?)\s*?/>'
+
+                matches_voice = re.findall(pattern_voice, voice_element)[0]
+                matches_break = re.split(pattern_break, matches_voice)
+                for match in matches_break:
+                    strength = re.search(r'\s*strength\s*=\s*[\'\"](.*?)[\'\"]', match)
+                    time = re.search(r'\s*time\s*=\s*[\'\"](.*?)[\'\"]', match)
+                    # <break> tag, strength attribute
+                    if strength:
+                        brk = strength_dict[strength.group(1)]
+                        voice_tasks.append({"break": brk})
+                        brk_count += 1
+                    # <break> tag, time attribute
+                    elif time:
+                        brk = self.convert_time_string(time.group(1))
+                        voice_tasks.append({"break": brk})
+                        brk_count += 1
+                    # an empty match means a bare <break/>, default pause is 0.75 s
+                    elif match == "":
+                        voice_tasks.append({"break": 0.75})
+                        brk_count += 1
+                    # whatever remains inside the voice tag besides break is text
+                    else:
+                        voice_tasks.append({"id": id,
+                                            "text": match,
+                                            "lang": lang,
+                                            "length": length,
+                                            "noise": noise,
+                                            "noisew": noisew,
+                                            "max": max,
+                                            "model": model,
+                                            "emotion": emotion
+                                            })
+
+                # pause 0.75 s at the end of each segment
+                voice_tasks.append({"break": 0.75})
+            elif element.tag == "break":
+                # brk_count > 0 means the break was already consumed inside a voice tag
+                if brk_count > 0:
+                    brk_count -= 1
+                    continue
+                brk = strength_dict.get(element.attrib.get("strength"),
+                                        self.convert_time_string(element.attrib.get("time", "750ms")))
+                voice_tasks.append({"break": brk})
+
+        for i in voice_tasks:
+            self.logger.debug(i)
+
+        return voice_tasks, format
+
+    def create_ssml_infer_task(self, ssml):
+        voice_tasks, format = self.parse_ssml(ssml)
+
+        audios = []
+        for voice in voice_tasks:
+            if voice.get("break"):
+                audios.append(np.zeros(int(voice.get("break") * 22050), dtype=np.int16))
+            else:
+                model = voice.get("model").upper()
+                if model != "VITS" and model != "W2V2-VITS" and model != "EMOTION-VITS":
+                    raise ValueError(f"Unsupported model: {voice.get('model')}")
+                voice_obj = self._voice_obj[model][voice.get("id")][1]
+                voice["id"] = self._voice_obj[model][voice.get("id")][0]
+
+                audios.append(voice_obj.get_audio(voice))
+
+        audio = np.concatenate(audios, axis=0)
+
+        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format), format
+
+    def vits_infer(self, voice):
+        format = voice.get("format", "wav")
+        voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
+        voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        audio = voice_obj.get_audio(voice, auto_break=True)
+
+        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+
+    def hubert_vits_infer(self, voice):
+        format = voice.get("format", "wav")
+        voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
+        voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
+        audio = voice_obj.get_audio(voice)
+
+        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+
+    def w2v2_vits_infer(self, voice):
+        format = voice.get("format", "wav")
+        voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
+        voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
+        audio = voice_obj.get_audio(voice, auto_break=True)
+
+        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+
+    def vits_voice_conversion(self, voice):
+        original_id = voice.get("original_id")
+        target_id = voice.get("target_id")
+        format = voice.get("format")
+
+        original_id_obj = int(self._voice_obj["VITS"][original_id][2])
+        target_id_obj = int(self._voice_obj["VITS"][target_id][2])
+
+        if original_id_obj != target_id_obj:
+            raise ValueError(f"speakers are in different VITS models")
+
+        voice["original_id"] = int(self._voice_obj["VITS"][original_id][0])
+        voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
+
+        voice_obj = self._voice_obj["VITS"][original_id][1]
+        audio = voice_obj.voice_conversion(voice)
+
+        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+
+    def get_dimensional_emotion_npy(self, audio):
+        if self.dem is None:
+            raise ValueError(f"Please configure DIMENSIONAL_EMOTION_MODEL path in config.py")
+        audio16000, sampling_rate = librosa.load(audio, sr=16000, mono=True)
+        emotion = self.dem(audio16000, sampling_rate)['hidden_states']
+        emotion_npy = BytesIO()
+        np.save(emotion_npy, emotion.squeeze(0))
+        emotion_npy.seek(0)
+
+        return emotion_npy
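Taken together, the vits class drives a single model while TTS routes requests across all loaded models by global speaker id. A hedged sketch of both entry points, assuming a tts object built by utils.merge.merge_model as above; the request keys mirror the voice.get(...) defaults in get_audio:

# Plain inference: the dict keys match what get_audio() reads, with its defaults.
task = {"text": "你好,hello!", "id": 0, "format": "wav", "lang": "auto",
        "length": 1, "noise": 0.667, "noisew": 0.8, "max": 50}
wav_bytesio = tts.vits_infer(task)  # returns a BytesIO holding the encoded audio

# SSML inference: <voice> and <break> map onto the parse_ssml() branches above.
ssml = """<speak lang="zh" format="wav" id="0">
    <voice id="0" length="1.2">第一句。</voice>
    <break time="500ms"/>
    <voice id="1">第二句。</voice>
</speak>"""
wav_bytesio, fmt = tts.create_ssml_infer_task(ssml)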