Spaces:
Sleeping
Sleeping
Upload 44 files
Browse files- Dockerfile +8 -8
- Dockerfile_GPU +37 -0
- README_zh.md +16 -14
- app.py +14 -28
- config.py +10 -4
- docker-compose-gpu.yaml +15 -0
- docker-compose.yaml +3 -1
- gunicorn_config.py +4 -0
- logger.py +42 -0
- requirements.txt +2 -1
- static/css/style.css +84 -0
- templates/index.html +267 -121
- text/cleaners.py +15 -0
- text/mandarin.py +2 -3
- utils/merge.py +16 -8
- utils/nlp.py +1 -7
- vits-simple-api-installer-latest.sh +26 -1
- voice.py +14 -15
Dockerfile
CHANGED
|
@@ -6,15 +6,13 @@ WORKDIR /app
|
|
| 6 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
|
| 8 |
RUN apt-get update && \
|
| 9 |
-
apt install build-essential -
|
| 10 |
-
apt install espeak-ng -yq && \
|
| 11 |
-
apt install cmake -yq && \
|
| 12 |
-
apt install -y wget -yq && \
|
| 13 |
apt-get clean && \
|
| 14 |
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
| 15 |
rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
-
RUN pip install
|
|
|
|
| 18 |
|
| 19 |
RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
|
| 20 |
tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
|
|
@@ -25,13 +23,15 @@ RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openj
|
|
| 25 |
rm -f openjtalk-0.3.0.dev2.tar.gz && \
|
| 26 |
rm -rf openjtalk-0.3.0.dev2
|
| 27 |
|
| 28 |
-
RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 29 |
|
| 30 |
COPY requirements.txt /app
|
| 31 |
-
RUN pip install -r requirements.txt
|
|
|
|
|
|
|
| 32 |
|
| 33 |
COPY . /app
|
| 34 |
|
| 35 |
EXPOSE 23456
|
| 36 |
|
| 37 |
-
CMD ["
|
|
|
|
| 6 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
|
| 8 |
RUN apt-get update && \
|
| 9 |
+
apt-get install -yq build-essential espeak-ng cmake wget && \
|
|
|
|
|
|
|
|
|
|
| 10 |
apt-get clean && \
|
| 11 |
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
| 12 |
rm -rf /var/lib/apt/lists/*
|
| 13 |
|
| 14 |
+
RUN pip install --upgrade pip --no-cache-dir && \
|
| 15 |
+
pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
|
| 16 |
|
| 17 |
RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
|
| 18 |
tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
|
|
|
|
| 23 |
rm -f openjtalk-0.3.0.dev2.tar.gz && \
|
| 24 |
rm -rf openjtalk-0.3.0.dev2
|
| 25 |
|
| 26 |
+
RUN pip install torch --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
|
| 27 |
|
| 28 |
COPY requirements.txt /app
|
| 29 |
+
RUN pip install -r requirements.txt --no-cache-dir
|
| 30 |
+
|
| 31 |
+
RUN pip install gunicorn --no-cache-dir
|
| 32 |
|
| 33 |
COPY . /app
|
| 34 |
|
| 35 |
EXPOSE 23456
|
| 36 |
|
| 37 |
+
CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
|
Dockerfile_GPU
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10.11-slim-bullseye
|
| 2 |
+
|
| 3 |
+
RUN mkdir -p /app
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
+
|
| 8 |
+
RUN apt-get update && \
|
| 9 |
+
apt-get install -yq build-essential espeak-ng cmake wget && \
|
| 10 |
+
apt-get clean && \
|
| 11 |
+
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
| 12 |
+
rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
RUN pip install --upgrade pip --no-cache-dir && \
|
| 15 |
+
pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
|
| 16 |
+
|
| 17 |
+
RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
|
| 18 |
+
tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
|
| 19 |
+
cd openjtalk-0.3.0.dev2 && \
|
| 20 |
+
rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
|
| 21 |
+
python setup.py install && \
|
| 22 |
+
cd ../ && \
|
| 23 |
+
rm -f openjtalk-0.3.0.dev2.tar.gz && \
|
| 24 |
+
rm -rf openjtalk-0.3.0.dev2
|
| 25 |
+
|
| 26 |
+
RUN pip install torch --index-url https://download.pytorch.org/whl/cu117 --no-cache-dir
|
| 27 |
+
|
| 28 |
+
COPY requirements.txt /app
|
| 29 |
+
RUN pip install -r requirements.txt --no-cache-dir
|
| 30 |
+
|
| 31 |
+
RUN pip install gunicorn --no-cache-dir
|
| 32 |
+
|
| 33 |
+
COPY . /app
|
| 34 |
+
|
| 35 |
+
EXPOSE 23456
|
| 36 |
+
|
| 37 |
+
CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
|
README_zh.md
CHANGED
|
@@ -63,7 +63,7 @@
|
|
| 63 |
|
| 64 |
|
| 65 |
- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
|
| 66 |
-
- `https://artrajz-vits-simple-api.hf.space/voice/vits?text
|
| 67 |
- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
|
| 68 |
- 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
|
| 69 |
- 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
|
|
@@ -495,14 +495,15 @@ def voice_dimensional_emotion(upload_path):
|
|
| 495 |
|
| 496 |
| Name | Parameter | Is must | Default | Type | Instruction |
|
| 497 |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
|
| 498 |
-
| 合成文本 | text | true | | str |
|
| 499 |
-
| 角色id | id | false | 0 | int |
|
| 500 |
| 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
|
| 501 |
| 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
|
| 502 |
-
| 语音长度/语速 | length | false | 1.0 | float |
|
| 503 |
-
| 噪声 | noise | false | 0.
|
| 504 |
-
|
|
| 505 |
| 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
|
|
|
|
| 506 |
|
| 507 |
## VITS 语音转换
|
| 508 |
|
|
@@ -516,12 +517,12 @@ def voice_dimensional_emotion(upload_path):
|
|
| 516 |
|
| 517 |
| Name | Parameter | Is must | Default | Type | Instruction |
|
| 518 |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
|
| 519 |
-
| 上传音频 | upload | true | | file |
|
| 520 |
-
| 目标角色id | id | true | | int |
|
| 521 |
| 音频格式 | format | true | | str | wav,ogg,silk |
|
| 522 |
| 语音长度/语速 | length | true | | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
|
| 523 |
-
| 噪声 | noise | true | | float |
|
| 524 |
-
|
|
| 525 |
|
| 526 |
## Dimensional emotion
|
| 527 |
|
|
@@ -533,13 +534,13 @@ def voice_dimensional_emotion(upload_path):
|
|
| 533 |
|
| 534 |
| Name | Parameter | Is must | Default | Type | Instruction |
|
| 535 |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
|
| 536 |
-
|
|
| 537 |
-
| 角色id | id | false | 0 | int |
|
| 538 |
| 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
|
| 539 |
| 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
|
| 540 |
| 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
|
| 541 |
-
| 噪声 | noise | false | 0.
|
| 542 |
-
|
|
| 543 |
| 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
|
| 544 |
| 维度情感 | emotion | false | 0 | int | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
|
| 545 |
|
|
@@ -623,4 +624,5 @@ def voice_dimensional_emotion(upload_path):
|
|
| 623 |
- MoeGoe:https://github.com/CjangCjengh/MoeGoe
|
| 624 |
- emotional-vits:https://github.com/innnky/emotional-vits
|
| 625 |
- vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
|
|
|
|
| 626 |
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
|
| 66 |
+
- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=我觉得1%2B1≠3&id=164&lang=zh`(get中一些字符需要转义不然会被过滤掉)
|
| 67 |
- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
|
| 68 |
- 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
|
| 69 |
- 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
|
|
|
|
| 495 |
|
| 496 |
| Name | Parameter | Is must | Default | Type | Instruction |
|
| 497 |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
|
| 498 |
+
| 合成文本 | text | true | | str | 需要合成语音的文本。 |
|
| 499 |
+
| 角色id | id | false | 0 | int | 即说话人id。 |
|
| 500 |
| 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
|
| 501 |
| 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
|
| 502 |
+
| 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢。 |
|
| 503 |
+
| 噪声 | noise | false | 0.33 | float | 样本噪声,控制合成的随机性。 |
|
| 504 |
+
| sdp噪声 | noisew | false | 0.4 | float | 随机时长预测器噪声,控制音素发音长度。 |
|
| 505 |
| 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
|
| 506 |
+
| 流式响应 | streaming | false | false | bool | 流式合成语音,更快的首包响应。 |
|
| 507 |
|
| 508 |
## VITS 语音转换
|
| 509 |
|
|
|
|
| 517 |
|
| 518 |
| Name | Parameter | Is must | Default | Type | Instruction |
|
| 519 |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
|
| 520 |
+
| 上传音频 | upload | true | | file | 需要转换说话人的音频文件。 |
|
| 521 |
+
| 目标角色id | id | true | | int | 目标说话人id。 |
|
| 522 |
| 音频格式 | format | true | | str | wav,ogg,silk |
|
| 523 |
| 语音长度/语速 | length | true | | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
|
| 524 |
+
| 噪声 | noise | true | | float | 样本噪声,控制合成的随机性。 |
|
| 525 |
+
| sdp噪声 | noisew | true | | float | 随机时长预测器噪声,控制音素发音长度。 |
|
| 526 |
|
| 527 |
## Dimensional emotion
|
| 528 |
|
|
|
|
| 534 |
|
| 535 |
| Name | Parameter | Is must | Default | Type | Instruction |
|
| 536 |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
|
| 537 |
+
| 合成文本 | text | true | | str | 需要合成语音的文本。 |
|
| 538 |
+
| 角色id | id | false | 0 | int | 即说话人id。 |
|
| 539 |
| 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
|
| 540 |
| 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
|
| 541 |
| 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
|
| 542 |
+
| 噪声 | noise | false | 0.33 | float | 样本噪声,控制合成的随机性。 |
|
| 543 |
+
| sdp噪声 | noisew | false | 0.4 | float | 随机时长预测器噪声,控制音素发音长度。 |
|
| 544 |
| 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
|
| 545 |
| 维度情感 | emotion | false | 0 | int | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
|
| 546 |
|
|
|
|
| 624 |
- MoeGoe:https://github.com/CjangCjengh/MoeGoe
|
| 625 |
- emotional-vits:https://github.com/innnky/emotional-vits
|
| 626 |
- vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
|
| 627 |
+
- vits_chinese:https://github.com/PlayVoice/vits_chinese
|
| 628 |
|
app.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
-
import logging
|
| 3 |
import time
|
| 4 |
-
import logzero
|
| 5 |
import uuid
|
|
|
|
| 6 |
from flask import Flask, request, send_file, jsonify, make_response, render_template
|
| 7 |
from werkzeug.utils import secure_filename
|
| 8 |
from flask_apscheduler import APScheduler
|
|
@@ -19,24 +18,15 @@ scheduler.init_app(app)
|
|
| 19 |
if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
|
| 20 |
scheduler.start()
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
logging.basicConfig(level=level_dict[level])
|
| 28 |
-
logging.getLogger('numba').setLevel(logging.WARNING)
|
| 29 |
-
logging.getLogger("langid.langid").setLevel(logging.INFO)
|
| 30 |
-
logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
|
| 31 |
|
|
|
|
| 32 |
tts = merge_model(app.config["MODEL_LIST"])
|
| 33 |
|
| 34 |
-
if not os.path.exists(app.config['UPLOAD_FOLDER']):
|
| 35 |
-
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
| 36 |
-
|
| 37 |
-
if not os.path.exists(app.config['CACHE_PATH']):
|
| 38 |
-
os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
|
| 39 |
-
|
| 40 |
|
| 41 |
def require_api_key(func):
|
| 42 |
@wraps(func)
|
|
@@ -57,7 +47,10 @@ def require_api_key(func):
|
|
| 57 |
def index():
|
| 58 |
kwargs = {
|
| 59 |
"speakers": tts.voice_speakers,
|
| 60 |
-
"speakers_count": tts.speakers_count
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
return render_template("index.html", **kwargs)
|
| 63 |
|
|
@@ -362,25 +355,18 @@ def ssml():
|
|
| 362 |
return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
|
| 363 |
|
| 364 |
logger.debug(ssml)
|
| 365 |
-
|
| 366 |
fname = f"{str(uuid.uuid1())}.{format}"
|
| 367 |
file_type = f"audio/{format}"
|
| 368 |
|
| 369 |
t1 = time.time()
|
| 370 |
-
audio
|
| 371 |
t2 = time.time()
|
| 372 |
if app.config.get("SAVE_AUDIO", False):
|
| 373 |
logger.debug(f"[ssml] {fname}")
|
| 374 |
logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
|
| 375 |
|
| 376 |
-
|
| 377 |
-
audio = tts.generate_audio_chunks(audio)
|
| 378 |
-
response = make_response(audio)
|
| 379 |
-
response.headers['Content-Disposition'] = f'attachment; filename={fname}'
|
| 380 |
-
response.headers['Content-Type'] = file_type
|
| 381 |
-
return response
|
| 382 |
-
else:
|
| 383 |
-
return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
|
| 384 |
|
| 385 |
|
| 386 |
@app.route('/voice/dimension-emotion', methods=["POST"])
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import time
|
|
|
|
| 3 |
import uuid
|
| 4 |
+
from logger import logger
|
| 5 |
from flask import Flask, request, send_file, jsonify, make_response, render_template
|
| 6 |
from werkzeug.utils import secure_filename
|
| 7 |
from flask_apscheduler import APScheduler
|
|
|
|
| 18 |
if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
|
| 19 |
scheduler.start()
|
| 20 |
|
| 21 |
+
for path in (app.config['LOGS_PATH'], app.config['UPLOAD_FOLDER'], app.config['CACHE_PATH']):
|
| 22 |
+
try:
|
| 23 |
+
os.makedirs(path, exist_ok=True)
|
| 24 |
+
except Exception as e:
|
| 25 |
+
logger.error(f"Unable to create directory {path}: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
# load model
|
| 28 |
tts = merge_model(app.config["MODEL_LIST"])
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def require_api_key(func):
|
| 32 |
@wraps(func)
|
|
|
|
| 47 |
def index():
|
| 48 |
kwargs = {
|
| 49 |
"speakers": tts.voice_speakers,
|
| 50 |
+
"speakers_count": tts.speakers_count,
|
| 51 |
+
"vits_speakers_count":tts._vits_speakers_count,
|
| 52 |
+
"w2v2_speakers_count":tts._w2v2_speakers_count,
|
| 53 |
+
"w2v2_emotion_count":tts._w2v2_emotion_count
|
| 54 |
}
|
| 55 |
return render_template("index.html", **kwargs)
|
| 56 |
|
|
|
|
| 355 |
return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
|
| 356 |
|
| 357 |
logger.debug(ssml)
|
| 358 |
+
voice_tasks, format = tts.parse_ssml(ssml)
|
| 359 |
fname = f"{str(uuid.uuid1())}.{format}"
|
| 360 |
file_type = f"audio/{format}"
|
| 361 |
|
| 362 |
t1 = time.time()
|
| 363 |
+
audio = tts.create_ssml_infer_task(voice_tasks, format, fname)
|
| 364 |
t2 = time.time()
|
| 365 |
if app.config.get("SAVE_AUDIO", False):
|
| 366 |
logger.debug(f"[ssml] {fname}")
|
| 367 |
logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
|
| 368 |
|
| 369 |
+
return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
|
| 372 |
@app.route('/voice/dimension-emotion', methods=["POST"])
|
config.py
CHANGED
|
@@ -12,7 +12,7 @@ DEBUG = False
|
|
| 12 |
PORT = 7860
|
| 13 |
|
| 14 |
# Absolute path of vits-simple-api
|
| 15 |
-
ABS_PATH = os.path.
|
| 16 |
|
| 17 |
# Upload path
|
| 18 |
UPLOAD_FOLDER = ABS_PATH + "/upload"
|
|
@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
|
|
| 20 |
# Cache path
|
| 21 |
CACHE_PATH = ABS_PATH + "/cache"
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
|
| 24 |
CLEAN_INTERVAL_SECONDS = 3600
|
| 25 |
|
|
@@ -39,7 +45,7 @@ API_KEY = "api-key"
|
|
| 39 |
LOGGING_LEVEL = "DEBUG"
|
| 40 |
|
| 41 |
# Language identification library. Options: fastlid, langid
|
| 42 |
-
LANGUAGE_IDENTIFICATION_LIBRARY = "
|
| 43 |
|
| 44 |
# To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
|
| 45 |
# If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
|
|
@@ -48,7 +54,7 @@ ESPEAK_LIBRARY = ""
|
|
| 48 |
|
| 49 |
# Fill in the model path here
|
| 50 |
MODEL_LIST = [
|
| 51 |
-
|
| 52 |
[ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
|
| 53 |
[ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
|
| 54 |
[ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
|
|
@@ -73,7 +79,7 @@ HUBERT_SOFT_MODEL = ABS_PATH + "/Model/hubert-soft-0d54a1f4.pt"
|
|
| 73 |
DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
|
| 74 |
|
| 75 |
# w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
|
| 76 |
-
DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
|
| 77 |
|
| 78 |
"""
|
| 79 |
Default parameter
|
|
|
|
| 12 |
PORT = 7860
|
| 13 |
|
| 14 |
# Absolute path of vits-simple-api
|
| 15 |
+
ABS_PATH = os.path.dirname(os.path.realpath(__file__))
|
| 16 |
|
| 17 |
# Upload path
|
| 18 |
UPLOAD_FOLDER = ABS_PATH + "/upload"
|
|
|
|
| 20 |
# Cache path
|
| 21 |
CACHE_PATH = ABS_PATH + "/cache"
|
| 22 |
|
| 23 |
+
# Logs path
|
| 24 |
+
LOGS_PATH = ABS_PATH + "/logs"
|
| 25 |
+
|
| 26 |
+
# Set the number of backup log files to keep.
|
| 27 |
+
LOGS_BACKUPCOUNT = 30
|
| 28 |
+
|
| 29 |
# If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
|
| 30 |
CLEAN_INTERVAL_SECONDS = 3600
|
| 31 |
|
|
|
|
| 45 |
LOGGING_LEVEL = "DEBUG"
|
| 46 |
|
| 47 |
# Language identification library. Options: fastlid, langid
|
| 48 |
+
LANGUAGE_IDENTIFICATION_LIBRARY = "fastlid"
|
| 49 |
|
| 50 |
# To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
|
| 51 |
# If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
|
|
|
|
| 54 |
|
| 55 |
# Fill in the model path here
|
| 56 |
MODEL_LIST = [
|
| 57 |
+
# VITS
|
| 58 |
[ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
|
| 59 |
[ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
|
| 60 |
[ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
|
|
|
|
| 79 |
DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
|
| 80 |
|
| 81 |
# w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
|
| 82 |
+
# DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
|
| 83 |
|
| 84 |
"""
|
| 85 |
Default parameter
|
docker-compose-gpu.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.4'
|
| 2 |
+
services:
|
| 3 |
+
vits:
|
| 4 |
+
image: artrajz/vits-simple-api:latest-gpu
|
| 5 |
+
restart: always
|
| 6 |
+
ports:
|
| 7 |
+
- 23456:23456
|
| 8 |
+
environment:
|
| 9 |
+
LANG: 'C.UTF-8'
|
| 10 |
+
TZ: Asia/Shanghai #timezone
|
| 11 |
+
volumes:
|
| 12 |
+
- ./Model:/app/Model # 挂载模型文件夹
|
| 13 |
+
- ./config.py:/app/config.py # 挂载配置文件
|
| 14 |
+
- ./logs:/app/logs # logging logs
|
| 15 |
+
- ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
|
docker-compose.yaml
CHANGED
|
@@ -10,4 +10,6 @@ services:
|
|
| 10 |
TZ: Asia/Shanghai #timezone
|
| 11 |
volumes:
|
| 12 |
- ./Model:/app/Model # 挂载模型文件夹
|
| 13 |
-
- ./config.py:/app/config.py # 挂载配置文件
|
|
|
|
|
|
|
|
|
| 10 |
TZ: Asia/Shanghai #timezone
|
| 11 |
volumes:
|
| 12 |
- ./Model:/app/Model # 挂载模型文件夹
|
| 13 |
+
- ./config.py:/app/config.py # 挂载配置文件
|
| 14 |
+
- ./logs:/app/logs # logging logs
|
| 15 |
+
- ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
|
gunicorn_config.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import multiprocessing
|
| 2 |
+
|
| 3 |
+
bind = "0.0.0.0:23456"
|
| 4 |
+
workers = multiprocessing.cpu_count()
|
logger.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import logging
|
| 4 |
+
import logzero
|
| 5 |
+
import config
|
| 6 |
+
from logging.handlers import TimedRotatingFileHandler
|
| 7 |
+
|
| 8 |
+
logzero.loglevel(logging.WARNING)
|
| 9 |
+
logger = logging.getLogger("vits-simple-api")
|
| 10 |
+
level = getattr(config, "LOGGING_LEVEL", "DEBUG")
|
| 11 |
+
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
|
| 12 |
+
'CRITICAL': logging.CRITICAL}
|
| 13 |
+
logging.basicConfig(level=level_dict[level])
|
| 14 |
+
logging.getLogger('numba').setLevel(logging.WARNING)
|
| 15 |
+
logging.getLogger("langid.langid").setLevel(logging.INFO)
|
| 16 |
+
logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
|
| 17 |
+
|
| 18 |
+
os.makedirs(config.LOGS_PATH, exist_ok=True)
|
| 19 |
+
log_file = os.path.join(config.LOGS_PATH, 'latest.log')
|
| 20 |
+
backup_count = getattr(config, "LOGS_BACKUPCOUNT", 30)
|
| 21 |
+
handler = TimedRotatingFileHandler(log_file, when="midnight", interval=1, backupCount=backup_count, encoding='utf-8')
|
| 22 |
+
handler.suffix = "%Y-%m-%d.log"
|
| 23 |
+
formatter = logging.Formatter('%(levelname)s:%(name)s %(message)s')
|
| 24 |
+
handler.setFormatter(formatter)
|
| 25 |
+
logger.addHandler(handler)
|
| 26 |
+
|
| 27 |
+
logging.getLogger("werkzeug").addHandler(handler)
|
| 28 |
+
logging.getLogger("apscheduler.scheduler").addHandler(handler)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Custom function to handle uncaught exceptions
|
| 32 |
+
def handle_exception(exc_type, exc_value, exc_traceback):
|
| 33 |
+
# For a keyboard interrupt, defer to the default excepthook and return without logging it
|
| 34 |
+
if issubclass(exc_type, KeyboardInterrupt):
|
| 35 |
+
sys.__excepthook__(exc_type, exc_value, exc_traceback)
|
| 36 |
+
return
|
| 37 |
+
|
| 38 |
+
logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Set the global exception handler in Python
|
| 42 |
+
sys.excepthook = handle_exception
|
requirements.txt
CHANGED
|
@@ -27,4 +27,5 @@ fasttext
|
|
| 27 |
fastlid
|
| 28 |
langid
|
| 29 |
phonemizer==3.2.1
|
| 30 |
-
transformers
|
|
|
|
|
|
| 27 |
fastlid
|
| 28 |
langid
|
| 29 |
phonemizer==3.2.1
|
| 30 |
+
transformers
|
| 31 |
+
pydantic==1.10.6
|
static/css/style.css
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.main-container {
|
| 2 |
+
position: relative;
|
| 3 |
+
width: 100%;
|
| 4 |
+
min-height: 300px;
|
| 5 |
+
}
|
| 6 |
+
|
| 7 |
+
.container {
|
| 8 |
+
width: 300px;
|
| 9 |
+
position: relative;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
/*tabs*/
|
| 14 |
+
.tabs {
|
| 15 |
+
display: flex;
|
| 16 |
+
left: 0;
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
.tab-button {
|
| 20 |
+
display: inline-block;
|
| 21 |
+
background-color: transparent;
|
| 22 |
+
padding: 5px 10px;
|
| 23 |
+
cursor: pointer;
|
| 24 |
+
margin-bottom: -2px;
|
| 25 |
+
border-top: 2px solid transparent;
|
| 26 |
+
border-left: 2px solid transparent;
|
| 27 |
+
border-right: 2px solid transparent;
|
| 28 |
+
border-bottom: 0px;
|
| 29 |
+
border-top-left-radius: 0.5rem;
|
| 30 |
+
border-top-right-radius: 0.5rem;
|
| 31 |
+
color: gray;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
.tab-button.active {
|
| 35 |
+
background-color: white;
|
| 36 |
+
border-top: 2px solid #dee2e6;
|
| 37 |
+
border-left: 2px solid #dee2e6;
|
| 38 |
+
border-right: 2px solid #dee2e6;
|
| 39 |
+
color: black;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
/*content*/
|
| 43 |
+
|
| 44 |
+
.content {
|
| 45 |
+
border: gray;
|
| 46 |
+
border-left-width: 2px;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.content-pane {
|
| 50 |
+
display: none;
|
| 51 |
+
padding: 20px;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.content-pane.active {
|
| 55 |
+
display: flex;
|
| 56 |
+
-ms-flex-wrap: wrap;
|
| 57 |
+
flex-wrap: wrap;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
*, :before, :after {
|
| 61 |
+
box-sizing: border-box;
|
| 62 |
+
border-width: 0;
|
| 63 |
+
border-style: solid;
|
| 64 |
+
border-color: #e5e7eb;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
.flex {
|
| 69 |
+
display: flex;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.border-transparent {
|
| 73 |
+
border-color: transparent;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.border-b-2 {
|
| 77 |
+
border-bottom: 2px solid #dee2e6;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.border-lr-2 {
|
| 81 |
+
border-left: 2px solid #dee2e6;
|
| 82 |
+
border-right: 2px solid #dee2e6;
|
| 83 |
+
}
|
| 84 |
+
|
templates/index.html
CHANGED
|
@@ -4,126 +4,230 @@
|
|
| 4 |
<meta charset="UTF-8"/>
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
| 6 |
<title>vits-simple-api</title>
|
| 7 |
-
|
| 8 |
<link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
|
| 9 |
</head>
|
| 10 |
<body>
|
| 11 |
-
<main
|
| 12 |
-
<
|
| 13 |
-
<
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
<div>
|
| 18 |
-
<label>文档:</label>
|
| 19 |
-
<a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
|
| 20 |
-
style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
|
| 21 |
-
</div>
|
| 22 |
-
<div>
|
| 23 |
-
<label>返回speakers(json):</label>
|
| 24 |
-
<a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
|
| 25 |
-
style="text-decoration: none; color: black">
|
| 26 |
-
https://artrajz-vits-simple-api.hf.space/voice/speakers
|
| 27 |
-
</a>
|
| 28 |
-
</div>
|
| 29 |
-
<div>
|
| 30 |
-
<label>简单调用api:</label>
|
| 31 |
-
<a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
|
| 32 |
-
style="text-decoration: none; color: black">
|
| 33 |
-
https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
|
| 34 |
-
</a>
|
| 35 |
-
</div>
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
<option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
|
| 50 |
| {{ speaker["lang"] }}</option>
|
| 51 |
{% else %}
|
| 52 |
<option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
|
| 53 |
| {{ speaker["lang"] }}</option>
|
| 54 |
{% endif %}
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
<
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
</div>
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
</div>
|
| 107 |
-
</
|
| 108 |
</div>
|
| 109 |
-
</div>
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
</div>
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
<div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
|
| 124 |
-
<br/>
|
| 125 |
-
|
| 126 |
-
<h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
|
| 127 |
<p>
|
| 128 |
Nene_Nanami_Rong_Tang:
|
| 129 |
<a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
|
|
@@ -164,6 +268,8 @@
|
|
| 164 |
vits_chinese:
|
| 165 |
<a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
|
| 166 |
</p>
|
|
|
|
|
|
|
| 167 |
|
| 168 |
</main>
|
| 169 |
|
|
@@ -171,6 +277,10 @@
|
|
| 171 |
<script src="/static/js/bootstrap.bundle.min.js"></script>
|
| 172 |
|
| 173 |
<script>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
function getProtocol() {
|
| 175 |
return 'https:' == location.protocol ? "https://" : "http://";
|
| 176 |
}
|
|
@@ -181,12 +291,21 @@
|
|
| 181 |
}
|
| 182 |
|
| 183 |
var baseUrl = getProtocol() + getUrl();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
setBaseUrl();
|
| 186 |
|
| 187 |
function setBaseUrl() {
|
| 188 |
-
var text = document.getElementById("inputText").value;
|
| 189 |
-
var id = document.getElementById("inputId").value;
|
| 190 |
|
| 191 |
var vitsLink = document.getElementById("vitsLink");
|
| 192 |
var speakersLink = document.getElementById("speakersLink");
|
|
@@ -202,17 +321,22 @@
|
|
| 202 |
}
|
| 203 |
|
| 204 |
function getLink() {
|
| 205 |
-
var text = document.getElementById("inputText").value;
|
| 206 |
-
var id = document.getElementById("inputId").value;
|
| 207 |
-
var format = document.getElementById("inputFormat").value;
|
| 208 |
-
var lang = document.getElementById("inputLang").value;
|
| 209 |
-
var length = document.getElementById("inputLength").value;
|
| 210 |
-
var noise = document.getElementById("inputNoise").value;
|
| 211 |
-
var noisew = document.getElementById("inputNoisew").value;
|
| 212 |
-
var max = document.getElementById("inputMax").value;
|
| 213 |
-
var streaming = document.getElementById('streaming');
|
| 214 |
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
if (format != "") {
|
| 217 |
url += "&format=" + format;
|
| 218 |
}
|
|
@@ -231,6 +355,7 @@
|
|
| 231 |
if (max != "") {
|
| 232 |
url += "&max=" + max;
|
| 233 |
}
|
|
|
|
| 234 |
if (streaming.checked) {
|
| 235 |
url += '&streaming=true';
|
| 236 |
}
|
|
@@ -245,16 +370,37 @@
|
|
| 245 |
}
|
| 246 |
|
| 247 |
function setAudioSource() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
var url = getLink();
|
| 249 |
-
var audioPlayer = document.getElementById("audioPlayer");
|
| 250 |
audioPlayer.src = url;
|
| 251 |
audioPlayer.play();
|
| 252 |
}
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
</script>
|
| 259 |
</body>
|
| 260 |
</html>
|
|
|
|
| 4 |
<meta charset="UTF-8"/>
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
| 6 |
<title>vits-simple-api</title>
|
| 7 |
+
<link rel="stylesheet" href="/static/css/style.css">
|
| 8 |
<link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
|
| 9 |
</head>
|
| 10 |
<body>
|
| 11 |
+
<main class="main-container">
|
| 12 |
+
<div class="container flex flex-wrap mx-auto">
|
| 13 |
+
<div class="text-center d-flex align-items-center w-100" style="height: 100px;" id="component-1">
|
| 14 |
+
<h1 class="w-100">
|
| 15 |
+
<a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
|
| 16 |
+
style="text-decoration: none; color: black"> vits-simple-api </a>
|
| 17 |
+
</h1>
|
| 18 |
+
</div>
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
<div class="tabs w-100 border-b-2" id="component-2">
|
| 22 |
+
<button class="tab-button px-4 pb-2 pt-2 active " onclick="showContent(0)">VITS</button>
|
| 23 |
+
<button class="tab-button px-4 pb-2 pt-2" onclick="showContent(1)">W2V2-VITS</button>
|
| 24 |
+
</div>
|
| 25 |
+
|
| 26 |
+
<div class="content w-100 border-lr-2 border-b-2" id="component-3">
|
| 27 |
+
<div class="content-pane active w-100 flex-wrap">
|
| 28 |
+
<form class="w-100">
|
| 29 |
+
<div class="form-group">
|
| 30 |
+
<label>text</label>
|
| 31 |
+
<textarea class="form-control" id="inputText1" rows="3"
|
| 32 |
+
oninput="updateLink()">你好,こんにちは</textarea>
|
| 33 |
+
</div>
|
| 34 |
+
<div class="form-group">
|
| 35 |
+
<label>id</label>
|
| 36 |
+
<select class="form-control" id="inputId1" oninput="updateLink()">
|
| 37 |
+
{% for speaker in speakers["VITS"] %}
|
| 38 |
+
{% if speaker["name"] == "雷电将军(雷神)" %}
|
| 39 |
<option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
|
| 40 |
| {{ speaker["lang"] }}</option>
|
| 41 |
{% else %}
|
| 42 |
<option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
|
| 43 |
| {{ speaker["lang"] }}</option>
|
| 44 |
{% endif %}
|
| 45 |
+
{% endfor %}
|
| 46 |
+
</select>
|
| 47 |
+
</div>
|
| 48 |
+
</form>
|
| 49 |
+
<form class="w-100">
|
| 50 |
+
<div class="row">
|
| 51 |
+
<div class="col-md-4 form-group">
|
| 52 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 53 |
+
title="默认为wav">format</label>
|
| 54 |
+
<select class="form-control" id="inputFormat1" oninput="updateLink()">
|
| 55 |
+
<option></option>
|
| 56 |
+
<option>wav</option>
|
| 57 |
+
<option>mp3</option>
|
| 58 |
+
<option>ogg</option>
|
| 59 |
+
<option>silk</option>
|
| 60 |
+
</select>
|
| 61 |
+
</div>
|
| 62 |
+
<div class="col-md-4 form-group">
|
| 63 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 64 |
+
title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
|
| 65 |
+
<input type="text" class="form-control" id="inputLang1" oninput="updateLink()" value=""
|
| 66 |
+
placeholder="auto"/>
|
| 67 |
+
</div>
|
| 68 |
+
<div class="col-md-4 form-group">
|
| 69 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 70 |
+
title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
|
| 71 |
+
<input type="number" class="form-control" id="inputLength1" oninput="updateLink()" value=""
|
| 72 |
+
placeholder="1" min="0" step="0.001"/>
|
| 73 |
+
</div>
|
| 74 |
+
</div>
|
| 75 |
+
<div class="row">
|
| 76 |
+
<div class="col-md-4 form-group">
|
| 77 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 78 |
+
title="样本噪声,控制合成的随机性。">noise</label>
|
| 79 |
+
<input type="number" class="form-control" id="inputNoise1" oninput="updateLink()" value=""
|
| 80 |
+
placeholder="0.33" min="0" step="0.001"/>
|
| 81 |
+
</div>
|
| 82 |
+
<div class="col-md-4 form-group">
|
| 83 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 84 |
+
title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
|
| 85 |
+
<input type="number" class="form-control" id="inputNoisew1" oninput="updateLink()" value=""
|
| 86 |
+
placeholder="0.4" min="0" step="0.001"/>
|
| 87 |
+
</div>
|
| 88 |
+
<div class="col-md-4 form-group">
|
| 89 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 90 |
+
title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
|
| 91 |
+
<input type="number" class="form-control" id="inputMax1" oninput="updateLink()" value=""
|
| 92 |
+
placeholder="50" step="1"/>
|
| 93 |
+
</div>
|
| 94 |
+
</div>
|
| 95 |
+
</form>
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
<div class="flex flex-wrap w-100"
|
| 99 |
+
style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
|
| 100 |
+
<button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
|
| 101 |
+
style="margin-right: 10px">
|
| 102 |
+
播放器生成
|
| 103 |
+
</button>
|
| 104 |
+
<audio id="audioPlayer1" controls>
|
| 105 |
+
<source src="" type="audio/mp3"/>
|
| 106 |
+
Your browser does not support the audio element.
|
| 107 |
+
</audio>
|
| 108 |
+
<div class="form-group form-check">
|
| 109 |
+
<input type="checkbox" id="streaming1" onchange="updateLink()">
|
| 110 |
+
<label class="form-check-label" data-toggle="tooltip" data-placement="top"
|
| 111 |
+
title="按照max分段推理文本,推理好一段即输出,无需等待所有文本都推理完毕">流式响应</label>
|
| 112 |
+
</div>
|
| 113 |
</div>
|
| 114 |
+
</div>
|
| 115 |
+
<div class="content-pane">
|
| 116 |
+
<form class="w-100">
|
| 117 |
+
<div class="form-group">
|
| 118 |
+
<label>text</label>
|
| 119 |
+
<textarea class="form-control" id="inputText2" rows="3"
|
| 120 |
+
oninput="updateLink()">你好,こんにちは</textarea>
|
| 121 |
+
</div>
|
| 122 |
+
<div class="form-group">
|
| 123 |
+
<label>id</label>
|
| 124 |
+
<select class="form-control" id="inputId2" oninput="updateLink()">
|
| 125 |
+
{% for speaker in speakers["W2V2-VITS"] %}
|
| 126 |
+
<option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
|
| 127 |
+
| {{ speaker["lang"] }}</option>
|
| 128 |
+
{% endfor %}
|
| 129 |
+
</select>
|
| 130 |
+
</div>
|
| 131 |
+
<div class="form-group mb-3">
|
| 132 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 133 |
+
title="情感嵌入,{% if w2v2_emotion_count > 0 %}
|
| 134 |
+
可输入范围是0-{{ w2v2_emotion_count-1 }}
|
| 135 |
+
{% else %}
|
| 136 |
+
未加载emotion
|
| 137 |
+
{% endif %}">emotion</label>
|
| 138 |
+
<input type="number" class="form-control" min="0" max="{{ w2v2_emotion_count-1 }}" step="1"
|
| 139 |
+
id="emotion" value="0" oninput="updateLink()">
|
| 140 |
+
</div>
|
| 141 |
+
</form>
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
<form class="w-100">
|
| 145 |
+
<div class="row">
|
| 146 |
+
<div class="col-md-4 form-group">
|
| 147 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 148 |
+
title="默认为wav">format</label>
|
| 149 |
+
<select class="form-control" id="inputFormat2" oninput="updateLink()">
|
| 150 |
+
<option></option>
|
| 151 |
+
<option>wav</option>
|
| 152 |
+
<option>mp3</option>
|
| 153 |
+
<option>ogg</option>
|
| 154 |
+
<option>silk</option>
|
| 155 |
+
</select>
|
| 156 |
+
</div>
|
| 157 |
+
<div class="col-md-4 form-group">
|
| 158 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 159 |
+
title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
|
| 160 |
+
<input type="text" class="form-control" id="inputLang2" oninput="updateLink()" value=""
|
| 161 |
+
placeholder="auto"/>
|
| 162 |
+
</div>
|
| 163 |
+
<div class="col-md-4 form-group">
|
| 164 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 165 |
+
title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
|
| 166 |
+
<input type="number" class="form-control" id="inputLength2" oninput="updateLink()" value=""
|
| 167 |
+
placeholder="1" min="0" step="0.001"/>
|
| 168 |
+
</div>
|
| 169 |
+
</div>
|
| 170 |
+
<div class="row">
|
| 171 |
+
<div class="col-md-4 form-group">
|
| 172 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 173 |
+
title="样本噪声,控制合成的随机性。">noise</label>
|
| 174 |
+
<input type="number" class="form-control" id="inputNoise2" oninput="updateLink()" value=""
|
| 175 |
+
placeholder="0.33" min="0" step="0.001"/>
|
| 176 |
+
</div>
|
| 177 |
+
<div class="col-md-4 form-group">
|
| 178 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 179 |
+
title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
|
| 180 |
+
<input type="number" class="form-control" id="inputNoisew2" oninput="updateLink()" value=""
|
| 181 |
+
placeholder="0.4" min="0" step="0.001"/>
|
| 182 |
+
</div>
|
| 183 |
+
<div class="col-md-4 form-group">
|
| 184 |
+
<label data-toggle="tooltip" data-placement="top"
|
| 185 |
+
title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
|
| 186 |
+
<input type="number" class="form-control" id="inputMax2" oninput="updateLink()" value=""
|
| 187 |
+
placeholder="50" step="1"/>
|
| 188 |
+
</div>
|
| 189 |
+
</div>
|
| 190 |
+
</form>
|
| 191 |
+
|
| 192 |
+
<div class="flex flex-wrap w-100"
|
| 193 |
+
style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
|
| 194 |
+
<button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
|
| 195 |
+
style="margin-right: 10px">
|
| 196 |
+
播放器生成
|
| 197 |
+
</button>
|
| 198 |
+
<audio id="audioPlayer2" controls>
|
| 199 |
+
<source src="" type="audio/mp3"/>
|
| 200 |
+
Your browser does not support the audio element.
|
| 201 |
+
</audio>
|
| 202 |
+
<div class="form-group form-check">
|
| 203 |
+
<input type="checkbox" id="streaming2" onchange="updateLink()">
|
| 204 |
+
<label class="form-check-label">流式响应</label>
|
| 205 |
+
</div>
|
| 206 |
</div>
|
| 207 |
+
</div>
|
| 208 |
</div>
|
|
|
|
| 209 |
|
| 210 |
+
<div class="mt-2">
|
| 211 |
+
{% if speakers_count == 0 %}
|
| 212 |
+
<div style="color: red;">未加载任何模型</div>
|
| 213 |
+
{% endif %}
|
| 214 |
+
<div>
|
| 215 |
+
<label>返回speakers(json):</label>
|
| 216 |
+
<a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
|
| 217 |
+
style="text-decoration: none; color: black">
|
| 218 |
+
https://artrajz-vits-simple-api.hf.space/voice/speakers
|
| 219 |
+
</a>
|
| 220 |
+
</div>
|
| 221 |
+
<div>
|
| 222 |
+
<label>API调用:</label>
|
| 223 |
+
<a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
|
| 224 |
+
style="text-decoration: none; color: black">
|
| 225 |
+
https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
|
| 226 |
+
</a>
|
| 227 |
+
</div>
|
| 228 |
</div>
|
| 229 |
+
<h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
|
| 230 |
+
<h2>请严格遵循模型原作者使用协议!</h2>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
<p>
|
| 232 |
Nene_Nanami_Rong_Tang:
|
| 233 |
<a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
|
|
|
|
| 268 |
vits_chinese:
|
| 269 |
<a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
|
| 270 |
</p>
|
| 271 |
+
</div>
|
| 272 |
+
<br/>
|
| 273 |
|
| 274 |
</main>
|
| 275 |
|
|
|
|
| 277 |
<script src="/static/js/bootstrap.bundle.min.js"></script>
|
| 278 |
|
| 279 |
<script>
|
| 280 |
+
$(function () {
|
| 281 |
+
$('[data-toggle="tooltip"]').tooltip()
|
| 282 |
+
})
|
| 283 |
+
|
| 284 |
function getProtocol() {
|
| 285 |
return 'https:' == location.protocol ? "https://" : "http://";
|
| 286 |
}
|
|
|
|
| 291 |
}
|
| 292 |
|
| 293 |
var baseUrl = getProtocol() + getUrl();
|
| 294 |
+
var modelType = 1;
|
| 295 |
+
var vitsStatus = false;
|
| 296 |
+
var w2v2Status = false;
|
| 297 |
+
{% if vits_speakers_count > 0 %}
|
| 298 |
+
vitsStatus = true;
|
| 299 |
+
{% endif %}
|
| 300 |
+
{% if w2v2_speakers_count > 0 %}
|
| 301 |
+
w2v2Status = true;
|
| 302 |
+
{% endif %}
|
| 303 |
|
| 304 |
setBaseUrl();
|
| 305 |
|
| 306 |
function setBaseUrl() {
|
| 307 |
+
var text = document.getElementById("inputText" + modelType).value;
|
| 308 |
+
var id = document.getElementById("inputId" + modelType).value;
|
| 309 |
|
| 310 |
var vitsLink = document.getElementById("vitsLink");
|
| 311 |
var speakersLink = document.getElementById("speakersLink");
|
|
|
|
| 321 |
}
|
| 322 |
|
| 323 |
function getLink() {
|
| 324 |
+
var text = document.getElementById("inputText" + modelType).value;
|
| 325 |
+
var id = document.getElementById("inputId" + modelType).value;
|
| 326 |
+
var format = document.getElementById("inputFormat" + modelType).value;
|
| 327 |
+
var lang = document.getElementById("inputLang" + modelType).value;
|
| 328 |
+
var length = document.getElementById("inputLength" + modelType).value;
|
| 329 |
+
var noise = document.getElementById("inputNoise" + modelType).value;
|
| 330 |
+
var noisew = document.getElementById("inputNoisew" + modelType).value;
|
| 331 |
+
var max = document.getElementById("inputMax" + modelType).value;
|
| 332 |
+
var streaming = document.getElementById('streaming' + modelType);
|
| 333 |
|
| 334 |
+
if (modelType == 1) {
|
| 335 |
+
var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
|
| 336 |
+
} else if (modelType == 2) {
|
| 337 |
+
var emotion = document.getElementById('emotion').value;
|
| 338 |
+
var url = baseUrl + "/voice/w2v2-vits?text=" + text + "&id=" + id + "&emotion=" + emotion;
|
| 339 |
+
}
|
| 340 |
if (format != "") {
|
| 341 |
url += "&format=" + format;
|
| 342 |
}
|
|
|
|
| 355 |
if (max != "") {
|
| 356 |
url += "&max=" + max;
|
| 357 |
}
|
| 358 |
+
|
| 359 |
if (streaming.checked) {
|
| 360 |
url += '&streaming=true';
|
| 361 |
}
|
|
|
|
| 370 |
}
|
| 371 |
|
| 372 |
function setAudioSource() {
|
| 373 |
+
if (modelType==1 && !vitsStatus){
|
| 374 |
+
alert("未加载VITS模型");
|
| 375 |
+
return;
|
| 376 |
+
}
|
| 377 |
+
if (modelType==2 && !w2v2Status){
|
| 378 |
+
alert("未加载W2V2-VITS模型");
|
| 379 |
+
return;
|
| 380 |
+
}
|
| 381 |
var url = getLink();
|
| 382 |
+
var audioPlayer = document.getElementById("audioPlayer" + modelType);
|
| 383 |
audioPlayer.src = url;
|
| 384 |
audioPlayer.play();
|
| 385 |
}
|
| 386 |
|
| 387 |
+
function showContent(index) {
|
| 388 |
+
const panes = document.querySelectorAll(".content-pane");
|
| 389 |
+
const buttons = document.querySelectorAll(".tab-button");
|
| 390 |
+
modelType = index + 1;
|
| 391 |
+
|
| 392 |
+
for (let i = 0; i < panes.length; i++) {
|
| 393 |
+
if (i === index) {
|
| 394 |
+
panes[i].classList.add("active");
|
| 395 |
+
buttons[i].classList.add("active");
|
| 396 |
+
|
| 397 |
+
} else {
|
| 398 |
+
panes[i].classList.remove("active");
|
| 399 |
+
buttons[i].classList.remove("active");
|
| 400 |
+
}
|
| 401 |
+
}
|
| 402 |
+
updateLink();
|
| 403 |
+
}
|
| 404 |
</script>
|
| 405 |
</body>
|
| 406 |
</html>
|
text/cleaners.py
CHANGED
|
@@ -186,6 +186,21 @@ def cjke_cleaners2(text):
|
|
| 186 |
|
| 187 |
|
| 188 |
def cje_cleaners(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
from text.mandarin import chinese_to_ipa
|
| 190 |
from text.japanese import japanese_to_ipa2
|
| 191 |
from text.english import english_to_ipa2
|
|
|
|
| 186 |
|
| 187 |
|
| 188 |
def cje_cleaners(text):
|
| 189 |
+
from text.mandarin import chinese_to_lazy_ipa
|
| 190 |
+
from text.japanese import japanese_to_ipa
|
| 191 |
+
from text.english import english_to_ipa2
|
| 192 |
+
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
| 193 |
+
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
|
| 194 |
+
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
| 195 |
+
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
|
| 196 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
| 197 |
+
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
|
| 198 |
+
text = re.sub(r'\s+$', '', text)
|
| 199 |
+
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 200 |
+
return text
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def cje_cleaners2(text):
|
| 204 |
from text.mandarin import chinese_to_ipa
|
| 205 |
from text.japanese import japanese_to_ipa2
|
| 206 |
from text.english import english_to_ipa2
|
text/mandarin.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
import
|
| 2 |
-
import sys
|
| 3 |
import re
|
| 4 |
from pypinyin import lazy_pinyin, BOPOMOFO
|
| 5 |
import jieba
|
|
@@ -7,7 +6,7 @@ import cn2an
|
|
| 7 |
import logging
|
| 8 |
|
| 9 |
logging.getLogger('jieba').setLevel(logging.WARNING)
|
| 10 |
-
jieba.set_dictionary(
|
| 11 |
jieba.initialize()
|
| 12 |
|
| 13 |
# List of (Latin alphabet, bopomofo) pairs:
|
|
|
|
| 1 |
+
import config
|
|
|
|
| 2 |
import re
|
| 3 |
from pypinyin import lazy_pinyin, BOPOMOFO
|
| 4 |
import jieba
|
|
|
|
| 6 |
import logging
|
| 7 |
|
| 8 |
logging.getLogger('jieba').setLevel(logging.WARNING)
|
| 9 |
+
jieba.set_dictionary(config.ABS_PATH + '/jieba/dict.txt')
|
| 10 |
jieba.initialize()
|
| 11 |
|
| 12 |
# List of (Latin alphabet, bopomofo) pairs:
|
utils/merge.py
CHANGED
|
@@ -19,12 +19,13 @@ lang_dict = {
|
|
| 19 |
"cjke_cleaners": ["zh", "ja", "ko", "en"],
|
| 20 |
"cjke_cleaners2": ["zh", "ja", "ko", "en"],
|
| 21 |
"cje_cleaners": ["zh", "ja", "en"],
|
|
|
|
| 22 |
"thai_cleaners": ["th"],
|
| 23 |
"shanghainese_cleaners": ["sh"],
|
| 24 |
"chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
|
| 25 |
"ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
|
| 26 |
"YB"],
|
| 27 |
-
"bert_chinese_cleaners":["zh"],
|
| 28 |
}
|
| 29 |
|
| 30 |
|
|
@@ -109,11 +110,16 @@ def merge_model(merging_model):
|
|
| 109 |
for obj_id, i in enumerate(vits_list):
|
| 110 |
obj = vits(model=i[0], config=i[1], model_type="vits")
|
| 111 |
lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
# merge hubert-vits
|
| 119 |
if len(hubert_vits_list) != 0:
|
|
@@ -136,6 +142,7 @@ def merge_model(merging_model):
|
|
| 136 |
new_id += 1
|
| 137 |
|
| 138 |
# merge w2v2-vits
|
|
|
|
| 139 |
if len(w2v2_vits_list) != 0:
|
| 140 |
if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
|
| 141 |
raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
|
|
@@ -156,7 +163,8 @@ def merge_model(merging_model):
|
|
| 156 |
|
| 157 |
voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
|
| 158 |
voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
|
| 159 |
-
|
| 160 |
-
|
|
|
|
| 161 |
|
| 162 |
return tts
|
|
|
|
| 19 |
"cjke_cleaners": ["zh", "ja", "ko", "en"],
|
| 20 |
"cjke_cleaners2": ["zh", "ja", "ko", "en"],
|
| 21 |
"cje_cleaners": ["zh", "ja", "en"],
|
| 22 |
+
"cje_cleaners2": ["zh", "ja", "en"],
|
| 23 |
"thai_cleaners": ["th"],
|
| 24 |
"shanghainese_cleaners": ["sh"],
|
| 25 |
"chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
|
| 26 |
"ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
|
| 27 |
"YB"],
|
| 28 |
+
"bert_chinese_cleaners": ["zh"],
|
| 29 |
}
|
| 30 |
|
| 31 |
|
|
|
|
| 110 |
for obj_id, i in enumerate(vits_list):
|
| 111 |
obj = vits(model=i[0], config=i[1], model_type="vits")
|
| 112 |
lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
|
| 113 |
+
if isinstance(obj.get_speakers(), list):
|
| 114 |
+
for id, name in enumerate(obj.get_speakers()):
|
| 115 |
+
vits_obj.append([int(id), obj, obj_id])
|
| 116 |
+
vits_speakers.append({"id": new_id, "name": name, "lang": lang})
|
| 117 |
+
new_id += 1
|
| 118 |
+
else:
|
| 119 |
+
for id, (name, _) in enumerate(obj.get_speakers().items()):
|
| 120 |
+
vits_obj.append([int(id), obj, obj_id])
|
| 121 |
+
vits_speakers.append({"id": new_id, "name": name, "lang": lang})
|
| 122 |
+
new_id += 1
|
| 123 |
|
| 124 |
# merge hubert-vits
|
| 125 |
if len(hubert_vits_list) != 0:
|
|
|
|
| 142 |
new_id += 1
|
| 143 |
|
| 144 |
# merge w2v2-vits
|
| 145 |
+
emotion_reference = None
|
| 146 |
if len(w2v2_vits_list) != 0:
|
| 147 |
if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
|
| 148 |
raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
|
|
|
|
| 163 |
|
| 164 |
voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
|
| 165 |
voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
|
| 166 |
+
w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
|
| 167 |
+
|
| 168 |
+
tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count)
|
| 169 |
|
| 170 |
return tts
|
utils/nlp.py
CHANGED
|
@@ -1,13 +1,7 @@
|
|
| 1 |
import regex as re
|
| 2 |
-
import logging
|
| 3 |
import config
|
| 4 |
from .utils import check_is_none
|
| 5 |
-
|
| 6 |
-
logger = logging.getLogger("vits-simple-api")
|
| 7 |
-
level = getattr(config, "LOGGING_LEVEL", "DEBUG")
|
| 8 |
-
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
|
| 9 |
-
'CRITICAL': logging.CRITICAL}
|
| 10 |
-
logger.setLevel(level_dict[level])
|
| 11 |
|
| 12 |
|
| 13 |
def clasify_lang(text, speaker_lang):
|
|
|
|
| 1 |
import regex as re
|
|
|
|
| 2 |
import config
|
| 3 |
from .utils import check_is_none
|
| 4 |
+
from logger import logger
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def clasify_lang(text, speaker_lang):
|
vits-simple-api-installer-latest.sh
CHANGED
|
@@ -12,7 +12,32 @@ if [ ! -f config.py ]; then
|
|
| 12 |
wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
|
| 13 |
fi
|
| 14 |
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
|
| 18 |
|
|
|
|
| 12 |
wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
|
| 13 |
fi
|
| 14 |
|
| 15 |
+
if [ ! -f gunicorn_config.py ]; then
|
| 16 |
+
echo -e "${YELLOW}download config.py\n${PLAIN}"
|
| 17 |
+
wget -O $INSTALL_DIR/gunicorn_config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/gunicorn_config.py
|
| 18 |
+
fi
|
| 19 |
+
|
| 20 |
+
while true; do
|
| 21 |
+
echo -e "${GREEN}Which version of docker-compose.yaml do you want to download?"
|
| 22 |
+
echo -e "1. docker-compose.yaml (CPU version)"
|
| 23 |
+
echo -e "2. docker-compose-gpu.yaml (GPU version)"
|
| 24 |
+
read -p "Enter your choice (1 or 2): " choice
|
| 25 |
+
case $choice in
|
| 26 |
+
1)
|
| 27 |
+
echo -e "${YELLOW}Downloading docker-compose.yaml (CPU version)\n${PLAIN}"
|
| 28 |
+
wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml
|
| 29 |
+
break
|
| 30 |
+
;;
|
| 31 |
+
2)
|
| 32 |
+
echo -e "${YELLOW}Downloading docker-compose-gpu.yaml (GPU version)\n${PLAIN}"
|
| 33 |
+
wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose-gpu.yaml
|
| 34 |
+
break
|
| 35 |
+
;;
|
| 36 |
+
*)
|
| 37 |
+
echo -e "${RED}Invalid choice. Please enter 1 or 2.${PLAIN}"
|
| 38 |
+
;;
|
| 39 |
+
esac
|
| 40 |
+
done
|
| 41 |
|
| 42 |
echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
|
| 43 |
|
voice.py
CHANGED
|
@@ -6,7 +6,6 @@ import numpy as np
|
|
| 6 |
import torch
|
| 7 |
import xml.etree.ElementTree as ET
|
| 8 |
import config
|
| 9 |
-
import logging
|
| 10 |
import soundfile as sf
|
| 11 |
from torch import no_grad, LongTensor, inference_mode, FloatTensor
|
| 12 |
from io import BytesIO
|
|
@@ -16,6 +15,7 @@ from mel_processing import spectrogram_torch
|
|
| 16 |
from text import text_to_sequence
|
| 17 |
from models import SynthesizerTrn
|
| 18 |
from utils import utils
|
|
|
|
| 19 |
|
| 20 |
# torch.set_num_threads(1) # 设置torch线程为1
|
| 21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
@@ -251,7 +251,7 @@ class vits:
|
|
| 251 |
|
| 252 |
|
| 253 |
class TTS:
|
| 254 |
-
def __init__(self, voice_obj, voice_speakers):
|
| 255 |
self._voice_obj = voice_obj
|
| 256 |
self._voice_speakers = voice_speakers
|
| 257 |
self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
|
|
@@ -259,10 +259,11 @@ class TTS:
|
|
| 259 |
self._vits_speakers_count = len(self._voice_speakers["VITS"])
|
| 260 |
self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
|
| 261 |
self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
|
|
|
|
| 262 |
self.dem = None
|
| 263 |
|
| 264 |
# Initialization information
|
| 265 |
-
self.logger =
|
| 266 |
self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
|
| 267 |
self.logger.info(f'device:{device} device.type:{device.type}')
|
| 268 |
|
|
@@ -420,9 +421,7 @@ class TTS:
|
|
| 420 |
|
| 421 |
return voice_tasks, format
|
| 422 |
|
| 423 |
-
def create_ssml_infer_task(self,
|
| 424 |
-
voice_tasks, format = self.parse_ssml(ssml)
|
| 425 |
-
|
| 426 |
audios = []
|
| 427 |
for voice in voice_tasks:
|
| 428 |
if voice.get("break"):
|
|
@@ -438,10 +437,10 @@ class TTS:
|
|
| 438 |
|
| 439 |
audio = np.concatenate(audios, axis=0)
|
| 440 |
encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
| 441 |
-
if config
|
| 442 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 443 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 444 |
-
return encoded_audio
|
| 445 |
|
| 446 |
def vits_infer(self, voice, fname):
|
| 447 |
format = voice.get("format", "wav")
|
|
@@ -450,7 +449,7 @@ class TTS:
|
|
| 450 |
sampling_rate = voice_obj.hps_ms.data.sampling_rate
|
| 451 |
audio = voice_obj.get_audio(voice, auto_break=True)
|
| 452 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 453 |
-
if config
|
| 454 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 455 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 456 |
return encoded_audio
|
|
@@ -466,9 +465,9 @@ class TTS:
|
|
| 466 |
encoded_audio = self.encode(sampling_rate, chunk, format)
|
| 467 |
for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
|
| 468 |
yield encoded_audio_chunk
|
| 469 |
-
if config
|
| 470 |
-
audio.write(encoded_audio.getvalue())
|
| 471 |
-
if config
|
| 472 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 473 |
utils.save_audio(audio.getvalue(), path)
|
| 474 |
|
|
@@ -479,7 +478,7 @@ class TTS:
|
|
| 479 |
sampling_rate = voice_obj.hps_ms.data.sampling_rate
|
| 480 |
audio = voice_obj.get_audio(voice)
|
| 481 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 482 |
-
if config
|
| 483 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 484 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 485 |
return encoded_audio
|
|
@@ -491,7 +490,7 @@ class TTS:
|
|
| 491 |
sampling_rate = voice_obj.hps_ms.data.sampling_rate
|
| 492 |
audio = voice_obj.get_audio(voice, auto_break=True)
|
| 493 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 494 |
-
if config
|
| 495 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 496 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 497 |
return encoded_audio
|
|
@@ -515,7 +514,7 @@ class TTS:
|
|
| 515 |
|
| 516 |
audio = voice_obj.voice_conversion(voice)
|
| 517 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 518 |
-
if config
|
| 519 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 520 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 521 |
return encoded_audio
|
|
|
|
| 6 |
import torch
|
| 7 |
import xml.etree.ElementTree as ET
|
| 8 |
import config
|
|
|
|
| 9 |
import soundfile as sf
|
| 10 |
from torch import no_grad, LongTensor, inference_mode, FloatTensor
|
| 11 |
from io import BytesIO
|
|
|
|
| 15 |
from text import text_to_sequence
|
| 16 |
from models import SynthesizerTrn
|
| 17 |
from utils import utils
|
| 18 |
+
from logger import logger
|
| 19 |
|
| 20 |
# torch.set_num_threads(1) # 设置torch线程为1
|
| 21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
| 251 |
|
| 252 |
|
| 253 |
class TTS:
|
| 254 |
+
def __init__(self, voice_obj, voice_speakers, w2v2_emotion_count=0):
|
| 255 |
self._voice_obj = voice_obj
|
| 256 |
self._voice_speakers = voice_speakers
|
| 257 |
self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
|
|
|
|
| 259 |
self._vits_speakers_count = len(self._voice_speakers["VITS"])
|
| 260 |
self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
|
| 261 |
self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
|
| 262 |
+
self._w2v2_emotion_count = w2v2_emotion_count
|
| 263 |
self.dem = None
|
| 264 |
|
| 265 |
# Initialization information
|
| 266 |
+
self.logger = logger
|
| 267 |
self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
|
| 268 |
self.logger.info(f'device:{device} device.type:{device.type}')
|
| 269 |
|
|
|
|
| 421 |
|
| 422 |
return voice_tasks, format
|
| 423 |
|
| 424 |
+
def create_ssml_infer_task(self, voice_tasks, format, fname):
|
|
|
|
|
|
|
| 425 |
audios = []
|
| 426 |
for voice in voice_tasks:
|
| 427 |
if voice.get("break"):
|
|
|
|
| 437 |
|
| 438 |
audio = np.concatenate(audios, axis=0)
|
| 439 |
encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
| 440 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 441 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 442 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 443 |
+
return encoded_audio
|
| 444 |
|
| 445 |
def vits_infer(self, voice, fname):
|
| 446 |
format = voice.get("format", "wav")
|
|
|
|
| 449 |
sampling_rate = voice_obj.hps_ms.data.sampling_rate
|
| 450 |
audio = voice_obj.get_audio(voice, auto_break=True)
|
| 451 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 452 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 453 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 454 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 455 |
return encoded_audio
|
|
|
|
| 465 |
encoded_audio = self.encode(sampling_rate, chunk, format)
|
| 466 |
for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
|
| 467 |
yield encoded_audio_chunk
|
| 468 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 469 |
+
audio.write(encoded_audio.getvalue())
|
| 470 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 471 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 472 |
utils.save_audio(audio.getvalue(), path)
|
| 473 |
|
|
|
|
| 478 |
sampling_rate = voice_obj.hps_ms.data.sampling_rate
|
| 479 |
audio = voice_obj.get_audio(voice)
|
| 480 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 481 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 482 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 483 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 484 |
return encoded_audio
|
|
|
|
| 490 |
sampling_rate = voice_obj.hps_ms.data.sampling_rate
|
| 491 |
audio = voice_obj.get_audio(voice, auto_break=True)
|
| 492 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 493 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 494 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 495 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 496 |
return encoded_audio
|
|
|
|
| 514 |
|
| 515 |
audio = voice_obj.voice_conversion(voice)
|
| 516 |
encoded_audio = self.encode(sampling_rate, audio, format)
|
| 517 |
+
if getattr(config, "SAVE_AUDIO", False):
|
| 518 |
path = f"{config.CACHE_PATH}/{fname}"
|
| 519 |
utils.save_audio(encoded_audio.getvalue(), path)
|
| 520 |
return encoded_audio
|